Merge pull request #206 from water111/w/ir2-stacking

[Decompiler] Add stacking framework
This commit is contained in:
water111 2021-01-22 21:08:10 -05:00 committed by GitHub
commit 8135c18e91
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 394 additions and 10 deletions

View File

@ -34,7 +34,9 @@ add_library(
IR2/AtomicOpTypeAnalysis.cpp
IR2/cfg_builder.cpp
IR2/Env.cpp
IR2/expression_build.cpp
IR2/Form.cpp
IR2/FormStack.cpp
IR2/reg_usage.cpp
IR2/variable_naming.cpp

View File

@ -169,6 +169,8 @@ class Function {
Env env;
FormPool form_pool;
Form* top_form = nullptr;
std::string debug_form_string;
bool print_debug_forms = false;
} ir2;
private:

View File

@ -20,6 +20,18 @@ FormPool::~FormPool() {
}
}
///////////////////
// FormElement
///////////////////
std::string FormElement::to_string(const Env& env) const {
return to_form(env).print();
}
/*!
 * Default implementation of push_to_stack: always throws a std::runtime_error whose message
 * includes the printed form of this element. The stack argument is not used.
 */
void FormElement::push_to_stack(const Env& env, FormStack&) {
  std::string message = "push_to_stack not implemented for ";
  message += to_string(env);
  throw std::runtime_error(message);
}
///////////////////
// Form
//////////////////
@ -38,6 +50,10 @@ goos::Object Form::to_form(const Env& env) const {
}
}
std::string Form::to_string(const Env& env) const {
return to_form(env).print();
}
void Form::inline_forms(std::vector<goos::Object>& forms, const Env& env) const {
for (auto& x : m_elements) {
forms.push_back(x->to_form(env));

View File

@ -11,7 +11,7 @@
namespace decompiler {
class Form;
class Env;
class IR2_Stack;
class FormStack;
/*!
* A "FormElement" represents a single LISP form that's not a begin.
@ -27,14 +27,10 @@ class FormElement {
virtual void apply_form(const std::function<void(Form*)>& f) = 0;
virtual bool is_sequence_point() const { return true; }
virtual void collect_vars(VariableSet& vars) const = 0;
std::string to_string(const Env& env) const;
// // push the result of this operation to the operation stack
// // this is used for the forms that aren't last in a multi-form.
// virtual void push_to_stack(const Env& env, IR2_Stack& stack) = 0;
//
// // this is used for the final of a multi-form only.
// // using the current expressions on the stack, simplify myself.
// virtual FormElement* simplify(const Env& env, FormPool& pool, IR2_Stack& stack) = 0;
// push the result of this operation to the operation stack
virtual void push_to_stack(const Env& env, FormStack& stack);
protected:
friend class Form;
@ -138,6 +134,10 @@ class SetVarElement : public FormElement {
bool m_is_sequence_point = true;
};
/*!
* A wrapper around a single AtomicOp.
* The "important" special AtomicOps have their own Form type, like FunctionCallElement.
*/
class AtomicOpElement : public FormElement {
public:
explicit AtomicOpElement(const AtomicOp* op);
@ -150,6 +150,14 @@ class AtomicOpElement : public FormElement {
const AtomicOp* m_op;
};
/*!
* A "condition" like (< a b). This can be used as a boolean value directly: (set! a (< b c))
* or it can be used as a branch condition: (if (< a b)).
*
* In the first case, it can be either a conditional move or actually branching. GOAL seems to use
* the branching when sometimes it could have used the conditional move, and for now, we don't
* care about the difference.
*/
class ConditionElement : public FormElement {
public:
ConditionElement(IR2_Condition::Kind kind, Form* src0, Form* src1);
@ -164,6 +172,9 @@ class ConditionElement : public FormElement {
Form* m_src[2] = {nullptr, nullptr};
};
/*!
* Wrapper around an AtomicOp call.
*/
class FunctionCallElement : public FormElement {
public:
explicit FunctionCallElement(const CallOp* op);
@ -176,6 +187,10 @@ class FunctionCallElement : public FormElement {
const CallOp* m_op;
};
/*!
* Wrapper around an AtomicOp branch. These are inserted when directly converting blocks to Form,
* but should be eliminated after the cfg_builder pass completes.
*/
class BranchElement : public FormElement {
public:
explicit BranchElement(const BranchOp* op);
@ -189,6 +204,10 @@ class BranchElement : public FormElement {
const BranchOp* m_op;
};
/*!
* Represents a (return-from #f x) form, which immediately returns from the function.
* This always has some "dead code" after it that can't be reached, which is the "dead_code".
*/
class ReturnElement : public FormElement {
public:
Form* return_code = nullptr;
@ -201,6 +220,27 @@ class ReturnElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* Represents a (return-from Lxxx x) form, which returns from a block which ends before the end
* of the function. These are used pretty rarely. As a result, I'm not planning to allow these to
* nest within other expressions. This means that the following code:
*
* (set! x (block my-block
* (if (condition?)
* (return-from my-block 12))
* 2))
*
* Would become
*
* (block my-block
* (when (condition?)
* (set! x 12)
* (return-from my-block none))
* (set! x 2)
* )
*
* which seems fine to me.
*/
class BreakElement : public FormElement {
public:
Form* return_code = nullptr;
@ -213,6 +253,21 @@ class BreakElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* Condition (cond, if, when, unless) which has an "else" case.
* The condition of the first entry may contain too much and will need to be adjusted later.
* Example:
*
* (set! x 10)
* (if (something?) ... )
*
* might become
* (if (begin (set! x 10) (something?)) ... )
*
* We want to wait until after expressions are built to move the extra stuff up to avoid splitting
* up a complicated expression used as the condition. But this should happen before variable
* scoping.
*/
class CondWithElseElement : public FormElement {
public:
struct Entry {
@ -230,6 +285,14 @@ class CondWithElseElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* An empty element. This is used to fill the body of control forms with nothing in them.
* For example, I believe that (cond ((x y) (else none))) will generate an else case with an
* "empty" and looks different from (cond ((x y))).
*
* We _could_ simplify out the use of empty, but I think it's more "authentic" to leave them in, and
* might give us more clues about how the code was originally written
*/
class EmptyElement : public FormElement {
public:
EmptyElement() = default;
@ -239,6 +302,11 @@ class EmptyElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* Represents a GOAL while loop and more complicated loops which have the "while" format of checking
* the condition before the first iteration. This will not include infinite while loops.
* Unlike CondWithElseElement, this will correctly identify the start and end of the condition.
*/
class WhileElement : public FormElement {
public:
WhileElement(Form* _condition, Form* _body) : condition(_condition), body(_body) {}
@ -251,6 +319,11 @@ class WhileElement : public FormElement {
bool cleaned = false;
};
/*!
* Represents a GOAL until loop and more complicated loops which use the "until" format of checking
* the condition after the first iteration. Has the same limitation as CondWithElseElement for the
* condition.
*/
class UntilElement : public FormElement {
public:
UntilElement(Form* _condition, Form* _body) : condition(_condition), body(_body) {}
@ -262,6 +335,11 @@ class UntilElement : public FormElement {
Form* body = nullptr;
};
/*!
* Represents a GOAL short-circuit expression, either AND or OR.
* The first "element" in ShortCircuitElement may be too large, see the comment on
* CondWithElseElement
*/
class ShortCircuitElement : public FormElement {
public:
struct Entry {
@ -286,6 +364,11 @@ class ShortCircuitElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* Represents a GOAL cond/if/when/unless statement which does not have an explicit else case. The
* compiler will then move #f into the result register in the delay slot. The first condition may be
* too large at first, see CondWithElseElement
*/
class CondNoElseElement : public FormElement {
public:
struct Entry {
@ -305,6 +388,9 @@ class CondNoElseElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* Represents a (abs x) expression.
*/
class AbsElement : public FormElement {
public:
explicit AbsElement(Form* _source);
@ -315,6 +401,11 @@ class AbsElement : public FormElement {
Form* source = nullptr;
};
/*!
* Represents an (ash x y) expression. There is also an "unsigned" version of this using logical
* shifts. This only recognizes the fancy version where the shift amount isn't known at compile time
* and the compiler emits code that branches depending on the sign of the shift amount.
*/
class AshElement : public FormElement {
public:
Form* shift_amount = nullptr;
@ -328,6 +419,10 @@ class AshElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* Represents a form which gets the runtime type of a boxed object. This is for the most general
* "object" case where we check for pair, binteger, or basic and there's actually branching.
*/
class TypeOfElement : public FormElement {
public:
Form* value;
@ -339,6 +434,24 @@ class TypeOfElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
/*!
* Represents an unpaired cmove #f. GOAL may emit code like
* (set! x #t)
* (... evaluate something)
* (cmov x y #f)
* where the stuff in between is potentially very large.
* GOAL has no "conditional move" keyword available to the programmer - this would only happen when
* doing something like (set! x (zero? y)), in the code for creating a GOAL boolean.
*
* Code like (if x (set! y z)) will branch, the compiler isn't smart enough to use movn/movz here.
*
* These cannot be compacted into a single form until expression building, so we leave these
* placeholders in.
*
* Note - some conditionals put the (set! x #t) immediately before the cmove, but not all. Those
* that do will be correctly recognized and will be a ConditionElement. zero! seems to be the most
* common one that's split, and it happens reasonably often, so I will try to actually correct it.
*/
class ConditionalMoveFalseElement : public FormElement {
public:
Variable dest;
@ -351,6 +464,37 @@ class ConditionalMoveFalseElement : public FormElement {
void collect_vars(VariableSet& vars) const override;
};
///*!
// * A GenericOperator is the head of a GenericElement.
// * It is used for the final output.
// */
// class GenericOperator {
// public:
// enum class Kind {
// FIXED_FUNCTION_CALL,
// VAR_FUNCTION_CALL,
// FIXED_OPERATOR
// };
//
// private:
// // if we're a VAR_FUNCTION_CALL, this should contain the expression to get the function
// Form* m_function_val;
//
// //std::string
//
//};
//
// class GenericElement : public FormElement {
// public:
// goos::Object to_form(const Env& env) const override;
// void apply(const std::function<void(FormElement*)>& f) override;
// void apply_form(const std::function<void(Form*)>& f) override;
// void collect_vars(VariableSet& vars) const override;
// private:
// GenericOperator m_head;
// std::vector<Form*> m_elts;
//};
/*!
* A Form is a wrapper around one or more FormElements.
* This is done for two reasons:
@ -401,9 +545,15 @@ class Form {
const std::vector<FormElement*>& elts() const { return m_elements; }
std::vector<FormElement*>& elts() { return m_elements; }
void push_back(FormElement* elt) { m_elements.push_back(elt); }
// Append an element to this Form. The element's parent_form is pointed at this Form first,
// then the element is stored at the end of m_elements.
void push_back(FormElement* elt) {
  elt->parent_form = this;
  m_elements.emplace_back(elt);
}
void clear() { m_elements.clear(); }
goos::Object to_form(const Env& env) const;
std::string to_string(const Env& env) const;
void inline_forms(std::vector<goos::Object>& forms, const Env& env) const;
void apply(const std::function<void(FormElement*)>& f);
void apply_form(const std::function<void(Form*)>& f);

View File

@ -0,0 +1,91 @@
#include "FormStack.h"
#include "Form.h"
namespace decompiler {
/*!
 * Debug-print a single stack entry.
 * An entry either has a destination and a source Form (printed as "reg <- source"), or it has a
 * plain element and no destination. The asserts check that exactly one of the two is populated.
 */
std::string FormStack::StackEntry::print(const Env& env) const {
  if (!destination) {
    // no destination: this entry just wraps an element.
    assert(elt && !source);
    const std::string elt_text = elt->to_string(env);
    return fmt::format("d: {} s: {} | {}", active, sequence_point, elt_text);
  }

  // destination present: this entry sets a register to the value of a Form.
  assert(source && !elt);
  const std::string source_text = source->to_string(env);
  return fmt::format("d: {} s: {} | {} <- {}", active, sequence_point,
                     destination->reg().to_charp(), source_text);
}
/*!
 * Debug-print the whole stack: one line per entry, in stack order, each terminated by a newline.
 */
std::string FormStack::print(const Env& env) {
  std::string text;
  for (const auto& entry : m_stack) {
    text.append(entry.print(env));
    text.push_back('\n');
  }
  return text;
}
/*!
 * Push an entry recording that "var" is set to the value of "value".
 * The new entry starts out active, so it will be displayed unless something later pops it.
 */
void FormStack::push_value_to_reg(Variable var, Form* value, bool sequence_point) {
  StackEntry set_entry;
  set_entry.destination = var;
  set_entry.source = value;
  set_entry.sequence_point = sequence_point;
  set_entry.active = true;  // shown by default
  m_stack.push_back(set_entry);
}
/*!
 * Returns true if exactly one entry on the stack is active.
 */
bool FormStack::is_single_expression() {
  bool seen_active = false;
  for (const auto& entry : m_stack) {
    if (!entry.active) {
      continue;
    }
    if (seen_active) {
      // a second active entry: more than one, so not a single expression.
      return false;
    }
    seen_active = true;
  }
  return seen_active;
}
/*!
 * Push an entry that wraps a plain element (no destination register).
 * The new entry starts out active.
 */
void FormStack::push_form_element(FormElement* elt, bool sequence_point) {
  StackEntry elt_entry;
  elt_entry.elt = elt;
  elt_entry.sequence_point = sequence_point;
  elt_entry.active = true;
  m_stack.push_back(elt_entry);
}
/*!
 * Look for the value most recently pushed for "var", walking from the newest entry to the oldest.
 * Inactive entries are skipped. If an active entry's destination matches, it is deactivated and
 * its source Form is returned. If an active, non-matching entry is a sequence point, the search
 * stops there. Returns nullptr when nothing was found.
 */
Form* FormStack::pop_reg(const Variable& var) {
  for (auto it = m_stack.rbegin(); it != m_stack.rend(); ++it) {
    auto& candidate = *it;
    if (!candidate.active) {
      continue;
    }

    if (candidate.destination == var) {
      // found it: take it off the (visible) stack and hand back its value.
      candidate.active = false;
      assert(candidate.source);
      return candidate.source;
    }

    if (candidate.sequence_point) {
      // not a match, and we can't look past a sequence point.
      return nullptr;
    }
  }

  // ran out of entries without finding it.
  return nullptr;
}
/*!
 * Convert the active entries of the stack back into a list of elements, in stack order.
 * An entry with a destination becomes a new SetVarElement allocated from "pool" (and the source
 * Form's parent_element is pointed at it). An entry without a destination contributes its stored
 * element unchanged. Inactive entries are dropped.
 */
std::vector<FormElement*> FormStack::rewrite(FormPool& pool) {
  std::vector<FormElement*> output;
  for (auto& entry : m_stack) {
    if (entry.active) {
      FormElement* emitted = entry.elt;
      if (entry.destination) {
        auto set_elt =
            pool.alloc_element<SetVarElement>(*entry.destination, entry.source, entry.sequence_point);
        entry.source->parent_element = set_elt;
        emitted = set_elt;
      }
      output.push_back(emitted);
    }
  }
  return output;
}
} // namespace decompiler

View File

@ -0,0 +1,36 @@
#pragma once
#include <optional>
#include "decompiler/Disasm/Register.h"
#include "decompiler/IR2/AtomicOp.h"
namespace decompiler {
class Form;
/*!
* A FormStack is used to track partial expressions when rebuilding the tree structure of
* GOAL code. Linear sequences of operations are added onto the expression stack.
*
* Entries are kept in push order. Each entry is either a "set register to value" entry
* (destination + source) or a plain element entry (elt). Entries are never removed; popping an
* entry only marks it inactive, and inactive entries are left out by rewrite().
*/
class FormStack {
public:
FormStack() = default;
// Push an active entry recording that var is set to the given value Form.
void push_value_to_reg(Variable var, Form* value, bool sequence_point);
// Push an active entry wrapping a plain element, with no destination.
void push_form_element(FormElement* elt, bool sequence_point);
// Search from the newest entry backward for an active entry whose destination is var.
// On a match, the entry is marked inactive and its source Form is returned.
// Returns nullptr if no match is found, or if an active, non-matching sequence point entry is
// reached first.
Form* pop_reg(const Variable& var);
// True if exactly one entry is active.
bool is_single_expression();
// Build the list of elements for the active entries, in push order. Entries with a destination
// become SetVarElements allocated from pool; other entries contribute their stored element.
std::vector<FormElement*> rewrite(FormPool& pool);
// Debug print of every entry (active or not), one per line.
std::string print(const Env& env);
private:
struct StackEntry {
bool active = true; // should this appear in the output?
std::optional<Variable> destination; // what register we are setting (or nullopt if no dest.)
Form* source = nullptr; // the value we are setting the register to.
// the element for entries with no destination (print() asserts that exactly one of
// source/elt is set).
FormElement* elt = nullptr;
// if true, pop_reg will not search past this entry when it doesn't match.
bool sequence_point = false;
// NOTE(review): not read or written by any FormStack code visible here - confirm intended use.
TP_Type type;
std::string print(const Env& env) const;
};
std::vector<StackEntry> m_stack;
};
} // namespace decompiler

View File

@ -0,0 +1,27 @@
#include "expression_build.h"
#include "decompiler/Function/Function.h"
#include "decompiler/IR2/Form.h"
#include "decompiler/IR2/FormStack.h"
namespace decompiler {
/*!
 * Run expression building over every Form reachable from top_level_form.
 * For each Form, all of its elements are pushed onto a fresh FormStack, and the Form's elements
 * are then replaced with the result of rewriting that stack.
 *
 * Returns true on success. If anything throws a std::exception along the way (for example, an
 * element type that doesn't implement push_to_stack), returns false. In that case, Forms that
 * were already processed stay rewritten.
 */
bool convert_to_expressions(Form* top_level_form, FormPool& pool, const Function& f) {
  assert(top_level_form);

  // rebuild a single Form in place.
  const auto rebuild_form = [&](Form* form) {
    FormStack stack;
    for (auto* element : form->elts()) {
      element->push_to_stack(f.ir2.env, stack);
    }

    const auto rebuilt = stack.rewrite(pool);
    form->clear();
    for (auto* element : rebuilt) {
      form->push_back(element);
    }
  };

  try {
    top_level_form->apply_form(rebuild_form);
  } catch (const std::exception&) {
    return false;
  }

  return true;
}
} // namespace decompiler

View File

@ -0,0 +1,8 @@
#pragma once
namespace decompiler {
class Form;
class Function;
class FormPool;
bool convert_to_expressions(Form* top_level_form, FormPool& pool, const Function& f);
} // namespace decompiler

View File

@ -74,6 +74,8 @@ class ObjectFileDB {
void ir2_register_usage_pass();
void ir2_variable_pass();
void ir2_cfg_build_pass();
void ir2_store_current_forms();
void ir2_build_expressions();
void ir2_write_results(const std::string& output_dir);
std::string ir2_to_file(ObjectFileData& data);
std::string ir2_function_to_string(ObjectFileData& data, Function& function, int seg);

View File

@ -11,6 +11,7 @@
#include "decompiler/IR2/reg_usage.h"
#include "decompiler/IR2/variable_naming.h"
#include "decompiler/IR2/cfg_builder.h"
#include "decompiler/IR2/expression_build.h"
#include "common/goos/PrettyPrinter.h"
namespace decompiler {
@ -34,8 +35,12 @@ void ObjectFileDB::analyze_functions_ir2(const std::string& output_dir) {
ir2_register_usage_pass();
lg::info("Variable analysis...");
ir2_variable_pass();
lg::info("Initial conversion to Form...");
lg::info("Initial structuring..");
ir2_cfg_build_pass();
lg::info("Storing temporary form result...");
ir2_store_current_forms();
lg::info("Expression building...");
ir2_build_expressions();
lg::info("Writing results...");
ir2_write_results(output_dir);
}
@ -349,6 +354,45 @@ void ObjectFileDB::ir2_cfg_build_pass() {
lg::info("{}/{}/{} cfg build in {:.2f} ms\n", successful, attempted, total, timer.getMs());
}
/*!
 * For every function that has a top-level form, save the pretty-printed version of that form
 * into the function's debug_form_string. This captures the form as it is at the time of the call.
 */
void ObjectFileDB::ir2_store_current_forms() {
  Timer timer;
  int stored_count = 0;

  for_each_function_def_order([&](Function& func, int /*segment_id*/, ObjectFileData& /*data*/) {
    auto& ir2 = func.ir2;
    if (!ir2.top_form) {
      return;  // nothing to store for this function
    }
    ++stored_count;
    ir2.debug_form_string = pretty_print::to_string(ir2.top_form->to_form(ir2.env));
  });

  lg::info("Stored debug forms for {} functions in {:.2f} ms\n", stored_count, timer.getMs());
}
/*!
 * Run expression building on every function that has a top-level form.
 * Functions where the conversion succeeds get print_debug_forms set, so the stored debug form is
 * written out alongside the result. Logs successful/attempted/total counts and the time taken.
 */
void ObjectFileDB::ir2_build_expressions() {
  Timer timer;
  int function_count = 0;
  int tried_count = 0;
  int success_count = 0;

  for_each_function_def_order([&](Function& func, int /*segment_id*/, ObjectFileData& /*data*/) {
    ++function_count;
    if (!func.ir2.top_form) {
      return;  // no form was built for this function, skip it
    }

    ++tried_count;
    const bool converted = convert_to_expressions(func.ir2.top_form, func.ir2.form_pool, func);
    if (!converted) {
      return;
    }

    ++success_count;
    func.ir2.print_debug_forms = true;
  });

  lg::info("{}/{}/{} expression build in {:.2f} ms\n", success_count, tried_count, function_count,
           timer.getMs());
}
void ObjectFileDB::ir2_write_results(const std::string& output_dir) {
Timer timer;
lg::info("Writing IR2 results to file...");
@ -388,6 +432,12 @@ std::string ObjectFileDB::ir2_to_file(ObjectFileData& data) {
result += pretty_print::to_string(func.ir2.top_form->to_form(func.ir2.env));
result += '\n';
}
if (func.ir2.print_debug_forms) {
result += '\n';
result += func.ir2.debug_form_string;
result += '\n';
}
}
// print data