Mirror of https://github.com/Xeeynamo/sotn-decomp.git (synced 2024-11-26 22:40:33 +00:00)
Add duplicates report to dups tool (#467)
This adds an equivalent of find_duplicates.py to tools/dups. It runs in about 2 minutes on my machine, so it should be significantly faster on the CI. The algorithm isn't exactly the same, so the report is a little different. Here's an example: https://gist.github.com/sozud/503fd3b3014668e6644fb2dfae51d5e5

This works by grouping all the functions into clusters, basically:

```
if levenshtein_similarity > threshold
    cluster.append(current_function)
```

Memoization gives a little speedup by avoiding recomputing the Levenshtein distance for the same pairs over and over again. This is still a brute-force algorithm. I did some research; there are a lot of similar problems, but I didn't find anything that seemed like a good fit. I think this is probably fast enough to last for a while.
Parent: e92e9f4f4d
Commit: 4588d94071
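To make the clustering idea in the commit message concrete, here is a minimal, self-contained sketch of a greedy clustering pass with a memoized, normalized Levenshtein similarity. The names `similarity` and `cluster_by_similarity` are illustrative only; the actual implementation is in `tools/dups/src/levenshtein_hashmap.rs` below, and differs in details.

```
use std::collections::HashMap;

// Normalized Levenshtein similarity in [0.0, 1.0], memoized per key pair.
fn similarity(a: &[u8], b: &[u8], cache: &mut HashMap<(Vec<u8>, Vec<u8>), f64>) -> f64 {
    if let Some(&s) = cache.get(&(a.to_vec(), b.to_vec())) {
        return s;
    }
    let (n, m) = (a.len(), b.len());
    let mut dp = vec![vec![0usize; m + 1]; n + 1];
    for i in 0..=n {
        dp[i][0] = i;
    }
    for j in 0..=m {
        dp[0][j] = j;
    }
    for i in 1..=n {
        for j in 1..=m {
            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
            dp[i][j] = (dp[i - 1][j] + 1)
                .min(dp[i][j - 1] + 1)
                .min(dp[i - 1][j - 1] + cost);
        }
    }
    let max_len = n.max(m).max(1) as f64;
    let s = (max_len - dp[n][m] as f64) / max_len;
    cache.insert((a.to_vec(), b.to_vec()), s);
    s
}

// Greedy clustering: a key joins the first cluster whose representative
// is at least `threshold` similar; otherwise it starts a new cluster.
fn cluster_by_similarity(keys: &[Vec<u8>], threshold: f64) -> Vec<Vec<Vec<u8>>> {
    let mut cache = HashMap::new();
    let mut clusters: Vec<Vec<Vec<u8>>> = Vec::new();
    for key in keys {
        let found = clusters
            .iter()
            .position(|c| similarity(&c[0], key, &mut cache) >= threshold);
        match found {
            Some(i) => clusters[i].push(key.clone()),
            None => clusters.push(vec![key.clone()]),
        }
    }
    clusters
}

fn main() {
    let keys: Vec<Vec<u8>> = vec![vec![1, 2, 3, 4], vec![1, 2, 3, 5], vec![9, 9, 9, 9]];
    let clusters = cluster_by_similarity(&keys, 0.7);
    println!("{} clusters", clusters.len()); // expect 2: the first two keys group together
}
```

In the tool itself, a function's key is built from the top 6 bits of each instruction (`op >> 26`, the MIPS primary opcode field), so near-duplicates can still cluster together even when registers or immediates differ.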
@@ -7,3 +7,4 @@ edition = "2021"

[dependencies]
clap = { version = "4.3.19", features = ["derive"] }
tools/dups/src/levenshtein_hashmap.rs (new file, 164 lines)
@@ -0,0 +1,164 @@
use crate::types::{DupsFile, Function, Instruction};
use std::collections::HashMap;

pub struct LevenshteinHashMap {
    pub map: HashMap<Vec<u8>, Vec<Function>>,
    threshold: f64,
    cache: HashMap<(Vec<u8>, Vec<u8>), f64>,
}

fn levenshtein_similarity(
    s1: &[u8],
    s2: &[u8],
    cache: &mut HashMap<(Vec<u8>, Vec<u8>), f64>,
) -> f64 {
    if let Some(result) = cache.get(&(s1.to_vec(), s2.to_vec())) {
        // Return cached result if it exists
        return *result;
    }

    let len1 = s1.len();
    let len2 = s2.len();
    let mut dp = vec![vec![0; len2 + 1]; len1 + 1];

    for i in 0..=len1 {
        dp[i][0] = i;
    }

    for j in 0..=len2 {
        dp[0][j] = j;
    }

    for (i, x) in s1.iter().enumerate() {
        for (j, y) in s2.iter().enumerate() {
            dp[i + 1][j + 1] = if x == y {
                dp[i][j]
            } else {
                dp[i][j].min(dp[i][j + 1]).min(dp[i + 1][j]) + 1
            };
        }
    }

    let max_len = len1.max(len2) as f64;
    let result = (max_len - dp[len1][len2] as f64) / max_len;
    cache
        .entry((s1.to_vec(), s2.to_vec()))
        .and_modify(|v| *v = result)
        .or_insert(result);
    result
}

impl LevenshteinHashMap {
    pub fn new(threshold: f64) -> Self {
        Self {
            map: HashMap::new(),
            threshold,
            cache: HashMap::new(),
        }
    }
    pub fn len(&self) -> usize {
        self.map.len()
    }
    pub fn iter(&self) -> impl Iterator<Item = (&Vec<u8>, &Vec<Function>)> {
        self.map.iter()
    }

    pub fn get(&mut self, key: &[u8]) -> Option<&mut Vec<Function>> {
        let mut closest_key = None;
        let mut closest_distance = std::f64::MAX;

        let map = self.map.clone();

        for (k, _) in map.iter() {
            let distance = levenshtein_similarity(key, k, &mut self.cache);

            if distance < closest_distance && distance >= self.threshold {
                closest_key = Some(k);
                closest_distance = distance;
            }
        }

        if let Some(k) = closest_key {
            self.map.get_mut(k)
        } else {
            None
        }
    }

    pub fn insert(&mut self, key: Vec<u8>, mut value: Function) {
        let mut closest_key = None;
        let mut closest_distance = std::f64::MAX;

        for k in self.map.keys() {
            let distance = levenshtein_similarity(&key, k, &mut self.cache);

            if distance < closest_distance && distance >= self.threshold {
                closest_key = Some(k.clone());
                closest_distance = distance;
            }
        }

        if let Some(k) = closest_key {
            let mut val = self.map.get_mut(&k);
            value.similarity = closest_distance;
            val.unwrap().push(value);
        } else {
            let mut my_vec: Vec<Function> = Vec::new();
            value.similarity = 1.0;
            my_vec.push(value);
            self.map.insert(key, my_vec);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_insert_and_get_same_cluster() {
        let mut map = LevenshteinHashMap::new(0.95);
        let func1 = Function {
            name: String::from("func1"),
            ops: vec![],
            key: vec![1, 2, 3],
        };
        let func2 = Function {
            name: String::from("func2"),
            ops: vec![],
            key: vec![1, 2, 3],
        };

        map.insert(func1.key.clone(), func1.clone());
        map.insert(func2.key.clone(), func2.clone());

        // both functions should be in the same cluster
        let result1 = map.get(&vec![1, 2, 3]);
        assert_eq!(result1.expect("has items").len(), 2);
    }

    #[test]
    fn test_insert_and_get_different_cluster() {
        let mut map = LevenshteinHashMap::new(0.95);
        let func1 = Function {
            name: String::from("func1"),
            ops: vec![],
            key: vec![1, 2, 3],
        };
        let func2 = Function {
            name: String::from("func2"),
            ops: vec![],
            key: vec![4, 5, 6],
        };

        map.insert(func1.key.clone(), func1.clone());
        map.insert(func2.key.clone(), func2.clone());

        // functions should be in different clusters
        let result1 = map.get(&vec![1, 2, 3]);
        assert_eq!(result1.expect("has items").len(), 1);

        let result2 = map.get(&vec![4, 5, 6]);
        assert_eq!(result2.expect("has items").len(), 1);
    }
}
@@ -1,23 +1,18 @@
use std::fs;
use std::io::Read;
use std::env::*;
use std::fs;
use std::fs::File;
use std::io::BufRead;
use std::io::BufReader;
use std::io::Read;
use std::io::Write;
use std::process::exit;

#[derive(Clone)]
pub struct Function {
    name: String,
    ops: Vec<Instruction>,
    key: Vec<u8>,
}

#[derive(Clone)]
pub struct Instruction {
    file_addr: u64,
    vram_addr: u64,
    op: u32,
}

mod levenshtein_hashmap;
mod types;
use levenshtein_hashmap::LevenshteinHashMap;
use types::{DupsFile, Function, Instruction};
// parse .s file to get instructions and function name
fn parse_instructions(input: &str) -> Function {
fn parse_instructions(input: &str, dir: &str, file: &str) -> Function {
    let mut instructions = Vec::new();
    let mut func_name = "";
@@ -38,12 +33,17 @@ fn parse_instructions(input: &str) -> Function {
            if let Ok(file_addr) = u64::from_str_radix(parts[1], 16) {
                if let Ok(vram_addr) = u64::from_str_radix(parts[2], 16) {
                    if let Ok(op) = u32::from_str_radix(parts[3], 16) {
                        // splat's output for the instruction is apparently little-endian
                        let reversed_num = ((op >> 24) & 0xFF)
                            | (((op >> 16) & 0xFF) << 8)
                            | (((op >> 8) & 0xFF) << 16)
                            | ((op & 0xFF) << 24);

                        // if the file address, vram address, and instruction parsed, add it
                        let instruction = Instruction {
                            file_addr,
                            vram_addr,
                            op,
                            op: reversed_num,
                        };

                        instructions.push(instruction);
@@ -58,51 +58,76 @@ fn parse_instructions(input: &str) -> Function {
        .iter()
        .map(|num| (num.op >> 26) as u8)
        .collect();

    Function {
        ops: instructions,
        name: func_name.to_string(),
        key: key,
        dir: dir.to_string(),
        file: file.to_string(),
        similarity: 0.0,
        decompiled: false,
    }
}

fn process_directory(dir_path: &str, funcs: &mut Vec<Function>) {
    let entries = std::fs::read_dir(dir_path).expect("Unable to read directory");
    match std::fs::read_dir(dir_path) {
        Ok(entries) => {
            entries.for_each(|entry| {
                if let Ok(entry) = entry {
                    let item_path = entry.path();
                    if item_path.is_file() && item_path.to_string_lossy().ends_with(".s") {
                        println!("checking {:?}", item_path);

    entries.for_each(|entry| {
        if let Ok(entry) = entry {
            let item_path = entry.path();
            if item_path.is_file() && item_path.to_string_lossy().ends_with(".s") {
                println!("checking {:?}", item_path);
                        let mut file = fs::File::open(item_path.clone()).unwrap();
                        let mut buffer = String::new();
                        file.read_to_string(&mut buffer).unwrap();

                let mut file = fs::File::open(item_path).unwrap();
                let mut buffer = String::new();
                file.read_to_string(&mut buffer).unwrap();
                        let func =
                            parse_instructions(&buffer, &dir_path, &item_path.to_string_lossy());

                let func = parse_instructions(&buffer);
                funcs.push(func.clone());
            } else if item_path.is_dir() {
                process_directory(&item_path.to_string_lossy(), funcs);
            }
                        // jr $ra, nop
                        let is_null = func.ops.len() == 2
                            && func.ops[0].op == 0x03E00008
                            && func.ops[1].op == 0x00000000;
                        if !is_null {
                            funcs.push(func.clone());
                        }
                    } else if item_path.is_dir() {
                        process_directory(&item_path.to_string_lossy(), funcs);
                    }
                }
            });
        }
    });
}

struct File {
    name: String,
    funcs: Vec<Function>,
        Err(error) => {
            eprintln!("Unable to read directory: {}", error);
            println!("Directory path: {}", dir_path);
            exit(1);
        }
    }
}

use clap::Parser;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = "\n
#[command(
    author,
    version,
    about,
    long_about = "\n
Finds duplicates in two asm directories and prints them out in order to identify patterns

Usage:

make force_extract

Do a 2-way compare with ordering
cargo run --release -- --dir ../../asm/us/st/nz0/nonmatchings/ --dir ../../asm/us/st/np3/nonmatchings/ --threshold .94
")]

Clustering report for all overlays
cargo run --release -- --threshold .94 --output-file output.txt
"
)]

struct Args {
    /// Levenshtein similarity threshold
@@ -112,14 +137,288 @@ struct Args {
    /// Directory to parse asm from (2 required)
    #[arg(short, long)]
    dir: Vec<String>,

    /// File to write output to
    #[arg(short, long)]
    output_file: Option<String>,

    /// Base of source directory
    #[arg(short, long)]
    src_base: Option<String>,
}

fn main() {
    let args = Args::parse();
#[derive(Clone, Debug, PartialEq)]
pub struct IncludeAsmEntry {
    pub line: String,
    pub path: String,
}

    let threshold = args.threshold;
    let dirs = args.dir;
fn process_directory_for_include_asm(dir: &str) -> Vec<IncludeAsmEntry> {
    let entries = std::fs::read_dir(dir).expect("Unable to read directory");

    let mut output = Vec::new();

    entries.for_each(|entry| {
        if let Ok(entry) = entry {
            let item_path = entry.path();
            if item_path.is_file() && item_path.to_string_lossy().ends_with(".c") {
                println!("checking {:?}", item_path);

                let file = File::open(item_path.clone()).expect("Unable to open file");
                let reader = BufReader::new(file);
                for line in reader.lines() {
                    let line_str = line.unwrap();

                    if line_str.contains("INCLUDE_ASM") {
                        output.push(IncludeAsmEntry {
                            line: line_str.clone(),
                            path: item_path.to_string_lossy().to_string(),
                        });
                    }
                }
            } else if item_path.is_dir() {
                process_directory_for_include_asm(&item_path.to_string_lossy());
            }
        }
    });
    output
}

fn get_all_include_asm(dir: &str) -> Vec<IncludeAsmEntry> {
    process_directory_for_include_asm(dir)
}
#[derive(Clone)]
struct SrcAsmPair {
    asm_dir: String,
    src_dir: String,
    overlay_name: String,
    include_asm: Vec<IncludeAsmEntry>,
    path_matcher: String,
}

fn do_dups_report(output_file: Option<String>, threshold: f64) {
    // full dups report
    let mut hash_map = LevenshteinHashMap::new(threshold);

    let mut files = Vec::new();

    let pairs: Vec<SrcAsmPair> = vec![
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/dra/nonmatchings/"),
            src_dir: String::from("../../src/dra/"),
            overlay_name: String::from("DRA"),
            include_asm: get_all_include_asm("../../src/dra/"),
            path_matcher: "/dra/".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/main/nonmatchings/"),
            src_dir: String::from("../../src/main/"),
            overlay_name: String::from("MAIN"),
            include_asm: get_all_include_asm("../../src/main/"),
            path_matcher: "/main/".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/ric/nonmatchings/"),
            src_dir: String::from("../../src/ric/"),
            overlay_name: String::from("RIC"),
            include_asm: get_all_include_asm("../../src/ric/"),
            path_matcher: "/ric/".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/servant/tt_000/nonmatchings/"),
            src_dir: String::from("../../src/servant/tt_000"),
            overlay_name: String::from("RIC"),
            include_asm: get_all_include_asm("../../src/servant/tt_000"),
            path_matcher: "/tt_000/".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/cen/nonmatchings/"),
            src_dir: String::from("../../src/st/cen/"),
            overlay_name: String::from("CEN"),
            include_asm: get_all_include_asm("../../src/st/cen/"),
            path_matcher: "st/cen".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/dre/nonmatchings/"),
            src_dir: String::from("../../src/st/dre/"),
            overlay_name: String::from("DRE"),
            include_asm: get_all_include_asm("../../src/st/dre/"),
            path_matcher: "st/dre".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/mad/nonmatchings/"),
            src_dir: String::from("../../src/st/mad/"),
            overlay_name: String::from("MAD"),
            include_asm: get_all_include_asm("../../src/st/mad/"),
            path_matcher: "st/mad".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/no3/nonmatchings/"),
            src_dir: String::from("../../src/st/no3/"),
            overlay_name: String::from("NO3"),
            include_asm: get_all_include_asm("../../src/st/no3/"),
            path_matcher: "st/no3".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/np3/nonmatchings/"),
            src_dir: String::from("../../src/st/np3/"),
            overlay_name: String::from("NP3"),
            include_asm: get_all_include_asm("../../src/st/np3/"),
            path_matcher: "st/np3".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/nz0/nonmatchings/"),
            src_dir: String::from("../../src/st/nz0/"),
            overlay_name: String::from("NZ0"),
            include_asm: get_all_include_asm("../../src/st/nz0/"),
            path_matcher: "st/nz0".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/rwrp/nonmatchings/"),
            src_dir: String::from("../../src/st/rwrp/"),
            overlay_name: String::from("RWRP"),
            include_asm: get_all_include_asm("../../src/st/rwrp/"),
            path_matcher: "st/rwrp".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/sel/nonmatchings/"),
            src_dir: String::from("../../src/st/sel/"),
            overlay_name: String::from("SEL"),
            include_asm: get_all_include_asm("../../src/st/sel/"),
            path_matcher: "st/sel".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/st0/nonmatchings/"),
            src_dir: String::from("../../src/st/st0/"),
            overlay_name: String::from("ST0"),
            include_asm: get_all_include_asm("../../src/st/st0/"),
            path_matcher: "st/st0".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/st/wrp/nonmatchings/"),
            src_dir: String::from("../../src/st/wrp/"),
            overlay_name: String::from("WRP"),
            include_asm: get_all_include_asm("../../src/st/wrp/"),
            path_matcher: "st/wrp".to_string(),
        },
        SrcAsmPair {
            asm_dir: String::from("../../asm/us/weapon/nonmatchings/"),
            src_dir: String::from("../../src/weapon/"),
            overlay_name: String::from("WEAPON"),
            include_asm: get_all_include_asm("../../src/weapon/"),
            path_matcher: "/weapon/".to_string(),
        },
    ];

    for pair in pairs.clone() {
        let dir = pair.asm_dir;
        let mut funcs = Vec::new();
        process_directory(&dir, &mut funcs);

        // sort functions by vram address
        funcs.sort_by_key(|function| {
            function
                .ops
                .first()
                .map_or(u64::MAX, |instr| instr.vram_addr)
        });

        files.push(DupsFile {
            name: dir.to_string(),
            funcs: funcs.clone(),
        });
    }

    for file in &files {
        println!("file {}", file.name);
        for func in &file.funcs {
            println!("\t{} {}", func.name, func.ops.len());
        }
    }

    for file in &files {
        for func in &file.funcs {
            hash_map.insert(func.key.clone(), func.clone());
        }
    }

    let mut entries: Vec<(&Vec<u8>, &Vec<Function>)> = hash_map.map.iter().collect();

    // sort by filename
    entries.sort_by(|(_, functions1), (_, functions2)| functions1[0].file.cmp(&functions2[0].file));

    // Then sort by the length of functions in reverse order
    entries.sort_by_key(|(_, functions)| std::cmp::Reverse(functions.len()));

    if let o_file = output_file.unwrap() {
        let mut output_file = File::create(o_file).expect("Unable to create file");
        writeln!(
            output_file,
            "| {:<4} | {:<8} | {:<35} | {:<2} ",
            "%", "Decomp?", "Name", "Asm Path"
        )
        .expect("Error writing to file");

        for (_, functions) in entries {
            if functions.len() > 1 {
                // Write separator to file
                writeln!(output_file, "-------------------------------------------------------------------------------")
                    .expect("Error writing to file");

                let mut temp_functions = functions.clone();

                // sort by the filename then the similarity
                temp_functions.sort_by(|a, b| {
                    let file_cmp = a.file.cmp(&b.file);
                    if file_cmp != std::cmp::Ordering::Equal {
                        return file_cmp;
                    }

                    a.similarity
                        .partial_cmp(&b.similarity)
                        .unwrap_or(std::cmp::Ordering::Equal)
                });

                for function in &mut temp_functions {
                    // Write function details to file
                    let mut decompiled = true;

                    for pair in &pairs.clone() {
                        if function.file.contains(&pair.path_matcher) {
                            for inc in &pair.include_asm {
                                if inc.line.contains(&function.name) {
                                    decompiled = false;
                                }
                            }
                        }
                    }

                    writeln!(
                        output_file,
                        "| {:<4.2} | {:<8} | {:<35} | {:<2} ",
                        function.similarity, decompiled, function.name, function.file
                    )
                    .expect("Error writing to file");
                }
            }
        }
    } else {
        for (_, functions) in entries {
            if functions.len() > 1 {
                println!("------------------------");

                for function in functions {
                    println!(
                        "{:.2} {:?} {:?} {:?}",
                        function.similarity, function.decompiled, function.name, function.file
                    );
                }
            }
        }
    }
}

fn do_ordered_compare(dirs: Vec<String>, threshold: f64) {
    let mut files = Vec::new();

    for dir in dirs {
@@ -134,7 +433,7 @@ fn main() {
                .map_or(u64::MAX, |instr| instr.vram_addr)
        });

        files.push(File {
        files.push(DupsFile {
            name: dir.to_string(),
            funcs: funcs.clone(),
        });
@@ -147,6 +446,7 @@ fn main() {
        }
    }

    // 2 way comparison for determining patterns in overlays
    let mut pairs: Vec<Vec<Function>> = Vec::new();

    // print out all found duplicates with their similarity values
@@ -160,7 +460,13 @@ fn main() {
            let result = levenshtein_similarity(&func_0.key, &func_1.key);

            if result >= threshold {
                println!("{:<width$} | {:<width$} | {:<width$}", func_0.name, func_1.name, result, width = 40);
                println!(
                    "{:<width$} | {:<width$} | {:<width$}",
                    func_0.name,
                    func_1.name,
                    result,
                    width = 40
                );
                let mut temp = Vec::new();
                temp.push(func_0.clone());
                temp.push(func_1.clone());
@@ -197,6 +503,22 @@ fn main() {
    }
}

fn main() {
    let args = Args::parse();

    let threshold = args.threshold;
    let dirs = args.dir;
    let output_file = args.output_file;
    let num_dirs = dirs.len();
    let src_base_dir = args.src_base;

    if num_dirs == 2 {
        do_ordered_compare(dirs, threshold);
    } else {
        do_dups_report(output_file, threshold);
    }
}

fn levenshtein_similarity(s1: &[u8], s2: &[u8]) -> f64 {
    let len1 = s1.len();
    let len2 = s2.len();
tools/dups/src/types.rs (new file, 22 lines)
@@ -0,0 +1,22 @@
#[derive(Clone, Debug, PartialEq)]
pub struct Function {
    pub name: String,
    pub ops: Vec<Instruction>,
    pub key: Vec<u8>,
    pub dir: String,
    pub file: String,
    pub similarity: f64,
    pub decompiled: bool,
}

#[derive(Clone, Debug, PartialEq)]
pub struct Instruction {
    pub file_addr: u64,
    pub vram_addr: u64,
    pub op: u32,
}

pub struct DupsFile {
    pub name: String,
    pub funcs: Vec<Function>,
}
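As a quick illustration of how the pieces in this diff fit together, here is a hypothetical usage sketch assuming it sits inside the tool's crate next to main.rs. The `Function`, `Instruction`, and `LevenshteinHashMap` types and methods come from the diff above; the instruction values, names, and paths are made up.

```
use levenshtein_hashmap::LevenshteinHashMap;
use types::{Function, Instruction};

fn demo() {
    // Hypothetical instruction; the key stores the top 6 bits of each opcode (op >> 26).
    let instr = Instruction {
        file_addr: 0x1000,
        vram_addr: 0x8001_0000,
        op: 0x0C00_1234,
    };
    let make_func = |name: &str| Function {
        name: name.to_string(),
        ops: vec![instr.clone()],
        key: vec![(instr.op >> 26) as u8],
        dir: "asm/us/st/nz0/nonmatchings/".to_string(),
        file: "asm/us/st/nz0/nonmatchings/example.s".to_string(),
        similarity: 0.0,
        decompiled: false,
    };

    let mut map = LevenshteinHashMap::new(0.94);
    map.insert(make_func("func_A").key.clone(), make_func("func_A"));
    map.insert(make_func("func_B").key.clone(), make_func("func_B"));

    // Identical keys clear the 0.94 threshold, so both land in the same cluster.
    assert_eq!(map.get(&[(instr.op >> 26) as u8]).expect("cluster").len(), 2);
}
```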