Add duplicates report to dups tool (#467)

This adds an equivalent of find_duplicates.py to tools/dups. This runs in about 2 minutes on my machine so it should be significantly faster on the CI. The algorithm isn't exactly the same so the report is a little different. Here's an example: https://gist.github.com/sozud/503fd3b3014668e6644fb2dfae51d5e5 This works by grouping all the functions in to clusters, basically: ``` if levenshtein_similarity > threshold cluster.append(current_function) ``` Memoization gives a little speedup to avoid computing the levenshtein distance for the same pairs over and over again. This is still a brute-force algorithm. I did some research and there's a lot of similar problems but didn't find something that seemed like it would be a good fit. I think this is probably fast enough to last for a while.
2024-11-26 22:40:33 +00:00 · 2023-08-14 09:22:35 -07:00 · 2023-08-14 09:22:35 -07:00 · 4588d94071
commit 4588d94071
parent e92e9f4f4d
4 changed files with 555 additions and 46 deletions
--- a/tools/dups/Cargo.toml
+++ b/tools/dups/Cargo.toml
@ -7,3 +7,4 @@ edition = "2021"

 [dependencies]
 clap = { version = "4.3.19", features = ["derive"] }
+
--- a/tools/dups/src/levenshtein_hashmap.rs
+++ b/tools/dups/src/levenshtein_hashmap.rs
@ -0,0 +1,164 @@
+use crate::types::{DupsFile, Function, Instruction};
+use std::collections::HashMap;
+
+pub struct LevenshteinHashMap {
+    pub map: HashMap<Vec<u8>, Vec<Function>>,
+    threshold: f64,
+    cache: HashMap<(Vec<u8>, Vec<u8>), f64>,
+}
+
+fn levenshtein_similarity(
+    s1: &[u8],
+    s2: &[u8],
+    cache: &mut HashMap<(Vec<u8>, Vec<u8>), f64>,
+) -> f64 {
+    if let Some(result) = cache.get(&(s1.to_vec(), s2.to_vec())) {
+        // Return cached result if it exists
+        return *result;
+    }
+
+    let len1 = s1.len();
+    let len2 = s2.len();
+    let mut dp = vec![vec![0; len2 + 1]; len1 + 1];
+
+    for i in 0..=len1 {
+        dp[i][0] = i;
+    }
+
+    for j in 0..=len2 {
+        dp[0][j] = j;
+    }
+
+    for (i, x) in s1.iter().enumerate() {
+        for (j, y) in s2.iter().enumerate() {
+            dp[i + 1][j + 1] = if x == y {
+                dp[i][j]
+            } else {
+                dp[i][j].min(dp[i][j + 1]).min(dp[i + 1][j]) + 1
+            };
+        }
+    }
+
+    let max_len = len1.max(len2) as f64;
+    let result = (max_len - dp[len1][len2] as f64) / max_len;
+    cache
+        .entry((s1.to_vec(), s2.to_vec()))
+        .and_modify(|v| *v = result)
+        .or_insert(result);
+    result
+}
+
+impl LevenshteinHashMap {
+    pub fn new(threshold: f64) -> Self {
+        Self {
+            map: HashMap::new(),
+            threshold,
+            cache: HashMap::new(),
+        }
+    }
+    pub fn len(&self) -> usize {
+        self.map.len()
+    }
+    pub fn iter(&self) -> impl Iterator<Item = (&Vec<u8>, &Vec<Function>)> {
+        self.map.iter()
+    }
+
+    pub fn get(&mut self, key: &[u8]) -> Option<&mut Vec<Function>> {
+        let mut closest_key = None;
+        let mut closest_distance = std::f64::MAX;
+
+        let map = self.map.clone();
+
+        for (k, _) in map.iter() {
+            let distance = levenshtein_similarity(key, k, &mut self.cache);
+
+            if distance < closest_distance && distance >= self.threshold {
+                closest_key = Some(k);
+                closest_distance = distance;
+            }
+        }
+
+        if let Some(k) = closest_key {
+            self.map.get_mut(k)
+        } else {
+            None
+        }
+    }
+
+    pub fn insert(&mut self, key: Vec<u8>, mut value: Function) {
+        let mut closest_key = None;
+        let mut closest_distance = std::f64::MAX;
+
+        for k in self.map.keys() {
+            let distance = levenshtein_similarity(&key, k, &mut self.cache);
+
+            if distance < closest_distance && distance >= self.threshold {
+                closest_key = Some(k.clone());
+                closest_distance = distance;
+            }
+        }
+
+        if let Some(k) = closest_key {
+            let mut val = self.map.get_mut(&k);
+            value.similarity = closest_distance;
+            val.unwrap().push(value);
+        } else {
+            let mut my_vec: Vec<Function> = Vec::new();
+            value.similarity = 1.0;
+            my_vec.push(value);
+            self.map.insert(key, my_vec);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_insert_and_get_same_cluster() {
+        let mut map = LevenshteinHashMap::new(0.95);
+        let func1 = Function {
+            name: String::from("func1"),
+            ops: vec![],
+            key: vec![1, 2, 3],
+        };
+        let func2 = Function {
+            name: String::from("func2"),
+            ops: vec![],
+            key: vec![1, 2, 3],
+        };
+
+        map.insert(func1.key.clone(), func1.clone());
+        map.insert(func2.key.clone(), func2.clone());
+
+        // both functions should be in the same cluster
+        let result1 = map.get(&vec![1, 2, 3]);
+        assert_eq!(result1.expect("has items").len(), 2);
+    }
+
+    #[test]
+    fn test_insert_and_get_different_cluster() {
+        let mut map = LevenshteinHashMap::new(0.95);
+        let func1 = Function {
+            name: String::from("func1"),
+            ops: vec![],
+            key: vec![1, 2, 3],
+        };
+        let func2 = Function {
+            name: String::from("func2"),
+            ops: vec![],
+            key: vec![4, 5, 6],
+        };
+
+        map.insert(func1.key.clone(), func1.clone());
+        map.insert(func2.key.clone(), func2.clone());
+
+        // functions should be in different clusters
+        let result1 = map.get(&vec![1, 2, 3]);
+        assert_eq!(result1.expect("has items").len(), 1);
+
+        let result2 = map.get(&vec![4, 5, 6]);
+        assert_eq!(result2.expect("has items").len(), 1);
+    }
+}
--- a/tools/dups/src/main.rs
+++ b/tools/dups/src/main.rs
@ -1,23 +1,18 @@
-use std::fs;
-use std::io::Read;
 use std::env::*;
+use std::fs;
+use std::fs::File;
+use std::io::BufRead;
+use std::io::BufReader;
+use std::io::Read;
+use std::io::Write;
+use std::process::exit;

-#[derive(Clone)]
-pub struct Function {
-    name: String,
-    ops: Vec<Instruction>,
-    key: Vec<u8>,
-}
-
-#[derive(Clone)]
-pub struct Instruction {
-    file_addr: u64,
-    vram_addr: u64,
-    op: u32,
-}
-
+mod levenshtein_hashmap;
+mod types;
+use levenshtein_hashmap::LevenshteinHashMap;
+use types::{DupsFile, Function, Instruction};
 // parse .s file to get instructions and function name
-fn parse_instructions(input: &str) -> Function {
+fn parse_instructions(input: &str, dir: &str, file: &str) -> Function {
    let mut instructions = Vec::new();
    let mut func_name = "";

@ -38,12 +33,17 @@ fn parse_instructions(input: &str) -> Function {
        if let Ok(file_addr) = u64::from_str_radix(parts[1], 16) {
            if let Ok(vram_addr) = u64::from_str_radix(parts[2], 16) {
                if let Ok(op) = u32::from_str_radix(parts[3], 16) {
+                    // splat's output for the instruction is apparently little-endian
+                    let reversed_num = ((op >> 24) & 0xFF)
+                        | (((op >> 16) & 0xFF) << 8)
+                        | (((op >> 8) & 0xFF) << 16)
+                        | ((op & 0xFF) << 24);

                    // if the file address, vram address, and instruction parsed, add it
                    let instruction = Instruction {
                        file_addr,
                        vram_addr,
-                        op,
+                        op: reversed_num,
                    };

                    instructions.push(instruction);
@ -58,51 +58,76 @@ fn parse_instructions(input: &str) -> Function {
        .iter()
        .map(|num| (num.op >> 26) as u8)
        .collect();
+
    Function {
        ops: instructions,
        name: func_name.to_string(),
        key: key,
+        dir: dir.to_string(),
+        file: file.to_string(),
+        similarity: 0.0,
+        decompiled: false,
    }
 }

 fn process_directory(dir_path: &str, funcs: &mut Vec<Function>) {
-    let entries = std::fs::read_dir(dir_path).expect("Unable to read directory");
+    match std::fs::read_dir(dir_path) {
+        Ok(entries) => {
+            entries.for_each(|entry| {
+                if let Ok(entry) = entry {
+                    let item_path = entry.path();
+                    if item_path.is_file() && item_path.to_string_lossy().ends_with(".s") {
+                        println!("checking {:?}", item_path);

-    entries.for_each(|entry| {
-        if let Ok(entry) = entry {
-            let item_path = entry.path();
-            if item_path.is_file() && item_path.to_string_lossy().ends_with(".s") {
-                println!("checking {:?}", item_path);
+                        let mut file = fs::File::open(item_path.clone()).unwrap();
+                        let mut buffer = String::new();
+                        file.read_to_string(&mut buffer).unwrap();

-                let mut file = fs::File::open(item_path).unwrap();
-                let mut buffer = String::new();
-                file.read_to_string(&mut buffer).unwrap();
+                        let func =
+                            parse_instructions(&buffer, &dir_path, &item_path.to_string_lossy());

-                let func = parse_instructions(&buffer);
-                funcs.push(func.clone());
-            } else if item_path.is_dir() {
-                process_directory(&item_path.to_string_lossy(), funcs);
-            }
+                        // jr $ra, nop
+                        let is_null = func.ops.len() == 2
+                            && func.ops[0].op == 0x03E00008
+                            && func.ops[1].op == 0x00000000;
+                        if !is_null {
+                            funcs.push(func.clone());
+                        }
+                    } else if item_path.is_dir() {
+                        process_directory(&item_path.to_string_lossy(), funcs);
+                    }
+                }
+            });
        }
-    });
-}
-
-struct File {
-    name: String,
-    funcs: Vec<Function>,
+        Err(error) => {
+            eprintln!("Unable to read directory: {}", error);
+            println!("Directory path: {}", dir_path);
+            exit(1);
+        }
+    }
 }

 use clap::Parser;

 #[derive(Parser, Debug)]
-#[command(author, version, about, long_about = "\n
+#[command(
+    author,
+    version,
+    about,
+    long_about = "\n
 Finds duplicates in two asm directories and prints them out in order to identify patterns

 Usage:

 make force_extract
+
+Do a 2-way compare with ordering
 cargo run --release -- --dir ../../asm/us/st/nz0/nonmatchings/ --dir ../../asm/us/st/np3/nonmatchings/ --threshold .94
-")]
+
+Clustering report for all overlays
+cargo run --release -- --threshold .94 --output-file output.txt
+"
+)]

 struct Args {
    /// Levenshtein similarity threshold
@ -112,14 +137,288 @@ struct Args {
    /// Directory to parse asm from (2 required)
    #[arg(short, long)]
    dir: Vec<String>,
+
+    /// File to write output to
+    #[arg(short, long)]
+    output_file: Option<String>,
+
+    /// Base of source directory
+    #[arg(short, long)]
+    src_base: Option<String>,
 }

-fn main() {
-    let args = Args::parse();
+#[derive(Clone, Debug, PartialEq)]
+pub struct IncludeAsmEntry {
+    pub line: String,
+    pub path: String,
+}

-    let threshold = args.threshold;
-    let dirs = args.dir;
+fn process_directory_for_include_asm(dir: &str) -> Vec<IncludeAsmEntry> {
+    let entries = std::fs::read_dir(dir).expect("Unable to read directory");

+    let mut output = Vec::new();
+
+    entries.for_each(|entry| {
+        if let Ok(entry) = entry {
+            let item_path = entry.path();
+            if item_path.is_file() && item_path.to_string_lossy().ends_with(".c") {
+                println!("checking {:?}", item_path);
+
+                let file = File::open(item_path.clone()).expect("Unable to open file");
+                let reader = BufReader::new(file);
+                for line in reader.lines() {
+                    let line_str = line.unwrap();
+
+                    if line_str.contains("INCLUDE_ASM") {
+                        output.push(IncludeAsmEntry {
+                            line: line_str.clone(),
+                            path: item_path.to_string_lossy().to_string(),
+                        });
+                    }
+                }
+            } else if item_path.is_dir() {
+                process_directory_for_include_asm(&item_path.to_string_lossy());
+            }
+        }
+    });
+    output
+}
+
+fn get_all_include_asm(dir: &str) -> Vec<IncludeAsmEntry> {
+    process_directory_for_include_asm(dir)
+}
+#[derive(Clone)]
+struct SrcAsmPair {
+    asm_dir: String,
+    src_dir: String,
+    overlay_name: String,
+    include_asm: Vec<IncludeAsmEntry>,
+    path_matcher: String,
+}
+
+fn do_dups_report(output_file: Option<String>, threshold: f64) {
+    // full dups report
+    let mut hash_map = LevenshteinHashMap::new(threshold);
+
+    let mut files = Vec::new();
+
+    let pairs: Vec<SrcAsmPair> = vec![
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/dra/nonmatchings/"),
+            src_dir: String::from("../../src/dra/"),
+            overlay_name: String::from("DRA"),
+            include_asm: get_all_include_asm("../../src/dra/"),
+            path_matcher: "/dra/".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/main/nonmatchings/"),
+            src_dir: String::from("../../src/main/"),
+            overlay_name: String::from("MAIN"),
+            include_asm: get_all_include_asm("../../src/main/"),
+            path_matcher: "/main/".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/ric/nonmatchings/"),
+            src_dir: String::from("../../src/ric/"),
+            overlay_name: String::from("RIC"),
+            include_asm: get_all_include_asm("../../src/ric/"),
+            path_matcher: "/ric/".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/servant/tt_000/nonmatchings/"),
+            src_dir: String::from("../../src/servant/tt_000"),
+            overlay_name: String::from("RIC"),
+            include_asm: get_all_include_asm("../../src/servant/tt_000"),
+            path_matcher: "/tt_000/".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/cen/nonmatchings/"),
+            src_dir: String::from("../../src/st/cen/"),
+            overlay_name: String::from("CEN"),
+            include_asm: get_all_include_asm("../../src/st/cen/"),
+            path_matcher: "st/cen".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/dre/nonmatchings/"),
+            src_dir: String::from("../../src/st/dre/"),
+            overlay_name: String::from("DRE"),
+            include_asm: get_all_include_asm("../../src/st/dre/"),
+            path_matcher: "st/dre".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/mad/nonmatchings/"),
+            src_dir: String::from("../../src/st/mad/"),
+            overlay_name: String::from("MAD"),
+            include_asm: get_all_include_asm("../../src/st/mad/"),
+            path_matcher: "st/mad".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/no3/nonmatchings/"),
+            src_dir: String::from("../../src/st/no3/"),
+            overlay_name: String::from("NO3"),
+            include_asm: get_all_include_asm("../../src/st/no3/"),
+            path_matcher: "st/no3".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/np3/nonmatchings/"),
+            src_dir: String::from("../../src/st/np3/"),
+            overlay_name: String::from("NP3"),
+            include_asm: get_all_include_asm("../../src/st/np3/"),
+            path_matcher: "st/np3".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/nz0/nonmatchings/"),
+            src_dir: String::from("../../src/st/nz0/"),
+            overlay_name: String::from("NZ0"),
+            include_asm: get_all_include_asm("../../src/st/nz0/"),
+            path_matcher: "st/nz0".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/rwrp/nonmatchings/"),
+            src_dir: String::from("../../src/st/rwrp/"),
+            overlay_name: String::from("RWRP"),
+            include_asm: get_all_include_asm("../../src/st/rwrp/"),
+            path_matcher: "st/rwrp".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/sel/nonmatchings/"),
+            src_dir: String::from("../../src/st/sel/"),
+            overlay_name: String::from("SEL"),
+            include_asm: get_all_include_asm("../../src/st/sel/"),
+            path_matcher: "st/sel".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/st0/nonmatchings/"),
+            src_dir: String::from("../../src/st/st0/"),
+            overlay_name: String::from("ST0"),
+            include_asm: get_all_include_asm("../../src/st/st0/"),
+            path_matcher: "st/st0".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/st/wrp/nonmatchings/"),
+            src_dir: String::from("../../src/st/wrp/"),
+            overlay_name: String::from("WRP"),
+            include_asm: get_all_include_asm("../../src/st/wrp/"),
+            path_matcher: "st/wrp".to_string(),
+        },
+        SrcAsmPair {
+            asm_dir: String::from("../../asm/us/weapon/nonmatchings/"),
+            src_dir: String::from("../../src/weapon/"),
+            overlay_name: String::from("WEAPON"),
+            include_asm: get_all_include_asm("../../src/weapon/"),
+            path_matcher: "/weapon/".to_string(),
+        },
+    ];
+
+    for pair in pairs.clone() {
+        let dir = pair.asm_dir;
+        let mut funcs = Vec::new();
+        process_directory(&dir, &mut funcs);
+
+        // sort functions by vram address
+        funcs.sort_by_key(|function| {
+            function
+                .ops
+                .first()
+                .map_or(u64::MAX, |instr| instr.vram_addr)
+        });
+
+        files.push(DupsFile {
+            name: dir.to_string(),
+            funcs: funcs.clone(),
+        });
+    }
+
+    for file in &files {
+        println!("file {}", file.name);
+        for func in &file.funcs {
+            println!("\t{} {}", func.name, func.ops.len());
+        }
+    }
+
+    for file in &files {
+        for func in &file.funcs {
+            hash_map.insert(func.key.clone(), func.clone());
+        }
+    }
+
+    let mut entries: Vec<(&Vec<u8>, &Vec<Function>)> = hash_map.map.iter().collect();
+
+    // sort by filename
+    entries.sort_by(|(_, functions1), (_, functions2)| functions1[0].file.cmp(&functions2[0].file));
+
+    // Then sort by the length of functions in reverse order
+    entries.sort_by_key(|(_, functions)| std::cmp::Reverse(functions.len()));
+
+    if let o_file = output_file.unwrap() {
+        let mut output_file = File::create(o_file).expect("Unable to create file");
+        writeln!(
+            output_file,
+            "| {:<4} | {:<8} | {:<35} | {:<2} ",
+            "%", "Decomp?", "Name", "Asm Path"
+        )
+        .expect("Error writing to file");
+
+        for (_, functions) in entries {
+            if functions.len() > 1 {
+                // Write separator to file
+                writeln!(output_file, "-------------------------------------------------------------------------------")
+                        .expect("Error writing to file");
+
+                let mut temp_functions = functions.clone();
+
+                // sort by the filename then the similarity
+                temp_functions.sort_by(|a, b| {
+                    let file_cmp = a.file.cmp(&b.file);
+                    if file_cmp != std::cmp::Ordering::Equal {
+                        return file_cmp;
+                    }
+
+                    a.similarity
+                        .partial_cmp(&b.similarity)
+                        .unwrap_or(std::cmp::Ordering::Equal)
+                });
+
+                for function in &mut temp_functions {
+                    // Write function details to file
+                    let mut decompiled = true;
+
+                    for pair in &pairs.clone() {
+                        if function.file.contains(&pair.path_matcher) {
+                            for inc in &pair.include_asm {
+                                if inc.line.contains(&function.name) {
+                                    decompiled = false;
+                                }
+                            }
+                        }
+                    }
+
+                    writeln!(
+                        output_file,
+                        "| {:<4.2} | {:<8} | {:<35} | {:<2} ",
+                        function.similarity, decompiled, function.name, function.file
+                    )
+                    .expect("Error writing to file");
+                }
+            }
+        }
+    } else {
+        for (_, functions) in entries {
+            if functions.len() > 1 {
+                println!("------------------------");
+
+                for function in functions {
+                    println!(
+                        "{:.2} {:?} {:?} {:?}",
+                        function.similarity, function.decompiled, function.name, function.file
+                    );
+                }
+            }
+        }
+    }
+}
+
+fn do_ordered_compare(dirs: Vec<String>, threshold: f64) {
    let mut files = Vec::new();

    for dir in dirs {
@ -134,7 +433,7 @@ fn main() {
                .map_or(u64::MAX, |instr| instr.vram_addr)
        });

-        files.push(File {
+        files.push(DupsFile {
            name: dir.to_string(),
            funcs: funcs.clone(),
        });
@ -147,6 +446,7 @@ fn main() {
        }
    }

+    // 2 way comparison for determining patterns in overlays
    let mut pairs: Vec<Vec<Function>> = Vec::new();

    // print out all found duplicates with their similarity values
@ -160,7 +460,13 @@ fn main() {
            let result = levenshtein_similarity(&func_0.key, &func_1.key);

            if result >= threshold {
-                println!("{:<width$} | {:<width$} | {:<width$}", func_0.name, func_1.name, result, width = 40);
+                println!(
+                    "{:<width$} | {:<width$} | {:<width$}",
+                    func_0.name,
+                    func_1.name,
+                    result,
+                    width = 40
+                );
                let mut temp = Vec::new();
                temp.push(func_0.clone());
                temp.push(func_1.clone());
@ -197,6 +503,22 @@ fn main() {
    }
 }

+fn main() {
+    let args = Args::parse();
+
+    let threshold = args.threshold;
+    let dirs = args.dir;
+    let output_file = args.output_file;
+    let num_dirs = dirs.len();
+    let src_base_dir = args.src_base;
+
+    if num_dirs == 2 {
+        do_ordered_compare(dirs, threshold);
+    } else {
+        do_dups_report(output_file, threshold);
+    }
+}
+
 fn levenshtein_similarity(s1: &[u8], s2: &[u8]) -> f64 {
    let len1 = s1.len();
    let len2 = s2.len();
--- a/tools/dups/src/types.rs
+++ b/tools/dups/src/types.rs
@ -0,0 +1,22 @@
+#[derive(Clone, Debug, PartialEq)]
+pub struct Function {
+    pub name: String,
+    pub ops: Vec<Instruction>,
+    pub key: Vec<u8>,
+    pub dir: String,
+    pub file: String,
+    pub similarity: f64,
+    pub decompiled: bool,
+}
+
+#[derive(Clone, Debug, PartialEq)]
+pub struct Instruction {
+    pub file_addr: u64,
+    pub vram_addr: u64,
+    pub op: u32,
+}
+
+pub struct DupsFile {
+    pub name: String,
+    pub funcs: Vec<Function>,
+}