From 17b3aa0e4fd80d14872452d45ae6453f0762e5fc Mon Sep 17 00:00:00 2001 From: Filipe Rodrigues Date: Wed, 28 Oct 2020 17:27:25 +0000 Subject: [PATCH] Added `dcb::util::merge_iter`. Improved `dcb::game::exe::Func` yet again. --- dcb-tools/Cargo.toml | 1 + dcb-tools/src/decompiler/main.rs | 117 ++++++++------ dcb/Cargo.toml | 2 + dcb/src/game/exe/func.rs | 258 +++++++++++++++++++++++++------ dcb/src/game/exe/func/iter.rs | 69 +++++++++ dcb/src/lib.rs | 3 +- dcb/src/util.rs | 1 + dcb/src/util/merge_iter.rs | 67 ++++++++ 8 files changed, 420 insertions(+), 98 deletions(-) create mode 100644 dcb/src/game/exe/func/iter.rs create mode 100644 dcb/src/util/merge_iter.rs diff --git a/dcb-tools/Cargo.toml b/dcb-tools/Cargo.toml index 7c108f2..bd5025f 100644 --- a/dcb-tools/Cargo.toml +++ b/dcb-tools/Cargo.toml @@ -30,6 +30,7 @@ float-ord = "0.2" itertools = "0.9" rand = "0.7" ref-cast = "1.0" +maplit = "1.0" # Cmd clap = "2.33" diff --git a/dcb-tools/src/decompiler/main.rs b/dcb-tools/src/decompiler/main.rs index 887315b..d6e5b1f 100644 --- a/dcb-tools/src/decompiler/main.rs +++ b/dcb-tools/src/decompiler/main.rs @@ -8,7 +8,8 @@ array_value_iter, array_chunks, format_args_capture, - or_patterns + or_patterns, + bindings_after_at )] // Lints #![warn(clippy::restriction, clippy::pedantic, clippy::nursery)] @@ -73,27 +74,19 @@ mod cli; #[path = "../logger.rs"] mod logger; -// Exports -use std::collections::{HashMap, HashSet}; - // Imports use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use dcb::{ game::exe::{ - instruction::{ - Directive, - PseudoInstruction::{self, Nop}, - Raw, Register, SimpleInstruction, - }, + func::Funcs, + instruction::{Directive, PseudoInstruction::Nop, Raw, Register, SimpleInstruction}, Instruction, Pos, }, GameFile, }; -use itertools::Itertools; -use ref_cast::RefCast; -#[allow(clippy::too_many_lines)] // TODO: Refactor +#[allow(clippy::cognitive_complexity, clippy::too_many_lines)] // TODO: Refactor fn main() -> Result<(), anyhow::Error> { // Initialize the logger and set the panic handler logger::init(); @@ -106,9 +99,11 @@ fn main() -> Result<(), anyhow::Error> { let mut game_file = GameFile::from_reader(input_file).context("Unable to parse input file as dcb")?; // Read the executable + log::debug!("Deserializing executable"); let exe = dcb::game::Exe::deserialize(&mut game_file).context("Unable to parse game executable")?; // Get all instructions + log::debug!("Retrieving all instructions"); let instructions: Vec<(Pos, Instruction)> = Instruction::new_iter( exe.data .array_chunks::<4>() @@ -121,10 +116,22 @@ fn main() -> Result<(), anyhow::Error> { ) .collect(); + // Get all functions + log::debug!("Retrieving all functions"); + let functions: Funcs = Funcs::known() + .into_string() + .merge(Funcs::from_instructions( + instructions.iter().map(|(pos, instruction)| (*pos, instruction)), + )) + .collect(); + + /* // All instruction offsets + log::debug!("Retrieving all offsets"); let offsets: HashSet = instructions.iter().map(|(offset, _)| offset).copied().collect(); // All data / string addresses + log::debug!("Retrieving all data / strings addresses"); let data_string_addresses: HashSet = instructions .iter() .filter_map(|(_, instruction)| match instruction { @@ -149,20 +156,8 @@ fn main() -> Result<(), anyhow::Error> { }) .collect(); - // Get all function jumps - let funcs_pos: HashMap = instructions - .iter() - .filter_map(|(_, instruction)| match *instruction { - Instruction::Simple(SimpleInstruction::Jal { target }) => Some(target), - Instruction::Directive(Directive::Dw(target) | Directive::DwRepeated { value: target, .. }) => Some(Pos(target)), - _ => None, - }) - .filter(|target| (Instruction::CODE_START..Instruction::CODE_END).contains(target) && offsets.contains(target)) - .unique() - .zip(0..) - .collect(); - // Get all local jumps + log::debug!("Retrieving all local jumps"); let locals_pos: HashMap = instructions .iter() .filter_map(|(_, instruction)| match *instruction { @@ -187,16 +182,8 @@ fn main() -> Result<(), anyhow::Error> { .zip(0..) .collect(); - // Get all returns - let return_pos: HashSet = instructions - .iter() - .filter_map(|(cur_pos, instruction)| match instruction { - Instruction::Simple(SimpleInstruction::Jr { rs: Register::Ra }) => Some(*cur_pos), - _ => None, - }) - .collect(); - // Get all strings + log::debug!("Retrieving all strings"); let strings_pos: HashMap = instructions .iter() .filter_map(|(cur_pos, instruction)| match instruction { @@ -209,6 +196,7 @@ fn main() -> Result<(), anyhow::Error> { .collect(); // Get all data + log::debug!("Retrieving all data"); let data_pos: HashMap = instructions .iter() .filter_map(|(cur_pos, instruction)| match instruction { @@ -219,14 +207,23 @@ fn main() -> Result<(), anyhow::Error> { .unique() .zip(0..) .collect(); + */ + // Build the full instructions iterator + let full_iter = functions + .with_instructions(instructions.iter().map(|(pos, instruction)| (*pos, instruction))) + .scan(None, |last_instruction, output @ (_, cur_instruction, _)| { + Some((output, last_instruction.replace(cur_instruction))) + }); // Read all instructions - let mut last_instruction = None; let mut skipped_nops = 0; - for (offset, instruction) in &instructions { + for ((cur_pos, instruction, cur_func), last_instruction) in full_iter { + // Note: Required by `rust-analyzer` currently, it can't determine the type of `cur_func`. + let cur_func: Option<&dcb::game::exe::Func> = cur_func; + // If both last and current instructions are nops, skip - if let (Some(&Instruction::Pseudo(Nop)), Instruction::Pseudo(Nop)) = (last_instruction, instruction) { + if let (Some(Instruction::Pseudo(Nop)), Instruction::Pseudo(Nop)) = (last_instruction, instruction) { skipped_nops += 1; continue; } @@ -239,9 +236,18 @@ fn main() -> Result<(), anyhow::Error> { } // Check if we need to prefix - if let Some(func_idx) = funcs_pos.get(offset) { - println!("\n\tfunc_{func_idx}:"); + match cur_func { + Some(cur_func) if cur_func.start_pos == cur_pos => { + println!("####################"); + println!("{}:", cur_func.name); + println!("# {}\n#", cur_func.signature); + for description in cur_func.desc.lines() { + println!("# {}", description); + } + }, + _ => (), } + /* if let Some(local_idx) = locals_pos.get(offset) { println!("\t.{local_idx}:"); } @@ -251,9 +257,10 @@ fn main() -> Result<(), anyhow::Error> { if let Some(data_idx) = data_pos.get(offset) { println!("\tdata_{data_idx}:"); } + */ // Print the instruction - print!("{offset:#010x}: {instruction}"); + print!("{cur_pos:#010x}: {instruction}"); // Check if we should have any comments with this instruction // TODO: Add Pseudo jumps too @@ -272,12 +279,14 @@ fn main() -> Result<(), anyhow::Error> { SimpleInstruction::Bgezal { target, .. }, ) => { print!(" #"); - if let Some(func_idx) = funcs_pos.get(target) { - print!(" func_{func_idx}"); + if let Some(func) = functions.get(*target) { + print!(" {}", func.name); } + /* if let Some(local_idx) = locals_pos.get(target) { print!(" .{local_idx}"); } + */ }, // Comment returns @@ -285,6 +294,7 @@ fn main() -> Result<(), anyhow::Error> { print!(" # Return"); }, + /* // Comment loading address, loading and writing values of string and data // TODO: Maybe check loads / writes to halfway between // the strings / data. @@ -305,20 +315,23 @@ fn main() -> Result<(), anyhow::Error> { PseudoInstruction::SwrImm { offset, .. }, ) => { print!(" #"); + /* if let Some(string_idx) = strings_pos.get(Pos::ref_cast(offset)) { print!(" string_{string_idx}"); } if let Some(data_idx) = data_pos.get(Pos::ref_cast(offset)) { print!(" data_{data_idx}"); } + */ }, - + */ // Comment `dw`s with both function and data Instruction::Directive(Directive::Dw(offset) | Directive::DwRepeated { value: offset, .. }) => { print!(" #"); - if let Some(func_idx) = funcs_pos.get(Pos::ref_cast(offset)) { - print!(" func_{func_idx}"); + if let Some(func) = functions.get(Pos(*offset)) { + print!(" {}", func.name); } + /* if let Some(local_idx) = locals_pos.get(Pos::ref_cast(offset)) { print!(" .{local_idx}"); } @@ -328,20 +341,26 @@ fn main() -> Result<(), anyhow::Error> { if let Some(data_idx) = data_pos.get(Pos::ref_cast(offset)) { print!(" data_{data_idx}"); } + */ }, _ => (), } + // Append any comments in this line + if let Some(cur_func) = cur_func { + if let Some(comment) = cur_func.comments.get(&cur_pos) { + print!(" {comment}"); + } + } // And finish the line println!(); - // If the _last_ instruction was a return, print a newline after this one - if return_pos.contains(&(offset - 4)) { + // If the last instruction was a `return` and we have a function, space it out + if let (Some(Instruction::Simple(SimpleInstruction::Jr { rs: Register::Ra })), Some(_cur_func)) = (last_instruction, cur_func) { println!(); + println!("####################"); } - - last_instruction = Some(instruction); } Ok(()) diff --git a/dcb/Cargo.toml b/dcb/Cargo.toml index 5245897..154f72a 100644 --- a/dcb/Cargo.toml +++ b/dcb/Cargo.toml @@ -19,6 +19,8 @@ arrayref = "0.3" int-conv = "0.1" indoc = "1.0" bitmatch = "0.1" +maplit = "1.0" +either = "1.6" # Serde serde = { version = "1.0", features = ["derive"] } diff --git a/dcb/src/game/exe/func.rs b/dcb/src/game/exe/func.rs index cac051c..bc058d5 100644 --- a/dcb/src/game/exe/func.rs +++ b/dcb/src/game/exe/func.rs @@ -1,66 +1,228 @@ //! Executable functions +// Modules +pub mod iter; + +// Exports +pub use iter::WithInstructionsIter; + // Imports -use crate::game::exe::Pos; +use crate::{ + game::exe::{ + instruction::{Directive, Register, SimpleInstruction}, + Instruction, Pos, + }, + util::merge_iter::MergeSortedIter, +}; +use maplit::hashmap; +use std::{ + collections::{BTreeSet, HashMap}, + iter::FromIterator, + vec, +}; /// A function within the executable -#[derive(PartialEq, Eq, Clone, Hash, Debug)] +#[derive(Clone, Debug)] #[derive(serde::Serialize, serde::Deserialize)] -pub struct Func, C: AsRef<[(Pos, S)]>> { +pub struct Func> { + /// Function name + pub name: S, + /// Function signature - signature: S, + pub signature: S, /// Description - desc: S, + pub desc: S, /// Comments - comments: C, + pub comments: HashMap, /// Start position - start_pos: Pos, + pub start_pos: Pos, /// End position (non-inclusive) - end_pos: Pos, + pub end_pos: Pos, } -impl Func<&'static str, &'static [(Pos, &'static str)]> { - /// List of all known functions - pub const ALL: &'static [Self] = &[ - Self { - signature: "void InitHeap(int* addr, unsigned int size)", - desc: "Calls A(0x39)", - comments: &[], - start_pos: Pos(0x8006a734), - end_pos: Pos(0x8006a744), - }, - Self { - signature: "void start(void)", - desc: "Executable start", - comments: &[ - (Pos(0x80056280), "Zero out 0x80077a08 .. 0x801ddf38 word by word."), - (Pos(0x80056284), "^"), - (Pos(0x80056288), "^"), - (Pos(0x8005628c), "^"), - (Pos(0x800562f8), "InitHeap(0x8007f988, ???)"), - (Pos(0x8005630c), "func_1025(0x8007f98c)"), - (Pos(0x80056324), "func_1026(string_0, string_0)"), - ], - start_pos: Pos(0x80056270), - end_pos: Pos(0x80056330), - }, - Self { - signature: "void func_1025(int*)", - desc: "", - comments: &[(Pos(0x80013ef4), "Called indefinitely?"), (Pos(0x80013efc), "^ Due to this loop")], - start_pos: Pos(0x80013e4c), - end_pos: Pos(0x80013f04), - }, - Self { - signature: "int func_446(int)", - desc: "", - comments: &[], - start_pos: Pos(0x80069124), - end_pos: Pos(0x80069150), - }, - ]; +impl> PartialEq for Func { + fn eq(&self, other: &Self) -> bool { + // Only compare the start position + self.start_pos.eq(&other.start_pos) + } +} + +impl> Eq for Func {} + +impl> PartialOrd for Func { + fn partial_cmp(&self, other: &Self) -> Option { + // Delegate to `eq` since we have a total order. + Some(self.cmp(other)) + } +} +impl> Ord for Func { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Only compare the start position + self.start_pos.cmp(&other.start_pos) + } +} + +/// A sorted list of functions by their start address. +pub struct Funcs>(Vec>); + +impl> FromIterator> for Funcs { + fn from_iter>>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl> Funcs { + /// Merges two function lists, discarding any duplicates + /// from `other`. + #[must_use] + pub fn merge(self, other: Self) -> MergeSortedIter, vec::IntoIter>, vec::IntoIter>> { + MergeSortedIter::new(self.0.into_iter(), other.0.into_iter()) + } + + /// Adapts an instruction iterator to extract the current function + pub fn with_instructions<'a, I: Iterator>(&'a self, instructions: I) -> WithInstructionsIter<'a, S, I> { + WithInstructionsIter::new(instructions, self) + } + + /// Retrieves a function with start address `pos` + #[must_use] + pub fn get(&self, pos: Pos) -> Option<&Func> { + // Note: As we're sorted, we can binary search + self.0 + .binary_search_by(|func| func.start_pos.cmp(&pos)) + .ok() + .and_then(|idx| self.0.get(idx)) + } +} + +#[allow(clippy::use_self)] // We're not using `Funcs`, but `Funcs` +impl + Into> Funcs { + /// Converts all strings to `String`. + #[must_use] + pub fn into_string(self) -> Funcs { + Funcs( + self.0 + .into_iter() + .map(|func| Func { + name: func.name.into(), + signature: func.signature.into(), + desc: func.desc.into(), + comments: func.comments.into_iter().map(|(pos, comment)| (pos, comment.into())).collect(), + start_pos: func.start_pos, + end_pos: func.end_pos, + }) + .collect(), + ) + } +} + + +impl Funcs<&'static str> { + /// Returns all known functions + #[must_use] + pub fn known() -> Self { + let mut functions = vec![ + Func { + name: "InitHeap", + signature: "void(int* addr, unsigned int size)", + desc: "Calls A(0x39)", + comments: hashmap! {}, + start_pos: Pos(0x8006a734), + end_pos: Pos(0x8006a744), + }, + Func { + name: "start", + signature: "void(void)", + desc: "Executable start", + comments: hashmap! { + Pos(0x80056280) => "Zero out 0x80077a08 .. 0x801ddf38 word by word.", + Pos(0x80056284) => "^", + Pos(0x80056288) => "^", + Pos(0x8005628c) => "^", + Pos(0x800562f8) => "InitHeap(0x8007f988, ???)", + Pos(0x8005630c) => "func_1025(0x8007f98c)", + Pos(0x80056324) => "func_1026(string_0, string_0)", + }, + start_pos: Pos(0x80056270), + end_pos: Pos(0x80056330), + }, + Func { + name: "func_1025", + signature: "void(int*)", + desc: "", + comments: hashmap! { + Pos(0x80013ef4) => "Called indefinitely?", + Pos(0x80013efc) => "^ Due to this loop" + }, + start_pos: Pos(0x80013e4c), + end_pos: Pos(0x80013f04), + }, + Func { + name: "func_446", + signature: "int(int)", + desc: "", + comments: hashmap! {}, + start_pos: Pos(0x80069124), + end_pos: Pos(0x80069150), + }, + ]; + + functions.sort_by(|lhs, rhs| lhs.start_pos.cmp(&rhs.start_pos)); + Self(functions) + } +} + +impl Funcs { + /// Creates a new list of functions from an iterator over instructions + #[must_use] + pub fn from_instructions<'a>(instructions: impl Iterator + Clone) -> Self { + // Get all instruction offsets present, ignoring directives. + let offsets: BTreeSet = instructions + .clone() + .filter_map(|(pos, instruction)| match instruction { + Instruction::Directive(_) => None, + _ => Some(pos), + }) + .collect(); + + // Get all returns + let returns: BTreeSet = instructions + .clone() + .filter_map(|(pos, instruction)| match instruction { + Instruction::Simple(SimpleInstruction::Jr { rs: Register::Ra }) => Some(pos), + _ => None, + }) + .collect(); + + // Now get every function entrance from jumps and `dw`s. + let function_entrances: BTreeSet = instructions + .filter_map(|(_, instruction)| match instruction { + Instruction::Simple(SimpleInstruction::Jal { target }) => Some(*target), + Instruction::Directive(Directive::Dw(target) | Directive::DwRepeated { value: target, .. }) => Some(Pos(*target)), + _ => None, + }) + .filter(|target| (Instruction::CODE_START..Instruction::CODE_END).contains(target) && offsets.contains(target)) + .collect(); + + // Now combine the function entrances and exits. + // Note: functions will be sorted, as + let functions = function_entrances + .iter() + .zip(0..) + .map(|(&target, idx)| Func { + name: format!("func_{idx}"), + signature: "".to_string(), + desc: "".to_string(), + comments: hashmap![], + start_pos: target, + end_pos: returns.range(target..).next().copied().unwrap_or(Pos(0xFFFFFFFF)), + }) + .collect(); + + Self(functions) + } } diff --git a/dcb/src/game/exe/func/iter.rs b/dcb/src/game/exe/func/iter.rs new file mode 100644 index 0000000..bd48e79 --- /dev/null +++ b/dcb/src/game/exe/func/iter.rs @@ -0,0 +1,69 @@ +//! Iterators + +// Imports +use super::{Func, Funcs}; +use crate::game::exe::{ + instruction::{Register, SimpleInstruction}, + Instruction, Pos, +}; + +/// Iterator of instructions along with the current function +pub struct WithInstructionsIter<'a, S: AsRef, I: Iterator> { + /// The instructions iterator + instructions: I, + + /// All functions + funcs: &'a Funcs, + + /// Last instruction + last_instruction: Option<&'a Instruction>, + + /// Current function + cur_func: Option<&'a Func>, +} + +impl<'a, S: AsRef, I: Iterator> WithInstructionsIter<'a, S, I> { + /// Creates a new instructions iterator + pub(super) fn new(instructions: I, funcs: &'a Funcs) -> Self { + Self { + instructions, + funcs, + last_instruction: None, + cur_func: None, + } + } +} + + +impl<'a, S: AsRef, I: Iterator> Iterator for WithInstructionsIter<'a, S, I> { + type Item = (Pos, &'a Instruction, Option<&'a Func>); + + fn next(&mut self) -> Option { + let (pos, instruction) = self.instructions.next()?; + + // Update our last instruction + let last_instruction = self.last_instruction.replace(instruction); + + // Check if we had a return last instruction + if let Some(Instruction::Simple(SimpleInstruction::Jr { rs: Register::Ra })) = last_instruction { + // Set our cur function to `None` and return it + let cur_func = self.cur_func.take(); + return Some((pos, instruction, cur_func)); + } + + // Else check if we have a current function + match self.cur_func { + // If we go, return it + Some(cur_func) => Some((pos, instruction, Some(cur_func))), + + // Else check if we're at the start of a new function. + None => match self.funcs.get(pos) { + Some(cur_func) => { + self.cur_func = Some(cur_func); + Some((pos, instruction, Some(cur_func))) + }, + None => Some((pos, instruction, None)), + }, + } + } +} diff --git a/dcb/src/lib.rs b/dcb/src/lib.rs index f850562..d245ad6 100644 --- a/dcb/src/lib.rs +++ b/dcb/src/lib.rs @@ -49,7 +49,8 @@ core_intrinsics, const_assume, bindings_after_at, - array_value_iter + array_value_iter, + or_patterns )] // Lints #![warn(clippy::restriction, clippy::pedantic, clippy::nursery)] diff --git a/dcb/src/util.rs b/dcb/src/util.rs index e124832..cc9d127 100644 --- a/dcb/src/util.rs +++ b/dcb/src/util.rs @@ -11,6 +11,7 @@ pub mod array_split; pub mod null_ascii_string; #[macro_use] pub mod impl_bytes; +pub mod merge_iter; pub mod signed_hex; // Exports diff --git a/dcb/src/util/merge_iter.rs b/dcb/src/util/merge_iter.rs new file mode 100644 index 0000000..4d02239 --- /dev/null +++ b/dcb/src/util/merge_iter.rs @@ -0,0 +1,67 @@ +//! Merging iterator + +// Imports +use either::Either; +use std::cmp::Ordering; + +/// Merging sorted iterator +/// +/// Will discard duplicate items. +pub struct MergeSortedIter, Ri: Iterator> { + /// Left iterator + lhs: Li, + + /// Right iterator + rhs: Ri, + + /// Last element stored + last: Option>, +} + +impl, Ri: Iterator> MergeSortedIter { + /// Creates a new merging iterator + pub fn new(lhs: Li, rhs: Ri) -> Self { + Self { lhs, rhs, last: None } + } + + /// Chooses between two values, storing the larger one and + /// discarding the `rhs` value if equal. + /// + /// `self.last` must not be populated. + fn cmp_next(&mut self, lhs: T, rhs: T) -> T { + match lhs.cmp(&rhs) { + // Note: Discard rhs + Ordering::Equal => lhs, + Ordering::Less => { + self.last = Some(Either::Right(rhs)); + lhs + }, + Ordering::Greater => { + self.last = Some(Either::Left(lhs)); + rhs + }, + } + } +} + +impl, Ri: Iterator> Iterator for MergeSortedIter { + type Item = T; + + fn next(&mut self) -> Option { + match self.last.take() { + Some(Either::Left(lhs)) => match self.rhs.next() { + Some(rhs) => Some(self.cmp_next(lhs, rhs)), + None => Some(lhs), + }, + Some(Either::Right(rhs)) => match self.lhs.next() { + Some(lhs) => Some(self.cmp_next(lhs, rhs)), + None => Some(rhs), + }, + None => match (self.lhs.next(), self.rhs.next()) { + (None, None) => None, + (None, Some(func)) | (Some(func), None) => Some(func), + (Some(lhs), Some(rhs)) => Some(self.cmp_next(lhs, rhs)), + }, + } + } +}