diff --git a/README.md b/README.md index 90dcaf0..c498859 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,6 @@ To build and run this you will need: * protoc * The following OPAM packages: * ocaml-protoc-plugin - * hexstring * base64 To massively simplify this, simply run ```scripts/build-all.sh``` in your preferred install directory. This script assumes a completely fresh Ubuntu 20.04.5 installation. It is advised to run this script within a fresh VM but it should work on established installations. A complete installation can take several hours and will prompt for the sudo password at least once. diff --git a/bin/dune b/bin/dune index 493907a..a29b42a 100644 --- a/bin/dune +++ b/bin/dune @@ -1,4 +1,8 @@ (executable (public_name gtirb_semantics) (name main) - (libraries base64 yojson gtirb_semantics asli.libASL)) + (modules main) + (libraries gtirb_semantics base64 yojson asli.libASL llvm_disas) + (flags :standard (:include ../llvm-disas/llvmconf.sexp)) +) + diff --git a/bin/main.ml b/bin/main.ml index 278799f..cf71d3d 100644 --- a/bin/main.ml +++ b/bin/main.ml @@ -5,9 +5,11 @@ open Gtirb_semantics.Module.Gtirb.Proto open Gtirb_semantics.Section.Gtirb.Proto open Gtirb_semantics.CodeBlock.Gtirb.Proto open Gtirb_semantics.AuxData.Gtirb.Proto +open Gtirb_semantics.Symbol.Gtirb.Proto open LibASL open Bytes open List +open Llvm_disas (* TYPES *) @@ -22,12 +24,24 @@ type rectified_block = { size : int; } +type instruction_semantics = { + address: int; + opcode_le: string; + opcode_be: string; + readable: string option; + statementlist: string list; + pretty_statementlist: string list; +} + (* ASLi semantic info for a block *) type ast_block = { auuid : bytes; - asts : string list list; + label: string option; + address : int; + asts : instruction_semantics list; } + (* Wrapper for polymorphic code/data/not-set block pre-rectification *) type content_block = { block : Block.t; @@ -50,10 +64,6 @@ let usage_string = "GTIRB_FILE OUTPUT_FILE" let ast = "ast" (*let text = ".text"*) -(* JSON parsing/building *) -let hex = "0x" - - (* MAIN *) let () = (* Convenience *) @@ -137,6 +147,13 @@ let () = mapmap (fun b -> {b with opcodes = cut_ops b.contents}) trimmed in + let symmap = Hashtbl.create (List.fold_left (+) 0 (List.map (fun (m: Module.t) -> List.length m.symbols) ir.modules)) in + List.iter (fun (m: Module.t) -> (List.iter (fun (s: Symbol.t) -> + match s.optional_payload with + | `Referent_uuid b -> Hashtbl.add symmap b s + | _ -> () + ) m.symbols)) + modules; (* Convert every opcode to big endianness *) let blk_orded : rectified_block list list = let need_flip = map (fun (m : Module.t) @@ -156,7 +173,7 @@ let () = in (* hashtable for memoising disassembly results by opcode. *) - let tbl : (bytes, string list) Hashtbl.t = Hashtbl.create 10000 in + let tbl : (bytes, ((string list) * (string list))) Hashtbl.t = Hashtbl.create 10000 in let tbl_update k f = match Hashtbl.find_opt tbl k with | Some x -> x @@ -171,21 +188,39 @@ let () = in (* Evaluate each instruction one by one with a new environment for each *) - let to_asli (op: bytes) (addr : int) : string list = - let p_raw a = Utils.to_string (Asl_parser_pp.pp_raw_stmt a) |> String.trim in - let address = Some (string_of_int addr) in - let str = hex ^ Hexstring.encode op in - let str_bytes = Printf.sprintf "%08lX" (Bytes.get_int32_le op 0) in + let to_asli (opcode_be: bytes) (addr : int) : instruction_semantics = + let p_raw a = Utils.to_string (Asl_parser_pp.pp_raw_stmt a) |> String.trim in + let p_pretty a = Asl_utils.pp_stmt a |> String.trim in + let p_byte (b: char) = Printf.sprintf "%02X" (Char.code b) in + let address = Some (string_of_int addr) in + + (* below, opnum is the numeric opcode (necessarily BE) and opcode_* are always LE. *) + (* TODO: change argument of to_asli to follow this convention. *) + let opnum = Int32.to_int Bytes.(get_int32_be opcode_be 0) in + let opnum_str = Printf.sprintf "0x%08lx" Int32.(of_int opnum) in + + let opcode_list : char list = List.(rev @@ of_seq @@ Bytes.to_seq opcode_be) in + let opcode_str = String.concat " " List.(map p_byte opcode_list) in + let opcode : bytes = Bytes.of_seq List.(to_seq opcode_list) in let do_dis () = - (match (Dis.retrieveDisassembly ?address env (Dis.build_env env) str) with - | res -> map (fun x -> p_raw x) res + (match Dis.retrieveDisassembly ?address env (Dis.build_env env) opnum_str with + | res -> (map p_raw res, map p_pretty res) | exception exc -> Printf.eprintf "error during aslp disassembly (opcode %s, bytes %s):\n\nFatal error: exception %s\n" - str str_bytes (Printexc.to_string exc); + opnum_str opcode_str (Printexc.to_string exc); Printexc.print_backtrace stderr; exit 1) - in tbl_update op do_dis + in + let insns_raw, insns_pretty = tbl_update opcode_be (do_dis) in + { + address = addr; + opcode_be = opnum_str; + opcode_le = opcode_str; + readable = assembly_of_bytes_opt opcode; + statementlist = insns_raw; + pretty_statementlist = insns_pretty; + } in let rec asts opcodes addr = match opcodes with @@ -195,18 +230,43 @@ let () = let with_asts = mapmap (fun b -> { auuid = b.ruuid; + address = b.address; asts = (asts b.opcodes b.address); + label = Option.map (fun (s: Symbol.t) -> s.name) (Hashtbl.find_opt symmap b.ruuid) }) blk_orded in (* Massage asli outputs into a format which can be serialised and then deserialised by other tools *) + let yojson_instsem (s: instruction_semantics) = + `Assoc (List.append (match s.readable with + | Some x -> [("assembly", `String x)] + | None -> []) + [ ("addr", `Int s.address); + ("opcode_le", `String s.opcode_le); ("opcode_be", `String s.opcode_be); + ("semantics", `List (List.map (fun s -> `String s) s.statementlist)); + ("pretty_semantics", `List (List.map (fun s -> `String s) s.pretty_statementlist)); + ] + ) + in let serialisable: string list = - let to_list x = `List x in - let jsoned (asts: string list list ) : Yojson.Safe.t = mapmap (fun s -> `String s) asts |> map to_list |> to_list in + let to_list x = `List x in + let jsoned (asts: instruction_semantics list ) : Yojson.Safe.t = map (fun s -> yojson_instsem s) asts |> to_list in (*let quote bin = strung ^ (Bytes.to_string bin) ^ strung in *) - let paired: Yojson.Safe.t list = (map (fun l -> `Assoc (map (fun b -> (((Base64.encode_exn (Bytes.to_string b.auuid))), (jsoned b.asts))) l)) with_asts) in - map (fun j -> Yojson.Safe.to_string j) paired + let paired: Yojson.Safe.t list = (map (fun l -> `Assoc (map (fun (b: ast_block) -> (((Base64.encode_exn (Bytes.to_string b.auuid)), + (`Assoc (List.append (match b.label with + | Some l -> [("label", `String l)] + | None -> [] + ) + [ + ("addr", `Int b.address); + ("instructions", (jsoned b.asts)) + ] + ) + )) + )) l)) with_asts) in + List.iter (fun f -> Yojson.Safe.pretty_to_channel stderr f) paired; + map (Yojson.Safe.to_string) paired in (* Sandwich ASTs into the IR amongst the other auxdata *) @@ -235,6 +295,6 @@ let () = (* Reserialise to disk *) let out = open_out_bin Sys.argv.(out_ind) in ( - Printf.fprintf out "%s" encoded; + output_string out encoded; close_out out; ) diff --git a/dune-project b/dune-project index 23c8871..a4e8e13 100644 --- a/dune-project +++ b/dune-project @@ -19,7 +19,7 @@ (name gtirb_semantics) (synopsis "Add semantic information to the IR of a disassembled ARM64 binary") (description "A longer description") - (depends ocaml dune Yojson asli ocaml-protoc-plugin hexstring base64) + (depends ocaml dune yojson asli ocaml-protoc-plugin base64 ctypes ctypes-foreign conf-llvm) (tags (decompilers instruction-lifters static-analysis))) diff --git a/gtirb_semantics.opam b/gtirb_semantics.opam index 9228eee..b3200dc 100644 --- a/gtirb_semantics.opam +++ b/gtirb_semantics.opam @@ -12,11 +12,13 @@ bug-reports: "https://github.com/UQ-PAC/gtirb-semantics/issues" depends: [ "ocaml" "dune" {>= "3.6"} - "Yojson" + "yojson" "asli" "ocaml-protoc-plugin" - "hexstring" "base64" + "ctypes" + "ctypes-foreign" + "conf-llvm" "odoc" {with-doc} ] build: [ diff --git a/lib/dune b/lib/dune index 5760f8b..58670e7 100644 --- a/lib/dune +++ b/lib/dune @@ -1,6 +1,6 @@ (library (name gtirb_semantics) - (libraries ocaml-protoc-plugin asli.libASL hexstring base64)) + (libraries ocaml-protoc-plugin asli.libASL base64)) (rule (targets AuxData.ml ByteInterval.ml CFG.ml CodeBlock.ml DataBlock.ml IR.ml Module.ml Offset.ml ProxyBlock.ml Section.ml Symbol.ml SymbolicExpression.ml) diff --git a/llvm-disas/dune b/llvm-disas/dune new file mode 100644 index 0000000..e7c737e --- /dev/null +++ b/llvm-disas/dune @@ -0,0 +1,25 @@ + +(library + (name llvm_disas) + + (libraries ctypes ctypes.foreign) + (library_flags :standard (:include llvmconf.sexp)) +) + +; use opam's detected llvm if present, otherwise fall back to 'llvm-config' +(rule + (target llvmconf.pth) + (action (with-stdout-to %{target} + (bash "if command -v opam &>/dev/null ; then opam var conf-llvm:config ; else command -v llvm-config ; fi")))) + +; https://dune.readthedocs.io/en/latest/reference/actions/index.html +(rule + (target llvmconf.sexp) + (action (with-stdout-to %{target} + (setenv conf "%{read:llvmconf.pth}" + (pipe-stdout + ; (bash "printf 'llvm-config: %q\n' $conf >&2") + (bash "printf -- '-cclib %s\n' $($conf --libs)") + (bash "cat ; printf -- '-ccopt %s\n' $($conf --ldflags) $($conf --cflags)") + (bash "echo '(' ; cat ; echo ')'")))))) + diff --git a/llvm-disas/llvm_disas.ml b/llvm-disas/llvm_disas.ml new file mode 100644 index 0000000..b53674f --- /dev/null +++ b/llvm-disas/llvm_disas.ml @@ -0,0 +1,74 @@ +open Ctypes +open Foreign + + +let llvm_initialize_all_target_infos = foreign "LLVMInitializeAArch64TargetInfo" (void @-> returning void) +let llvm_initialize_all_target_mcs = foreign "LLVMInitializeAArch64TargetMC" (void @-> returning void) +let llvm_initialize_all_disassemblers = foreign "LLVMInitializeAArch64Disassembler" (void @-> returning void) +let llvm_create_disasm = foreign "LLVMCreateDisasm" ( string @-> ptr void @-> int @-> ptr void @-> ptr void @-> returning (ptr void)) +let llvm_disasm_instruction = foreign "LLVMDisasmInstruction" (ptr void @-> ptr char @-> size_t @-> size_t @-> ptr char @-> size_t @-> returning int) + + +let () = + llvm_initialize_all_target_infos (); + llvm_initialize_all_target_mcs (); + llvm_initialize_all_disassemblers () + +let disassembler : unit ptr = + let triple = "aarch64-unknown-linux-gnu" in + let dc : unit ptr = llvm_create_disasm triple null 0 null null in + if (dc == null) then + failwith ("Error creating disassembler for " ^ triple); + dc + +let byte_list (i: int) : char list = + let b = Bytes.create 4 in + Bytes.set_int32_le b 0 (Int32.of_int i); + List.init 4 (Bytes.get b) + +let hexstring_to_opcode (s: string) = + byte_list (int_of_string s) + +let assembly_of_bytelist (opcode : char list) : string = + let oc = CArray.of_list char opcode in + let array_len = 500 in + let out = CArray.make char array_len in + let outb = llvm_disasm_instruction disassembler (CArray.start oc) (Unsigned.Size_t.of_int 4) (Unsigned.Size_t.of_int 0) (CArray.start out) (Unsigned.Size_t.of_int array_len) in + if (outb == 0) then raise (Failure "Error disassembling instruction.") else + let takeWhile (p: 'a -> bool) (ar:('a list)) : 'a list = + let rec _take (c: 'a list) (a: 'a list) = match a with + | h :: tl -> if (p h) then (_take (c @ [h]) tl) else c + | _ -> c + in + _take [] ar + in + let strout = (String.concat "" (List.map (fun c -> String.make 1 c) (takeWhile (fun c -> c != '\000') (CArray.to_list out)))) + in String.trim @@ String.map (fun f -> if f == '\t' then ' ' else f) strout + +(* Get disassembly of a little-endian aarch64 opcode as a hexstring*) +let assembly_of_hexstring (opcode : string) : string = + assembly_of_bytelist (hexstring_to_opcode opcode) + +(* Get disassembly of a little-endian aarch64 opcode as bytes*) +let assembly_of_bytes (opcode: bytes) : string = + assembly_of_bytelist (List.of_seq (Bytes.to_seq opcode)) + +(* Get disassembly of a little-endian aarch64 opcode as an int*) +let assembly_of_int (opcode : int) : string = + assembly_of_bytelist (byte_list opcode) + +let assembly_of_int_opt (opcode : int) : string option = + match assembly_of_int opcode with + | exception Failure _ -> None + | x -> Some x + +let assembly_of_hexstring_opt (opcode : string) : string option = + match assembly_of_hexstring opcode with + | exception Failure _ -> None + | x -> Some x + +let assembly_of_bytes_opt (opcode: bytes) : string option = + match assembly_of_bytelist (List.of_seq (Bytes.to_seq opcode)) with + | exception Failure _ -> None + | x -> Some x + diff --git a/llvm-disas/llvm_disas.mli b/llvm-disas/llvm_disas.mli new file mode 100644 index 0000000..dfa833f --- /dev/null +++ b/llvm-disas/llvm_disas.mli @@ -0,0 +1,10 @@ + + + +val assembly_of_bytelist : char list -> string +val assembly_of_int: int -> string +val assembly_of_bytes: bytes -> string +val assembly_of_hexstring: string -> string +val assembly_of_int_opt: int -> string option +val assembly_of_hexstring_opt: string -> string option +val assembly_of_bytes_opt: bytes -> string option