Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Instruction addrs, block labels & llvm disassembly #6

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ To build and run this you will need:
* protoc
* The following OPAM packages:
* ocaml-protoc-plugin
* hexstring
* base64

To massively simplify this, simply run ```scripts/build-all.sh``` in your preferred install directory. This script assumes a completely fresh Ubuntu 20.04.5 installation. It is advised to run this script within a fresh VM but it should work on established installations. A complete installation can take several hours and will prompt for the sudo password at least once.
Expand Down
6 changes: 5 additions & 1 deletion bin/dune
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
(executable
(public_name gtirb_semantics)
(name main)
(libraries base64 yojson gtirb_semantics asli.libASL))
(modules main)
(libraries gtirb_semantics base64 yojson asli.libASL llvm_disas)
(flags :standard (:include ../llvm-disas/llvmconf.sexp))
)

100 changes: 80 additions & 20 deletions bin/main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@ open Gtirb_semantics.Module.Gtirb.Proto
open Gtirb_semantics.Section.Gtirb.Proto
open Gtirb_semantics.CodeBlock.Gtirb.Proto
open Gtirb_semantics.AuxData.Gtirb.Proto
open Gtirb_semantics.Symbol.Gtirb.Proto
open LibASL
open Bytes
open List
open Llvm_disas

(* TYPES *)

Expand All @@ -22,12 +24,24 @@ type rectified_block = {
size : int;
}

type instruction_semantics = {
address: int;
opcode_le: string;
opcode_be: string;
readable: string option;
statementlist: string list;
pretty_statementlist: string list;
}

(* ASLi semantic info for a block *)
type ast_block = {
auuid : bytes;
asts : string list list;
label: string option;
address : int;
asts : instruction_semantics list;
}


(* Wrapper for polymorphic code/data/not-set block pre-rectification *)
type content_block = {
block : Block.t;
Expand All @@ -50,10 +64,6 @@ let usage_string = "GTIRB_FILE OUTPUT_FILE"
let ast = "ast"
(*let text = ".text"*)

(* JSON parsing/building *)
let hex = "0x"


(* MAIN *)
let () =
(* Convenience *)
Expand Down Expand Up @@ -137,6 +147,13 @@ let () =
mapmap (fun b -> {b with opcodes = cut_ops b.contents}) trimmed
in

let symmap = Hashtbl.create (List.fold_left (+) 0 (List.map (fun (m: Module.t) -> List.length m.symbols) ir.modules)) in
List.iter (fun (m: Module.t) -> (List.iter (fun (s: Symbol.t) ->
match s.optional_payload with
| `Referent_uuid b -> Hashtbl.add symmap b s
| _ -> ()
) m.symbols))
modules;
(* Convert every opcode to big endianness *)
let blk_orded : rectified_block list list =
let need_flip = map (fun (m : Module.t)
Expand All @@ -156,7 +173,7 @@ let () =
in

(* hashtable for memoising disassembly results by opcode. *)
let tbl : (bytes, string list) Hashtbl.t = Hashtbl.create 10000 in
let tbl : (bytes, ((string list) * (string list))) Hashtbl.t = Hashtbl.create 10000 in
let tbl_update k f =
match Hashtbl.find_opt tbl k with
| Some x -> x
Expand All @@ -171,21 +188,39 @@ let () =
in

(* Evaluate each instruction one by one with a new environment for each *)
let to_asli (op: bytes) (addr : int) : string list =
let p_raw a = Utils.to_string (Asl_parser_pp.pp_raw_stmt a) |> String.trim in
let address = Some (string_of_int addr) in
let str = hex ^ Hexstring.encode op in
let str_bytes = Printf.sprintf "%08lX" (Bytes.get_int32_le op 0) in
let to_asli (opcode_be: bytes) (addr : int) : instruction_semantics =
let p_raw a = Utils.to_string (Asl_parser_pp.pp_raw_stmt a) |> String.trim in
let p_pretty a = Asl_utils.pp_stmt a |> String.trim in
let p_byte (b: char) = Printf.sprintf "%02X" (Char.code b) in
let address = Some (string_of_int addr) in

(* below, opnum is the numeric opcode (necessarily BE) and opcode_* are always LE. *)
(* TODO: change argument of to_asli to follow this convention. *)
let opnum = Int32.to_int Bytes.(get_int32_be opcode_be 0) in
let opnum_str = Printf.sprintf "0x%08lx" Int32.(of_int opnum) in

let opcode_list : char list = List.(rev @@ of_seq @@ Bytes.to_seq opcode_be) in
let opcode_str = String.concat " " List.(map p_byte opcode_list) in
let opcode : bytes = Bytes.of_seq List.(to_seq opcode_list) in
let do_dis () =
(match (Dis.retrieveDisassembly ?address env (Dis.build_env env) str) with
| res -> map (fun x -> p_raw x) res
(match Dis.retrieveDisassembly ?address env (Dis.build_env env) opnum_str with
| res -> (map p_raw res, map p_pretty res)
| exception exc ->
Printf.eprintf
"error during aslp disassembly (opcode %s, bytes %s):\n\nFatal error: exception %s\n"
str str_bytes (Printexc.to_string exc);
opnum_str opcode_str (Printexc.to_string exc);
Printexc.print_backtrace stderr;
exit 1)
in tbl_update op do_dis
in
let insns_raw, insns_pretty = tbl_update opcode_be (do_dis) in
{
address = addr;
opcode_be = opnum_str;
opcode_le = opcode_str;
readable = assembly_of_bytes_opt opcode;
statementlist = insns_raw;
pretty_statementlist = insns_pretty;
}
in
let rec asts opcodes addr =
match opcodes with
Expand All @@ -195,18 +230,43 @@ let () =
let with_asts = mapmap (fun b
-> {
auuid = b.ruuid;
address = b.address;
asts = (asts b.opcodes b.address);
label = Option.map (fun (s: Symbol.t) -> s.name) (Hashtbl.find_opt symmap b.ruuid)
}) blk_orded
in

(* Massage asli outputs into a format which can
be serialised and then deserialised by other tools *)
let yojson_instsem (s: instruction_semantics) =
`Assoc (List.append (match s.readable with
| Some x -> [("assembly", `String x)]
| None -> [])
[ ("addr", `Int s.address);
("opcode_le", `String s.opcode_le); ("opcode_be", `String s.opcode_be);
("semantics", `List (List.map (fun s -> `String s) s.statementlist));
("pretty_semantics", `List (List.map (fun s -> `String s) s.pretty_statementlist));
]
)
in
let serialisable: string list =
let to_list x = `List x in
let jsoned (asts: string list list ) : Yojson.Safe.t = mapmap (fun s -> `String s) asts |> map to_list |> to_list in
let to_list x = `List x in
let jsoned (asts: instruction_semantics list ) : Yojson.Safe.t = map (fun s -> yojson_instsem s) asts |> to_list in
(*let quote bin = strung ^ (Bytes.to_string bin) ^ strung in *)
let paired: Yojson.Safe.t list = (map (fun l -> `Assoc (map (fun b -> (((Base64.encode_exn (Bytes.to_string b.auuid))), (jsoned b.asts))) l)) with_asts) in
map (fun j -> Yojson.Safe.to_string j) paired
let paired: Yojson.Safe.t list = (map (fun l -> `Assoc (map (fun (b: ast_block) -> (((Base64.encode_exn (Bytes.to_string b.auuid)),
(`Assoc (List.append (match b.label with
| Some l -> [("label", `String l)]
| None -> []
)
[
("addr", `Int b.address);
("instructions", (jsoned b.asts))
]
)
))
)) l)) with_asts) in
List.iter (fun f -> Yojson.Safe.pretty_to_channel stderr f) paired;
map (Yojson.Safe.to_string) paired
in

(* Sandwich ASTs into the IR amongst the other auxdata *)
Expand Down Expand Up @@ -235,6 +295,6 @@ let () =
(* Reserialise to disk *)
let out = open_out_bin Sys.argv.(out_ind) in
(
Printf.fprintf out "%s" encoded;
output_string out encoded;
close_out out;
)
2 changes: 1 addition & 1 deletion dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
(name gtirb_semantics)
(synopsis "Add semantic information to the IR of a disassembled ARM64 binary")
(description "A longer description")
(depends ocaml dune Yojson asli ocaml-protoc-plugin hexstring base64)
(depends ocaml dune yojson asli ocaml-protoc-plugin base64 ctypes ctypes-foreign conf-llvm)
(tags
(decompilers instruction-lifters static-analysis)))

Expand Down
6 changes: 4 additions & 2 deletions gtirb_semantics.opam
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ bug-reports: "https://github.com/UQ-PAC/gtirb-semantics/issues"
depends: [
"ocaml"
"dune" {>= "3.6"}
"Yojson"
"yojson"
"asli"
"ocaml-protoc-plugin"
"hexstring"
"base64"
"ctypes"
"ctypes-foreign"
"conf-llvm"
"odoc" {with-doc}
]
build: [
Expand Down
2 changes: 1 addition & 1 deletion lib/dune
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(library
(name gtirb_semantics)
(libraries ocaml-protoc-plugin asli.libASL hexstring base64))
(libraries ocaml-protoc-plugin asli.libASL base64))

(rule
(targets AuxData.ml ByteInterval.ml CFG.ml CodeBlock.ml DataBlock.ml IR.ml Module.ml Offset.ml ProxyBlock.ml Section.ml Symbol.ml SymbolicExpression.ml)
Expand Down
25 changes: 25 additions & 0 deletions llvm-disas/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

(library
(name llvm_disas)

(libraries ctypes ctypes.foreign)
(library_flags :standard (:include llvmconf.sexp))
)

; use opam's detected llvm if present, otherwise fall back to 'llvm-config'
(rule
(target llvmconf.pth)
(action (with-stdout-to %{target}
(bash "if command -v opam &>/dev/null ; then opam var conf-llvm:config ; else command -v llvm-config ; fi"))))

; https://dune.readthedocs.io/en/latest/reference/actions/index.html
(rule
(target llvmconf.sexp)
(action (with-stdout-to %{target}
(setenv conf "%{read:llvmconf.pth}"
(pipe-stdout
; (bash "printf 'llvm-config: %q\n' $conf >&2")
(bash "printf -- '-cclib %s\n' $($conf --libs)")
(bash "cat ; printf -- '-ccopt %s\n' $($conf --ldflags) $($conf --cflags)")
(bash "echo '(' ; cat ; echo ')'"))))))

74 changes: 74 additions & 0 deletions llvm-disas/llvm_disas.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
open Ctypes
open Foreign


let llvm_initialize_all_target_infos = foreign "LLVMInitializeAArch64TargetInfo" (void @-> returning void)
let llvm_initialize_all_target_mcs = foreign "LLVMInitializeAArch64TargetMC" (void @-> returning void)
let llvm_initialize_all_disassemblers = foreign "LLVMInitializeAArch64Disassembler" (void @-> returning void)
let llvm_create_disasm = foreign "LLVMCreateDisasm" ( string @-> ptr void @-> int @-> ptr void @-> ptr void @-> returning (ptr void))
let llvm_disasm_instruction = foreign "LLVMDisasmInstruction" (ptr void @-> ptr char @-> size_t @-> size_t @-> ptr char @-> size_t @-> returning int)


let () =
llvm_initialize_all_target_infos ();
llvm_initialize_all_target_mcs ();
llvm_initialize_all_disassemblers ()

let disassembler : unit ptr =
let triple = "aarch64-unknown-linux-gnu" in
let dc : unit ptr = llvm_create_disasm triple null 0 null null in
if (dc == null) then
failwith ("Error creating disassembler for " ^ triple);
dc

let byte_list (i: int) : char list =
let b = Bytes.create 4 in
Bytes.set_int32_le b 0 (Int32.of_int i);
List.init 4 (Bytes.get b)

let hexstring_to_opcode (s: string) =
byte_list (int_of_string s)

let assembly_of_bytelist (opcode : char list) : string =
let oc = CArray.of_list char opcode in
let array_len = 500 in
let out = CArray.make char array_len in
let outb = llvm_disasm_instruction disassembler (CArray.start oc) (Unsigned.Size_t.of_int 4) (Unsigned.Size_t.of_int 0) (CArray.start out) (Unsigned.Size_t.of_int array_len) in
if (outb == 0) then raise (Failure "Error disassembling instruction.") else
let takeWhile (p: 'a -> bool) (ar:('a list)) : 'a list =
let rec _take (c: 'a list) (a: 'a list) = match a with
| h :: tl -> if (p h) then (_take (c @ [h]) tl) else c
| _ -> c
in
_take [] ar
in
let strout = (String.concat "" (List.map (fun c -> String.make 1 c) (takeWhile (fun c -> c != '\000') (CArray.to_list out))))
in String.trim @@ String.map (fun f -> if f == '\t' then ' ' else f) strout

(* Get disassembly of a little-endian aarch64 opcode as a hexstring*)
let assembly_of_hexstring (opcode : string) : string =
assembly_of_bytelist (hexstring_to_opcode opcode)

(* Get disassembly of a little-endian aarch64 opcode as bytes*)
let assembly_of_bytes (opcode: bytes) : string =
assembly_of_bytelist (List.of_seq (Bytes.to_seq opcode))

(* Get disassembly of a little-endian aarch64 opcode as an int*)
let assembly_of_int (opcode : int) : string =
assembly_of_bytelist (byte_list opcode)

let assembly_of_int_opt (opcode : int) : string option =
match assembly_of_int opcode with
| exception Failure _ -> None
| x -> Some x

let assembly_of_hexstring_opt (opcode : string) : string option =
match assembly_of_hexstring opcode with
| exception Failure _ -> None
| x -> Some x

let assembly_of_bytes_opt (opcode: bytes) : string option =
match assembly_of_bytelist (List.of_seq (Bytes.to_seq opcode)) with
| exception Failure _ -> None
| x -> Some x

10 changes: 10 additions & 0 deletions llvm-disas/llvm_disas.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@



val assembly_of_bytelist : char list -> string
val assembly_of_int: int -> string
val assembly_of_bytes: bytes -> string
val assembly_of_hexstring: string -> string
val assembly_of_int_opt: int -> string option
val assembly_of_hexstring_opt: string -> string option
val assembly_of_bytes_opt: bytes -> string option