-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathutils_sig.py
executable file
·1139 lines (1075 loc) · 47.5 KB
/
utils_sig.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python
import angr,claripy
import sys,os
import logging
import copy,re
import networkx
from networkx.algorithms import approximation
import itertools
#logging.basicConfig(level=logging.DEBUG) # adjust to the wanted debug level
dbg_out = True
def test_loader(b,base):
print(b.loader.min_addr)
print(b.loader.max_addr)
print(b.loader.memory[base+0x38])
print(b.arch)
blk = b.factory.block(addr=base+0xc440c4).capstone
for ins in blk.insns:
print(ins)
irsb = b.factory.block(addr=base+0xc440c4).vex
irsb.pp()
for stmt_idx, stmt in enumerate(irsb.statements):
print(stmt_idx)
if hasattr(stmt,'expressions'):
for expr in stmt.expressions:
print(expr.tag)
print(expr)
def load_kernel_image(path,arch,base,segments=None):
load_options = {}
load_options['auto_load_libs'] = False
load_options['main_opts'] = {'backend': 'blob', 'custom_arch': arch, 'custom_base_addr': base, 'segments':segments}
#Use loader.provide_symbol() or loader.provide_symbol_batch() to import symbol table.
#-----------------------------------------------------------------------------------
#def provide_symbol(self, owner, name, offset, size=0, sym_type=None):
# return self.provide_symbol_batch(owner, {name: (offset, size, sym_type)})
#-----------------------------------------------------------------------------------
#Usage: owner --> the Backend object, we can use loader.main_bin
# offset --> the offset relative to 0, not actual kernel load address
# sym_type --> https://github.com/angr/cle/blob/master/cle/backends/__init__.py#L148
b = angr.Project(path, load_options=load_options, arch=arch)
#test_loader(b,base)
return b;
re_reg = 'reg_[\da-f]+_[\d]+_[\d]+'
re_mem = 'mem_[\da-f]+_[\d]+_[\d]+'
re_ret = 'fake_ret_value_[\d]+_[\d]+'
translate_arch = None
#We assume that the reg str looks like "reg_[offset Hex num]_N_[size in bits]"
def translate_reg_name(reg):
#Since this can be called from re.sub(), the parameter is a 'SRE_Match' object, thus we should convert it to string if necessary.
if reg is not str:
reg = reg.group(0)
global translate_arch
if translate_arch == None:
print('translate_reg_name: set the translate_arch at first!')
return reg
if not reg.startswith('reg'):
return reg
tokens = reg.split('_')
if len(tokens) != 4:
return reg
offset = int(tokens[1],base = 16)
size = int(tokens[3])/8
return translate_arch.translate_register_name(offset = offset, size = size) + '_' + tokens[2]
#Replace the reg name in the string with a human-readable form
def make_reg_readable(arch,s):
global translate_arch
translate_arch = arch
return re.sub(re_reg,translate_reg_name,s)
#Given a instruction address, this function will identify which registers/mems are read/write.
#This is done by parsing the generated VEX statements, which should work for all archs.
#Param: addr is the start of a block, size is the block size, if ins_list is provided, only the instructions on the list will be analyzed.
def get_ins_ops_info(proj,addr,size=None,ins_list=None, opt_level=0):
irsb = proj.factory.block(addr,size=size,opt_level=opt_level).vex
#info: addr --> {key:val}
info = {}
index = {}
ss = irsb.statements
for i in range(len(ss)):
if ss[i].tag == 'Ist_IMark':
index[ss[i].addr] = i
ins_list = list(index.keys()) if ins_list is None else [x for x in ins_list if x in index]
#NOTE: this list is arch-dependent
#x30 is link register
#ignore_regs = ['ip','pc','sp','xsp','x30']
ignore_regs = []
#TODO: it seems that VEX doesn't differentiate X0 and W0? But what if this matters?
reg_name = lambda off: proj.arch.translate_register_name(offset=off)
for addr in ins_list:
info[addr] = {}
#Skip the initial IMark
for i in range(index[addr]+1,len(ss)):
stmt = ss[i]
if stmt.tag == 'Ist_Put':
#Reg write
reg = reg_name(stmt.offset)
if not reg in ignore_regs:
info[addr].setdefault('put',set()).add(reg)
#Do some recording heres if it's a 'put_ip' stmt.
#TODO: Add register names if you want to support different archs.
if reg in ('ip','pc'):
e = stmt.expressions
if len(e) == 1 and e[0].tag == 'Iex_Const':
if 'put_ip' in info[addr]:
print('[???]Instruction @ %x has multiple put_ip stmts..' % addr)
info[addr]['put_ip'] = e[0].con.value
elif stmt.tag == 'Ist_Store':
#Mem write
#We only indicate that there is a mem write in this instruction, actual data and addr will be decided in runtime.
info[addr]['store'] = True
elif stmt.tag == 'Ist_Exit':
info[addr]['exit'] = True
elif stmt.tag == 'Ist_IMark':
break
#NOTE: Reg read is special because it's not a statement, but an expression with tag 'Iex_Get'.
#Similarly, mem read is 'Iex_Load' So to capture then, we need to iterate the expressions.
for exp in stmt.expressions:
if exp.tag == 'Iex_Get':
#Reg read
reg = reg_name(exp.offset)
if not reg in ignore_regs:
info[addr].setdefault('get',set()).add(reg)
elif exp.tag == 'Iex_Load':
#Mem read
#Similar to mem write, we only indicate that there is a mem read.
info[addr]['load'] = True
return (info,irsb.jumpkind)
def print_vex_blocks(proj,addrs,opt_level=0):
for addr in addrs:
print('[VEX] ' + hex(addr))
irsb = proj.factory.block(addr,opt_level=opt_level).vex
#irsb.pp()
for stmt_idx, stmt in enumerate(irsb.statements):
print('[' + str(stmt_idx) + '] ' + str(stmt) + ' -- ' + stmt.tag + ' -- ' + type(stmt).__name__)
def get_cfg_acc(proj,start,end):
return proj.analyses.CFGAccurate(context_sensitivity_level=0,starts=[start],call_depth=0,normalize=True)
#Get the CFGFast of the function starting from 'start' and ending at 'end'
def get_cfg_fast(proj,start,end):
# The regions in the image that the CFG should cover
regions = [(start,end)]
# Specify the start addresses of functions need to be analyzed
function_starts = [start]
# It seems that 'regions' and 'start_at_entry' parameters are not supported in this version?
return proj.analyses.CFGFast(symbols=False,regions=regions,start_at_entry=False,function_prologues=False,function_starts=function_starts)
#Get an element from a set
def pick_one(s):
for e in s:
break
return e
#'his' is a sequence of basic block addrs, this function judge whether 'state' has gone through the same history.
def old_days(state,his):
so_far = [x for x in state.history.bbl_addrs]
return str(so_far)[1:-1].startswith(str(his)[1:-1])
#Use DFS to get all the paths from start to target.
#TODO: To deal with paths with loops.
def get_path(cfg,path,target,visited=set()):
n = path[-1]
if n.addr == target.addr:
return path
if n.addr in visited:
return
visited.add(n.addr)
for x in cfg.successors(n):
if x.addr not in visited:
new_path = copy.deepcopy(path)
new_path.append(x)
get_path(cfg,new_path,target,visited)
#The cfg is a DiGraph
def get_all_preds_raw(cfg,n,preds):
preds.add(n)
for x in cfg.predecessors(n):
if x not in preds:
get_all_preds_raw(cfg,x,preds)
#The cfg is a DiGraph
def get_all_preds(cfg,n,preds):
preds.add(n.addr)
for x in cfg.predecessors(n):
if x.addr not in preds:
get_all_preds(cfg,x,preds)
#Given a subgraph instead of a single node, return all predecessors of this subgraph.
def get_all_preds4graph(cfg,g):
preds = set()
en = set([x.addr for x in g.nodes() if g.in_degree(x) == 0])
en = set([get_node_by_addr(cfg,x) for x in en])
for n in en:
t = set()
get_all_preds(cfg,n,t)
preds = preds.union(t)
return preds
#The cfg is a DiGraph
def get_all_succs(cfg,n,succs):
succs.add(n.addr)
for x in cfg.successors(n):
if x.addr not in succs:
get_all_succs(cfg,x,succs)
#Given a subgraph instead of a single node, return all successors of this subgraph.
def get_all_succs4graph(cfg,g):
succs = set()
en = set([x.addr for x in g.nodes() if g.out_degree(x) == 0])
en = set([get_node_by_addr(cfg,x) for x in en])
for n in en:
t = set()
get_all_succs(cfg,n,t)
succs = succs.union(t)
return succs
#Given a CFGAcc or CFGFast and a function start address, return the function CFG which is a DiGraph.
#'simplify' means whether we should optimize the CFG topology to eliminate all absolute control transfer nodes (eg. b,jmp)
def get_func_cfg(cfg,start,normalize=True,sym_tab=None,proj=None,simplify=False):
func = cfg.functions[start]
if func is None:
print('No function at %x' % start)
return None
if normalize:
func.normalize()
#Different from the 'cfg', func_cfg is a networkx.DiGraph.
g = copy.deepcopy(func.transition_graph)
#Do some extra checks
exception_nodes = [hex(x.addr) for x in g.nodes() if x.size is None]
if len(exception_nodes) > 0:
print('[FUNC_CFG] Plz check following nodes, they may contain un-decodable instructions: ' + str(exception_nodes))
#Issue 0: Sometimes this graph will include size 0 node, which represent functions that are called by current function.
#We don't need them actually.
g.remove_nodes_from([x for x in g.nodes() if x.size == 0 or x.size is None])
#Issue 1: This is a hacking, sometimes a function may call a non-return function in the end (e.g. __stack_chk_fail),
#we must ensure that callee node will not have any successors..
#*But* we need Angr proj and sym_table to enable the detection and correction below.
non_ret_funcs = ('__stack_chk_fail')
non_rets = []
if sym_tab and proj:
for n in g.nodes():
irsb = proj.factory.block(n.addr,size=n.size,opt_level=0).vex
if irsb.jumpkind == 'Ijk_Call':
name = get_exit_func_name(proj,irsb,sym_tab)
if name and name in non_ret_funcs:
non_rets += [n]
if non_rets:
g.remove_edges_from(g.out_edges(non_rets))
#If specified, optimize out absolute jump nodes.
if simplify:
#Note that here 'g' is already a deepcopy of original 'transition_graph' in cfg_acc
#So modify 'g' will not affect original cfg_acc.
simplify_cfg_aarch64(proj,g)
return g
def simplify_cfg_aarch64(proj,g):
if not proj:
return g
def _is_abs_jmp_node(n):
if not n.size or n.size != 4:
return False
succs = g.successors(n)
if succs and len(succs) > 1:
return False
cap = proj.factory.block(n.addr,size=4).capstone
opc = cap.insns[0].insn.mnemonic
if opc in ('b','B','prfm'):
return True
return False
absn = [n for n in g.nodes() if _is_abs_jmp_node(n)]
for n in absn:
preds = g.predecessors(n)
succs = g.successors(n)
g.remove_node(n)
if preds and succs:
for p in preds:
g.add_edge(p,succs[0])
return g
#The cfg is a DiGraph.
def get_node_by_addr(cfg,addr,any_addr=False):
nodes = [ x for x in cfg.nodes() if x.addr==addr and x.size is not None]
if nodes:
return sorted(nodes,key=lambda x:x.size)[0]
if any_addr:
nodes = [ x for x in cfg.nodes() if x.size is not None and x.addr <= addr and addr < x.addr + x.size ]
if nodes:
return sorted(nodes,key=lambda x:x.size)[0]
return None
#Below are some signature-related stuff.
def _calc_next_instruction_addr(proj,addr):
if proj.arch.name == 'AARCH64':
return addr + 4
else:
print('Unsupported arch: ' + proj.arch.name)
return addr + 4
#The main purpose is to set up the initial 'formulas' map for the given node in the signature.
#The main task is to identify the address and involved registers for each root instruction in the node.
#Param: g --> sig graph n --> node
def init_sig_node_formulas(proj,g,n,sym_tab=None):
g.node[n]['formulas'] = {}
formulas = g.node[n]['formulas']
(ins_info,jumpkind) = get_ins_ops_info(proj,n.addr,n.size,list(g.node[n]['root_ins']))
g.node[n]['jumpkind'] = jumpkind
#'x30' is link register for 'bl'
ignore_regs = ['ip','pc','sp','xsp','x30','cc_op','cc_ndep','cc_dep1','cc_dep2']
stack_regs = ['sp','xsp']
for addr in ins_info:
if 'exit' in ins_info[addr]:
#An exit instruction.
#'a-g-k' is a addr-guard-jumpkind tuple list.
formulas[addr] = {'type':'exit','a-g-k':[]}
elif 'put_ip' in ins_info[addr] and ins_info[addr]['put_ip'] != _calc_next_instruction_addr(proj,addr):
#We want to capture a special case here.
#The 'exit' type is from 'Ist_Exit' stmt, which usually indicates a conditional jump, but what about unconditional jump?
#Instructions like 'bl' 'b' usually don't have 'Ist_Exit' stmt, what they do is simply set 'ip' register to target address,
#but other normal instructions will also set 'ip' to the next instruction, so here we use a heuristic: if an instruction
#will set 'ip' to somewhere that is not its next instruction (but itself is not of 'exit' type), then it should be an unconditional jump.
formulas[addr] = {'type':'exit','a-g-k':[]}
#Parse the function name if possible.
if sym_tab is not None:
e = sym_tab.lookup(ins_info[addr]['put_ip'])
if e is not None:
g.node[n]['exit_func_name'] = e[1]
formulas[addr]['func_name'] = e[1]
elif 'store' in ins_info[addr]:
#I want to put a special filter here: for the instructions that store sth to stack variables, I want to exclude them from root instructions.
#Because they are very likely to be retrieved later, that's to say in theory they should have out edges in DFG but unfortunately current DFG
#analysis cannot recognize dependencies related to memory location.
#TODO: may consider to improve the DFG analysis to make it able to deal with simple memory dependencies.
if 'get' in ins_info[addr] and any(reg in ins_info[addr]['get'] for reg in stack_regs):
g.node[n]['root_ins'].remove(addr)
g.graph['root_ins'].remove(addr)
continue
#This is a mem store instruction
#a-d-l: addr-data-length tuple list
formulas[addr] = {'type':'store','a-d-l':[]}
elif 'load' in ins_info[addr]:
#This is a mem load instruction
formulas[addr] = {'type':'load'}
#For mem load instruction we only need to care about the regs that are written.
if 'put' in ins_info[addr]:
for reg in ins_info[addr]['put']:
if reg not in ignore_regs:
formulas[addr][reg] = []
else:
#This instruction has nothing to do with mem, it should be a data transfer between regs.
formulas[addr] = {'type':'other'}
if 'put' in ins_info[addr]:
for reg in ins_info[addr]['put']:
if reg not in ignore_regs:
formulas[addr][reg] = []
return
#A signature is basically an attributed CFG, we use DiGraph to implement this, which support attributes for nodes/edges.
#Params: cfg is a DiGraph with normal BBs as nodes.
def init_signature(proj,cfg,addrs,insns=None,pos=None,negs=None,sym_tab=None):
#print [hex(x) for x in addrs]
nodes = [get_node_by_addr(cfg,x) for x in addrs]
#Get in_degree and out_degree in original CFG.
ind = {}
outd = {}
for n in nodes:
ind[n.addr] = cfg.in_degree(n)
outd[n.addr] = cfg.out_degree(n)
#Get subgraph
og = cfg.subgraph(nodes)
og = copy.deepcopy(og)
sigs = []
#Get the weakly connected subgraphs
for g in networkx.weakly_connected_component_subgraphs(og):
#Get the DFG for these nodes
dfgs = get_dfg(proj,[ x for x in g.nodes() if x.size != 0])
#This holds all root instruction addrs of the whole signature graph.
g.graph['root_ins'] = set()
#Set attributes of the nodes in the subgraph
void_node = []
for n in g.nodes():
#The node is of the type 'angr.knowledge.codenode.BlockNode'
#print '---------------------------'
#print '%x,%x' % (n.addr,n.size)
if n.size == 0:
#This should be the start of another function which is called in current function.
#TODO: what information to add here? Maybe the function name?
continue
#The DFG is on the VEx statements (opt_level=0) level
#TODO: do we really need to store DFG for each node?
if n.addr not in dfgs or dfgs[n.addr] is None:
print('No DFG generated for (%x,%x)' % (n.addr,n.size))
g.node[n]['dfg'] = None
#This node should contain only one control-transfer instruction.
root_ins = set([n.addr])
else:
g.node[n]['dfg'] = dfgs[n.addr]
#Identify the root instructions for each node according to its DFG.
root_ins_d = get_root_ins_addr(dfgs[n.addr])
#We have two filter modes: pos/neg markers based and accurate instruction list based.
#If we have DWARF debug information, we should always use the accurate instruction list based filter, which is simply better.
root_ins = set(root_ins_d.keys())
if insns is not None:
root_ins = set(flt_root_ins_acc(g,n,root_ins_d,_get_in_node_addr(n,insns)))
elif pos is not None or negs is not None:
root_ins = set(flt_root_ins_marker(g,n,root_ins,_get_in_node_addr(n,pos),_get_in_node_addr(n,negs)))
if not root_ins:
void_node.append(n)
continue
g.node[n]['root_ins'] = root_ins
g.graph['root_ins'] = g.graph['root_ins'].union(g.node[n]['root_ins'])
#Original code block
#TODO: should we set opt_level = 0 here?
g.node[n]['block'] = proj.factory.block(n.addr,size=n.size)
#Record the in_degree and out_degree in original CFG
g.node[n]['in_d'] = ind[n.addr]
g.node[n]['out_d'] = outd[n.addr]
#Entry format of the formulas dict: addr --> (reg_name,formula)
init_sig_node_formulas(proj,g,n,sym_tab=sym_tab)
for n in void_node:
if ok_to_remove_node(g,n):
g.remove_node(n)
else:
set_padding_node(g,n)
sigs.append(g)
#If the identified patch areas are not within a same sub-graph, we need to organize them into one.
if len(sigs) > 1:
print('Need to combine scattered instructions.')
sigs = combine_subsigs(cfg,sigs)
return sigs
#Return true if removing node n from g doesn't break its connectivity.
#TODO: Any simpler way to decide this?
def ok_to_remove_node(g,n):
g0 = copy.deepcopy(g)
n0 = get_node_by_addr(g0,n.addr)
g0.remove_node(n0)
return len([x for x in networkx.weakly_connected_component_subgraphs(g0)]) <= 1
def combine_subsigs(cfg,sigs):
if not sigs or len(sigs) <= 1 or not cfg:
return sigs
#addrs of all non-padding nodes
addrs = {}
for g in sigs:
for n in g.nodes():
addrs[n.addr] = g.node[n]
#Start to merge the sub-sigs pair by pair.
sg = sigs[0]
for i in range(1,len(sigs)):
sg = _combine_subsigs(cfg,sg,sigs[i])
#TODO: Do we need to develop some heuristics (eg. give up the combined sig if it contains too many padding nodes.)?
sg = copy.deepcopy(sg)
for n in sg.nodes():
if n.addr in addrs:
sg.node[n] = addrs[n.addr]
else:
set_padding_node(sg,n)
sg.graph['root_ins'] = set()
for g in sigs:
sg.graph['root_ins'] = sg.graph['root_ins'].union(g.graph['root_ins'])
return [sg]
#Given a DiGraph and two separate sub-graphs, merge them together.
def _combine_subsigs(cfg,g0,g1):
if not len(g0) or not len(g1):
return g0 if not len(g1) else g1
o_cfg = cfg
#get_cfg_wo_loops() will return a copy of original graph without modifying it.
cfg = get_cfg_wo_loops(o_cfg)
g0 = get_cfg_wo_loops(g0)
g1 = get_cfg_wo_loops(g1)
a0 = set([x.addr for x in g0.nodes()])
a1 = set([x.addr for x in g1.nodes()])
if a0.intersection(a1):
#No padding nodes are needed.
t = a0.union(a1)
nodes = [get_node_by_addr(o_cfg,x) for x in t]
sg = o_cfg.subgraph(nodes)
gs = [g for g in networkx.weakly_connected_component_subgraphs(sg)]
if len(gs) > 1:
#How can this be possible?
print('!!!!Fail to merge two signatures with common nodes??')
return sorted(gs,key=lambda x:len(x))[-1]
p0 = get_all_preds4graph(cfg,g0)
p1 = get_all_preds4graph(cfg,g1)
s0 = get_all_succs4graph(cfg,g0)
s1 = get_all_succs4graph(cfg,g1)
if p0.intersection(a1):
t = p0.intersection(s1)
gs = cluster_padding_nodes(cfg,t)
t = set([x.addr for x in gs[0].nodes()]) if len(gs) > 0 else set([])
elif p1.intersection(a0):
t = p1.intersection(s0)
gs = cluster_padding_nodes(cfg,t)
t = set([x.addr for x in gs[0].nodes()]) if len(gs) > 0 else set([])
else:
#In this case, consider both common predecessor and successor, choose the smaller resulting signature.
#(1)Predecessor
tp = p0.intersection(p1)
if not tp:
print('!!!Null common predecessors')
print(len(g0))
print(len(g1))
print('g0 ' + hex_array_sorted([n.addr for n in g0.nodes()]))
print('a0 ' + hex_array_sorted(a0))
print('p0 ' + hex_array_sorted(p0))
print('g1 ' + hex_array_sorted([n.addr for n in g1.nodes()]))
print('a1 ' + hex_array_sorted(a1))
print('p1 ' + hex_array_sorted(p1))
gs = cluster_padding_nodes(cfg,tp)
en = set()
for g in gs:
en = en.union(set([x.addr for x in g.nodes() if g.out_degree(x) == 0]))
en = [get_node_by_addr(cfg,x,any_addr=True) for x in en]
#Choose a preceding node that minimize the number of padding nodes.
def _calc_common_pred_path(n):
succs = set()
get_all_succs(cfg,n,succs)
pad0 = succs.intersection(p0)
pad1 = succs.intersection(p1)
return pad0.union(pad1)
en = [(x,_calc_common_pred_path(x)) for x in list(en)]
en = sorted(en,key=lambda x:len(x[1]))
#(2)Successor
ts = s0.intersection(s1)
if ts:
gs = cluster_padding_nodes(cfg,ts)
sn = set()
for g in gs:
sn = sn.union(set([x.addr for x in g.nodes() if g.in_degree(x) == 0]))
sn = [get_node_by_addr(cfg,x,any_addr=True) for x in sn]
#Choose a succeding node that minimize the number of padding nodes.
def _calc_common_succ_path(n):
preds = set()
get_all_succs(cfg,n,preds)
pad0 = preds.intersection(s0)
pad1 = preds.intersection(s1)
return pad0.union(pad1)
sn = [(x,_calc_common_succ_path(x)) for x in list(sn)]
sn = sorted(sn,key=lambda x:len(x[1]))
else:
sn = []
if not sn and not en:
#Something very unusual must happen
print('Fail to combine two sub-CFG, possibly there are unrecognized instructions in the function...')
t = set()
else:
t = en[0][1] if not sn or len(en[0][1]) <= len(sn[0][1]) else sn[0][1]
t = t.union(a0).union(a1)
nodes = [get_node_by_addr(o_cfg,x) for x in t]
sg = o_cfg.subgraph(nodes)
gs = [g for g in networkx.weakly_connected_component_subgraphs(sg)]
if len(gs) > 1:
#This should be impossible...
print('!!!After merging sub-sigs, there are still multiple ones..')
else:
print('------Sig w/ padding nodes: ' + str([hex(x.addr) for x in gs[0].nodes()]))
return gs[0]
def cluster_padding_nodes(cfg,addrs):
nodes = [get_node_by_addr(cfg,x) for x in addrs]
sg = cfg.subgraph(nodes)
gs = [g for g in networkx.weakly_connected_component_subgraphs(sg)]
return sorted(gs,key=lambda x:len(x))
def combine_subsigs_steiner(cfg,sigs):
if not sigs or len(sigs) <= 1 or not cfg:
return sigs
#addrs of all non-padding nodes
addrs = {}
for g in sigs:
for n in g.nodes():
addrs[n.addr] = g.node[n]
#steiner tree problem is in general NP-hard, here we use the approximation algorithm in the networkx package.
u_cfg = cfg.to_undirected()
nodes = [get_node_by_addr(u_cfg,x) for x in addrs]
sg = approximation.steiner_tree(u_cfg,nodes)
c_addrs = [x.addr for x in sg.nodes()]
nodes = [get_node_by_addr(cfg,x) for x in c_addrs]
sg = cfg.subgraph(nodes)
sg = copy.deepcopy(sg)
for n in sg.nodes():
if n.addr in addrs:
sg.node[n] = addrs[n.addr]
else:
set_padding_node(sg,n)
sg.graph['root_ins'] = set()
for g in sigs:
sg.graph['root_ins'] = sg.graph['root_ins'].union(g.graph['root_ins'])
return [sg]
def set_padding_node(cfg,n):
cfg.node[n] = {'padding':1,'root_ins':set()}
#insns_d: a dict of root_ins_addr --> rooted_ins_addr set
#acc_list: accurate instruction list that composes the signature.
def flt_root_ins_acc(g,n,insns_d,acc_list,include_last=False,root_in_acc=False):
#(1)First pick up all root instructions from insns_d which are also within acc_list.
root_ins = set(insns_d.keys())
root_ins = root_ins.intersection(set(acc_list))
#(2)See whether all instructions in acc_list have been rooted by root_ins picked in (1), then add those un-rooted instructions to root_ins if root_in_acc is set.
#Else just add enough root_ins until all instructions in acc_list are rooted.
#Make a special optimization here: if there is no root instruction in acc_list and current node has no successors in the sig graph, then just return null.
#Because in this situation, it's very likely that addr2line has added some unnecessary instructions.
if not root_ins and g.out_degree(n) == 0 and g.number_of_nodes() > 1:
return set()
rooted_ins = set()
for r in root_ins:
rooted_ins = rooted_ins.union(insns_d[r])
remain_ins = set(acc_list) - rooted_ins
if root_in_acc:
root_ins = root_ins.union(remain_ins)
else:
cand_root_ins = set(insns_d.keys()) - root_ins
for c in cand_root_ins:
if not remain_ins:
break
if insns_d[c].intersection(remain_ins):
root_ins.add(c)
remain_ins = remain_ins - insns_d[c]
#Then we have a complementary rule: add the 'exit' instruction as root_ins if necessary.
if include_last:
in_deg = g.in_degree(n)
out_deg = g.out_degree(n)
last = n.addr + n.size - 4
if g.number_of_nodes() <= 1 or out_deg > 0:
root_ins.add(last)
return root_ins
def flt_root_ins_marker(g,n,insns,pos,neg):
_in_node = lambda n,m:m>=n.addr and m<n.addr+n.size
_is_null = lambda x:x is None or not x
_flt_pos = lambda ins,m:[x for x in ins if x>=m]
_flt_neg = lambda ins,m:[x for x in ins if x<=m]
in_deg = g.in_degree(n)
out_deg = g.out_degree(n)
if _is_null(pos):
if _is_null(neg):
#Full node is in the signature, no filtering.
return insns
else:
#First few instructions are of interest.
neg = max(neg)
f_insns = _flt_neg(insns,neg)
if _is_null(f_insns):
#It seems that the marked instructions are not root ones, just return pre-filtering insns.
return insns
else:
#Maybe we still need to include the 'exit' in the end..
if out_deg > 0 or g.number_of_nodes() <= 1:
last = n.addr+n.size-4
return set(f_insns).union(set([last])) if last in insns else f_insns
else:
return f_insns
else:
pos = min(pos)
if _is_null(neg):
#Last few instructions are of interest.
f_insns = _flt_pos(insns,pos)
if _is_null(f_insns):
#This should be impossible
print('!!!No root instructions after pos marker: ' + hex(pos))
return insns
return f_insns
else:
#Hang in the middle.
neg = max(neg)
fp_insns = _flt_pos(insns,pos)
fn_insns = _flt_neg(insns,neg)
fpn_insns = _flt_neg(fp_insns,neg)
if _is_null(fpn_insns):
if out_deg == 0 and g.number_of_nodes() > 1 and not _is_null(fn_insns):
return fn_insns
elif in_deg == 0 and not _is_null(fp_insns):
return fp_insns
else:
return insns
else:
if out_deg == 0 and g.number_of_nodes() > 1:
#No successors
return fpn_insns
else:
#Have successors, consider 'exit' instruction.
last = n.addr+n.size-4
return set(fpn_insns).union(set([last])) if last in insns else fpn_insns
_get_in_node_addr = lambda n,ms:[] if ms is None else [x for x in list(ms) if x>=n.addr and x<n.addr+n.size]
def show_signature(sig):
print('---------------------Signature-----------------------')
print('---------------------Graph Information----------------------')
if 'sig_name' in sig.graph:
print('[Sig Name] ' + sig.graph['sig_name'])
if 'func_name' in sig.graph:
print('[Function Name] ' + sig.graph['func_name'])
if 'options' in sig.graph:
print('[options]')
print(sig.graph['options'])
print('[Root Instructions]')
print([hex(x) for x in sig.graph.get('root_ins',[])])
print('---------------------Node Information----------------------')
for node in sig.nodes():
print('>>>>>[%x,%x]<<<<<' % (node.addr,node.size))
print('succs: ' + str([hex(x.addr) for x in sig.successors(node)]))
print('preds: ' + str([hex(x.addr) for x in sig.predecessors(node)]))
if 'padding' in sig.node[node]:
print('~~PADDING~~')
continue
print('jumpkind: ' + sig.node[node]['jumpkind'])
if 'exit_func_name' in sig.node[node]:
print('func: ' + sig.node[node]['exit_func_name'])
if 'fmt_str' in sig.node[node]:
print('fmt_str: ' + sig.node[node]['fmt_str'])
print('root_ins: ' + str([hex(x) for x in sig.node[node]['root_ins']]))
print('formulas:')
for addr in sig.node[node]['formulas']:
inf = sig.node[node]['formulas'][addr]
print('###[%s] %x' % (inf['type'],addr))
if 'a-d-l' in inf:
adl = inf['a-d-l']
for (a,d,l) in adl:
print('[A] %s [D] %s [L] %s' % _sig_get_str((a,d,l)))
elif 'a-g-k' in inf:
agk = inf['a-g-k']
for (a,g,k) in agk:
print('[A] %s [G] %s [K] %s' % _sig_get_str((a,g,k)))
if 'func_name' in inf:
print('[F] %s' % inf['func_name'])
else:
for k in inf:
if k in ('type') or not isinstance(inf[k],list):
continue
print('[' + k + ']')
for e in inf[k]:
print('%s' % _sig_get_str([e]))
def _sig_get_str(tlist):
def _to_str(t):
if isinstance(t,claripy.ast.Base):
return t.hz_repr()
elif isinstance(t,int) or isinstance(t,int):
return hex(t)
return str(t)
return tuple(map(_to_str,tlist))
hex_array = lambda a:str([hex(x) for x in a])
hex_array_sorted = lambda a:str([hex(x) for x in sorted(a)])
#I want to do some trimmings here, reduce some unnecessary (and possibly harmful when matching) nodes in the signature.
def trim_signature(sig,addrs={},options={}):
if options.get('trim_tail_abs_jmp',False):
_trim_tail_abs_jmp(sig)
if options.get('trim_tail_call_args',False):
_trim_tail_call_args(sig)
if options.get('trim_non_tail_roots',False):
_trim_non_tail_roots(sig)
#trim_non_tail_roots-%d-%d
range_trim = [x for x in list(options) if x.find('trim_non_tail_roots-')>=0]
for k in range_trim:
tks = k.split('-')
(n1,n2) = (int(tks[1]),int(tks[2]))
aset = get_insns_for_lines(addrs,n1,n2)
_trim_non_tail_roots(sig,aset)
#farg-ln0-ln1:Xn,Xm
farg = [x for x in list(options) if x.find('farg-')>=0]
for fa in farg:
tks = fa.split('-')
(n1,n2) = (int(tks[1]),int(tks[2]))
aset = get_insns_for_lines(addrs,n1,n2)
_trim_farg_aarch64(sig,aset,options[fa])
#Return the instruction set for a specific line range.
def get_insns_for_lines(addrs,l0,l1):
aset = set()
for ln in addrs:
if ln >= l0 and ln <= l1:
aset = aset.union(addrs[ln])
return aset
#Within the instruction set, only reserve the ones filling the specified registers.
def _trim_farg_aarch64(sig,aset,ps):
for n in sig.nodes():
if 'padding' in sig.node[n]:
continue
formulas = sig.node[n]['formulas']
for k in list(formulas):
if not k in aset:
continue
if formulas[k]['type'] in ('load','other'):
regs = set([x for x in formulas[k] if x != 'type' and isinstance(formulas[k][x],list)])
if not regs.intersection(ps):
formulas.pop(k)
sig.graph['root_ins'].discard(k)
sig.node[n]['root_ins'].discard(k)
#Only reserve the last instruction as the root instruction for each node.
def _trim_non_tail_roots(sig,aset=set()):
for n in sig.nodes():
if 'padding' in sig.node[n]:
continue
formulas = sig.node[n]['formulas']
kl = sorted(list(formulas))
for k in kl[:-1]:
if k in aset:
formulas.pop(k)
sig.graph['root_ins'].discard(k)
sig.node[n]['root_ins'].discard(k)
#For the tail Ijk_Call node, ignore all the root instructions beside the 'call' instruction itself.
#This is useful when sometimes we want to simply include a call below the patch site into the signature as context, while in the meanwhile
#don't want the args of this call to introduce extra noises with their semantic formulas.
def _trim_tail_call_args(sig):
for n in sig.nodes():
if 'padding' in sig.node[n]:
continue
if sig.out_degree(n) != 0 or sig.node[n]['jumpkind'] != 'Ijk_Call':
continue
formulas = sig.node[n]['formulas']
for k in list(formulas):
if formulas[k]['type'] in ('load','other','store'):
formulas.pop(k)
sig.graph['root_ins'].discard(k)
sig.node[n]['root_ins'].discard(k)
#Eliminate all tail nodes that simply do an absolute jump of Ijk_Boring.
def _trim_tail_abs_jmp(sig):
if sig.number_of_nodes() <= 1:
return
def _pure_abs_jmp(n):
if n.size > 4 or sig.node[n]['out_d'] > 1 or sig.node[n]['jumpkind'] != 'Ijk_Boring':
return False
form = sig.node[n]['formulas'][n.addr]
if form['type'] == 'exit':
for _,g,_ in form['a-g-k']:
if str(g).split(' ')[1][:-1] != 'True':
return False
return True
to_remove = []
for n in sig.nodes():
if 'padding' in sig.node[n]:
continue
if sig.out_degree(n) == 0 and _pure_abs_jmp(n):
to_remove.append(n)
sig.remove_nodes_from(to_remove)
#There can be 'Ijk_Call' nodes in the signature, for these callees, what can we do if we have their function prototype?
#One thing for example is that we can obtain the format string for 'printk', instead of a simple global str pointer loaded into x0. Thus
#we can compare the format string contents to locate patch.
def analyze_func_args_aarch64(image,base,sig,options={}):
if options.get('match_fmt_str',False):
_analyze_fmt_str_aarch64(image,base,sig)
fmt_func_map_aarch64 = {
'printk':'x0',
'snprintf':'x2',
'__dynamic_pr_debug':'x1',
'seq_printf':'x1',
'_dev_info':'x1',
'dev_err':'x1',
'dprintk':'x1',
'write_str':'x1',
}
#If there is any callee in the signature that has a format string as one parameter, try to extract the format string from the image if possible.
def _analyze_fmt_str_aarch64(image,base,sig):
with open(image,'r') as f:
for n in sig.nodes():
if not 'exit_func_name' in sig.node[n]:
continue
fmt_reg = fmt_func_map_aarch64.get(sig.node[n]['exit_func_name'],None)
if fmt_reg is None:
continue
formulas = sig.node[n]['formulas']
addr_ast = None
for k in formulas:
if fmt_reg in formulas[k]:
addr_ast = formulas[k][fmt_reg]
break
if addr_ast is None:
#Maybe the register is assigned a value in other nodes, we'll see..
print('!!! Cannot find the formulas for fmt arg reg %s in node %x' % (fmt_reg,n.addr))
continue
#OK, we get the addr ast for the fmt string.
#For now we only process the concrete addr.
addr = None
for a in addr_ast:
if a.op == 'BVV':
addr = a.args[0]
break
if addr is None:
print('No concrete addr found in: ' + str(addr_ast))
continue
#Finally... Extract the string from image.
offset = addr - base
f.seek(-1,2)
size = f.tell()
if offset < 0 or offset > size:
#print 'Addr %x out bound.' % addr
continue
f.seek(offset)
fmt_str = ''
while True:
c = f.read(1)
if c == '\0':
break
else:
fmt_str += c
if fmt_str != '':
sig.node[n]['fmt_str'] = fmt_str
#This intends to
#(1) reduce the redundant formulas stored in signature.
#(2) extract 'if' terms from state_merge related ITEs.
def simplify_signature(sig):
#Deduplicate based on str rep.
def _dedup(tlist):
seen = set()
def _f1(x):
x = x if type(x).__name__ in ('tuple','list') else [x]
s = str(_sig_get_str(x))
if s in seen:
return False
else:
seen.add(s)
return True
return list(filter(_f1,tlist))
for node in sig.nodes():
if 'padding' in sig.node[node]:
continue
for addr in sig.node[node]['formulas']:
inf = sig.node[node]['formulas'][addr]
if 'a-d-l' in inf:
r = []
for e in inf['a-d-l']:
r += excavate_if_tuple(e)
inf['a-d-l'] = _dedup(r)
elif 'a-g-k' in inf:
r = []
for e in inf['a-g-k']:
r += excavate_if_tuple(e)
inf['a-g-k'] = _dedup(r)
else:
for k in inf:
if k in ('type'):
continue
if isinstance(inf[k],list):
r = []
for e in inf[k]:
t = excavate_if(e)
r += [x[1] for x in t]
inf[k] = _dedup(r)
def excavate_if_tuple(t):
terms = [excavate_if(e) for e in t]
res = []
def _is_consistent(cl):
seen = set()
for e in cl:
for i in e:
if str(claripy.Not(i)) in seen:
return False
else:
seen.add(str(i))
return True
for e in itertools.product(*terms):
if _is_consistent([x[0] for x in e]):
#e: ((),(),..)
res += [tuple([x[1] for x in e])]
return res
#Excavate the 'if' conditions and ignore all 'state_merge' conditions.
def excavate_if(f):
if not isinstance(f,claripy.ast.Base):
return [([],f)]
fs = [([],f)]
prev = 0
while len(fs) > prev:
prev = len(fs)
new = []
for e in fs:
ee = e[1].ite_excavated
if ee.op == 'If':
if is_state_merge_condition(ee.args[0]):
new += [(e[0]+[ee.args[0]],ee.args[1]),(e[0]+[claripy.Not(ee.args[0])],ee.args[2])]
else:
#TODO: What can we do here? Maybe a deeper excavation..