forked from alinamatyukhina/Attribution_Real_World
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kothari.py
125 lines (95 loc) · 3.83 KB
/
kothari.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# This script extracts the features of Kothari et al. (described in the study https://ieeexplore.ieee.org/document/4151691) from source code files.
# The main output file is kothari.arff, which contains the Kothari et al. feature vectors.
# The script should be located in the same folder as the input files and can be run from any Python IDE.
# The input consists of Java files whose names follow a pattern such as
# "a_____N10001.java", where "a" is the file name and N10001 is the author. For example, author N10001 can have 4 files:
# "a_____N10001.java", "b_____N10001.java", "c_____N10001.java", "d_____N10001.java".
from functools import partial
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
from collections import Counter
import string
import itertools
import re
import math
import glob, os
# Matches runs of decimal digits; the capture group makes re.split keep them.
numbers = re.compile(r'(\d+)')


def numericalSort(value):
    """Return a sort key that orders embedded numbers by numeric value.

    The string is split into alternating text / digit-run pieces and every
    digit run is converted to an int, so that e.g. "a2" sorts before "a10".
    """
    pieces = numbers.split(value)
    # Because the pattern has a capture group, the digit runs sit at the odd
    # positions of the split result.
    for position in range(1, len(pieces), 2):
        pieces[position] = int(pieces[position])
    return pieces
# Stage 1: for every input source file, count its consecutive 4-character
# chunks and store the counts in "<name>.final4gram", one
# "<chunk> ===== <count>" line per distinct chunk.
# To process C sources instead, change the glob pattern below to "*.c".
for inputFilename in sorted(glob.glob("*.java"), key=numericalSort):
    # Read the whole file and close the handle right away.
    with open(inputFilename, 'r') as source_file:
        data = source_file.read()
    # Drop all line breaks so that chunks run across line boundaries.
    # str.replace works on Python 2 and 3; the string.replace() function
    # exists only in Python 2.
    data = data.replace('\n', '').replace('\r', '')
    # Non-overlapping chunks of 4 characters (the last one may be shorter).
    k = [data[start:start + 4] for start in range(0, len(data), 4)]
    a = dict(Counter(k))
    # splitext removes only the final extension, so file names that contain
    # additional dots keep their full "<name>_____<author>" stem.
    # Mode 'w' truncates any previous output for this file.
    with open("%s.final4gram" % os.path.splitext(inputFilename)[0], 'w') as myfile1:
        for key in a:
            myfile1.write('%s ===== %f\n' % (key, a[key]))
# Stage 2: build all_4gram.txt, the list of every distinct chunk found in any
# "*.final4gram" file, one chunk per line, in first-seen order.
# numericalSort (and its `numbers` pattern) is already defined earlier in this
# file, so it is not redefined here.
seen_grams = set()  # chunks already written to all_4gram.txt
# Mode 'w' starts from an empty vocabulary file on every run.
with open("all_4gram.txt", 'w') as vocabulary_file:
    for inputFilename1 in sorted(glob.glob('*.final4gram'), key=numericalSort):
        print(inputFilename1)
        with open(inputFilename1, 'r') as crimefile1:
            for line in crimefile1:
                # Each line has the form "<chunk> ===== <count>".
                gram = line.split(' ===== ')[0]
                # A set lookup replaces re-reading the whole vocabulary file
                # for every single chunk.
                if gram not in seen_grams:
                    seen_grams.add(gram)
                    vocabulary_file.write(gram + '\n')
# Stage 3: turn every "*.final4gram" file into one row of kothari.arff.
# A row holds the file's count for every chunk listed in all_4gram.txt
# (0.0 when the file does not contain the chunk), in sorted chunk order,
# followed by the author label taken from the file name.
# numericalSort (and its `numbers` pattern) is already defined earlier in this
# file, so it is not redefined here.
# NOTE(review): only data rows are written here; no ARFF header
# (@relation / @attribute / @data) is emitted -- confirm whether that is
# added by a later step.

# The vocabulary is the same for every input file, so read and sort it once
# instead of once per file.
inputFilename2 = "all_4gram.txt"
with open(inputFilename2, 'r') as crimefile2:
    b = sorted(set(line.split('\n')[0] for line in crimefile2))

# Mode 'w' starts from an empty output file; all rows go through one handle.
with open("kothari.arff", 'w') as myfile:
    for inputFilename1 in sorted(glob.glob('*.final4gram'), key=numericalSort):
        # Chunk -> count for this file, parsed from "<chunk> ===== <count>".
        x = {}
        with open(inputFilename1, 'r') as crimefile1:
            for line in crimefile1:
                d = line.split(' ===== ')
                # float() ignores the trailing newline.
                x[d[0]] = float(d[1])
        # One value per vocabulary chunk; chunks absent from this file get 0.0.
        ar = [str(x.get(gram, 0.0)) for gram in b]
        print('ar=',len(ar),inputFilename1)
        # File names look like "<name>_____<author>.final4gram"; the author is
        # the part between the last "_____" and the extension.
        author = str(inputFilename1).rsplit('_____', 1)[1].rsplit('.', 1)[0]
        myfile.write(','.join(ar) + "," + author + "\n")