word_autogram.py
# word_autogram.py extracts token (word) n-gram features for a range of n-gram sizes.
# For each n it writes an output file term_freq_<n>word.arff, where <n> is the n-gram size.
# Place this script in the same folder as the .java files; it can be run from the command line or from any Python IDE.
# The input is a set of .java files whose names follow the pattern "<name>_____<author>.java",
# e.g. "a_____N10001.java", where "a" is the file name and N10001 is the author. An author such as N10001
# can have several files: "a_____N10001.java", "b_____N10001.java", "c_____N10001.java", "d_____N10001.java".
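# For illustration only (the tokens and counts below are made-up examples, not taken from a real corpus):
# each intermediate "<name>.final<n>word" file produced below stores one n-gram per line in the form
# "<token_1> ... <token_n> ===== <count>", e.g. for n = 2:
#     public static ===== 3
#     static void ===== 1
# and each row appended to term_freq_<n>word.arff is a comma-separated vector of n-gram counts
# (in sorted n-gram order, one column per n-gram seen anywhere in the corpus) followed by the
# author label parsed from the file name, e.g. "0.0,3.0,1.0,N10001".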
import collections
import glob
import re

from nltk.util import ngrams

numbers = re.compile(r'(\d+)')


def numericalSort(value):
    # Sort key that compares the integer parts of a file name numerically,
    # so that e.g. "a_____N10002.java" sorts after "a_____N10001.java".
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts


for split_word in range(1, 10):
    # Step 1: count the word n-grams of size split_word in every .java file and write them
    # to a "<name>.final<n>word" file, one "n-gram ===== count" line per distinct n-gram.
    for inputFilename in sorted(glob.glob("*.java"), key=numericalSort):
        with open(inputFilename, "rU") as f:
            sixgrams = ngrams(f.read().decode('latin-1').encode("utf-8").split(), split_word)
            result = collections.Counter(sixgrams)
            print result
        outputFilename = "%s.final%dword" % (inputFilename.split('.')[0], split_word)
        with open(outputFilename, "w") as f:
            for item, count in sorted(result.iteritems()):
                if count >= 0:
                    text = "{} ===== {}".format(" ".join(item), count)
                    print >>f, text
print("///////////////////////Begin of creating all_unigram.txt file//////////")
open("all_%dword.txt" % split_word, 'w').close()
import re
numbers = re.compile(r'(\d+)')
def numericalSort(value):
parts = numbers.split(value)
parts[1::2] = map(int, parts[1::2])
return parts
for inputFilename1 in sorted(glob.glob('*.final%dword' % split_word), key=numericalSort):
print(inputFilename1)
crimefile1 = open(inputFilename1, 'r')
yourResult1 = [line.split(' ===== ') for line in crimefile1.readlines()]
for el in range(len(yourResult1)):
with open("all_%dword.txt" % split_word, "r+") as file:
for line in file:
if yourResult1[el][0]==line[:-1]:
break
else: # not found, we are at the eof
file.write(str(yourResult1[el][0])+'\n') # append missing data
print("///////////////////////End of creating all_unigram.txt file//////////")
print("///////////////////////Begin of creating term_freq_unigram.arff file//////////")
numbers = re.compile(r'(\d+)')
def numericalSort(value):
parts = numbers.split(value)
parts[1::2] = map(int, parts[1::2])
return parts
open("term_freq_%dword.arff" %split_word, 'w').close()
#inputFilename1="AN1.finallf"
for inputFilename1 in sorted(glob.glob('*.final%dword' %split_word), key=numericalSort):
crimefile1 = open(inputFilename1, 'r')
yourResult1 = [line.split(' ===== ') for line in crimefile1.readlines()]
#print(yourResult1 )
x={d[0]: float(d[1][:-1]) for d in yourResult1 }
#print(x)
inputFilename2="all_%dword.txt" %split_word
crimefile2 = open(inputFilename2, 'r')
yourResult2 = [line.split('\n') for line in crimefile2.readlines()]
#print(yourResult2 )
b=[]
for j in range (0,len(yourResult2)):
b.append(yourResult2[j][0])
#print('Lalala=',b)
y={d: float(0) for d in b }
#print(y)
#print('-------')
z= { k: x.get(k, 0) + y.get(k, 0) for k in set(y) }
with open ("term_freq_%dword.arff" % split_word,"a+") as myfile:
ar=[]
for key, values in sorted(z.items()):
#print ( key,values)
ar.append(values)
print('ar=',len(ar),inputFilename1)
ar = map(str, ar)
ar1 = ','.join(ar)
#myfile.write(ar1)
myfile.write(str(ar1)+","+str(inputFilename1).rsplit('_____', 1)[1].rsplit('.',1)[0]+"\n")
#myfile.write(ar1+","+str(inputFilename1).rsplit('.', 1)[0][-4:][1:]+"\n")
myfile.close()
print("///////////////////////End of creating term_freq_unigram.arff file//////////")