forked from alinamatyukhina/Attribution_Real_World
-
Notifications
You must be signed in to change notification settings - Fork 0
/
moss_tool_plag.py
85 lines (69 loc) · 2.79 KB
/
moss_tool_plag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import re
import itertools
import os
import sys
import glob
#This code is used to get the names of files whose mutual overlap is greater than or equal to a selected threshold.
#1. Run MOSS using a command which is indicated in their moss.pl file “moss [-l language] [-d] [-b basefile1] ... [-b basefilen] [-m #] [-c "string"] file1 file2 file3 ...”.
#Their server outputs a link, for example, http://moss.stanford.edu/results/898124800
#2. The results from accessing this link can be stored to file url.txt after running the following command from command line: "wget -O url.txt 'http://moss.stanford.edu/results/898124800'"
#3. Run this script (moss_tool_plag.py) to get the list of files whose mutual overlap is greater than or equal to the selected threshold.
#The threshold is currently 10. To change it, edit the value on every line marked **CHANGE THRESHOLD IF NEEDED** in the code.
#4. To remove files from the folder the standard command in Linux terminal can be used “rm /path/to/directory/file_name”
#Repeat steps 1-4 until no files are left with a mutual overlap greater than or equal to the selected threshold.
#This code should be located in the same folder as java files
#Can be run from any Python IDE
# Parsing the webpage: read the saved MOSS results page (url.txt) and collect
# the link text of every match entry in the results table into `n`.
total_total=[]
h=[]
# A context manager guarantees the file is closed, even if reading fails.
with open("url.txt", 'r') as crimefile1:
    # Skip everything up to and including the header row of the results table.
    for line in crimefile1:
        if line.strip() == '<TR><TH>File 1<TH>File 2<TH>Lines Matched':
            break
    # The same file iterator continues from where the first loop stopped, so
    # this collects only the table rows, up to the closing </TABLE> tag.
    for line in crimefile1:
        if line.strip() == '</TABLE>':
            break
        h.append(line)
n=[]
for row in h:
    # Rows starting with '<TD ALIGN' are skipped as before; rows without a
    # match link (e.g. blank lines) have nothing to extract and previously
    # raised IndexError.
    if row.startswith('<TD ALIGN') or 'html">' not in row:
        continue
    # Keep the text between the end of the link target and the closing </A>.
    n.append(row.split('html">')[1].split('</A>')[0])
# Reading the results: flatten every label in `n` into the list `a` as
# alternating entries [name, percent, name, percent, ...]. The percent is
# kept as the string found between "(" and "%".
a=[]
for element in n:
    # Split once, on the LAST space, so a file name that itself contains
    # spaces stays whole instead of being cut at its first space.
    pieces = element.rsplit(' ', 1)
    k = pieces[0]
    m = pieces[1]  # still raises IndexError when the label has no space
    a.append(k)
    a.append(m.split("(")[1].split("%")[0])
# Runs of digits; the capturing group makes split() keep the digits it finds.
numbers = re.compile(r'(\d+)')


def numericalSort(value):
    """Return a sort key that orders embedded numbers by value.

    The string is split around its digit runs, which land at the odd
    positions of the result; those are converted to int so that, for
    example, "f2" sorts before "f10".
    """
    tokens = numbers.split(value)
    for pos in range(1, len(tokens), 2):
        tokens[pos] = int(tokens[pos])
    return tokens
# For every source file in the current folder, find the highest overlap
# percentage reported for it (0 when it is not mentioned at all).
#   output -> [max percent, ...] in file order
#   b      -> [name, max percent, name, max percent, ...]
output=[]
b=[]
# `a` alternates [name, percent, ...]. Group the percents by name once, so
# each file is a dictionary lookup instead of a rescan of the whole list,
# and so a name can only ever be compared against name slots.
reported={}
for fname, percent in zip(a[::2], a[1::2]):
    reported.setdefault(fname, []).append(percent)
for infile in sorted(glob.glob('*.java'), key=numericalSort): ##*.java can be changed to *.c or *.cc
    name=str(infile)
    # Percents are converted only for files that are actually present.
    l=[int(p) for p in reported.get(name, ())]
    top=max(l) if l else 0
    b.append(name)
    b.append(top)
    output.append(top)
print(output)
print(b)
# Single place to set the overlap threshold (percent); it was previously
# repeated as a literal in both checks below and had to be edited twice.
THRESHOLD = 10 #**CHANGE THRESHOLD IF NEEDED**
# Overlap percentages that reach the threshold, and how many there are.
hh=[int(score) for score in output if int(score)>=THRESHOLD]
c=len(hh)
total_total.append(c)
# outputting the answer
# `b` alternates [name, max percent, ...]; keep the names whose percent
# reaches the threshold.
r=[fname for fname, score in zip(b[::2], b[1::2]) if score>=THRESHOLD]
print('Answer:', r) #this outputs the files whose mutual overlap is greater than or equal to the selected threshold