-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5a8c562
commit 454519a
Showing
4 changed files
with
203 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
/** | ||
* The script aims to remove duplicated files within each directory. | ||
* | ||
* Environment Requirement: Java 7+, Eclipse | ||
* Date of Last Modified: 06/19/2020 | ||
* Author: Yingfei(Jeremy) Xiang | ||
* | ||
*/ | ||
|
||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.InputStream; | ||
import java.security.MessageDigest; | ||
import java.util.HashMap; | ||
|
||
/**
 * Removes duplicate files inside each immediate sub-directory of a root directory.
 *
 * <p>Two files are treated as duplicates when their MD5 checksums are equal. Within one
 * sub-directory the first file encountered with a given checksum is kept and every later
 * file with the same checksum is deleted. Files are only compared with other files in the
 * same sub-directory; nested directories are not descended into.
 */
public class deduplication {

    /**
     * Root used when no path is given on the command line. Note that Java does not expand
     * {@code ~}, so this is resolved relative to the working directory.
     */
    private static final String DEFAULT_ROOT = "~\\test";

    /** Lower-case hex digits used to render a digest. */
    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is the root directory whose sub-directories are
     *             de-duplicated. When absent, {@link #DEFAULT_ROOT} is used.
     * @throws IllegalArgumentException if the root is not a listable directory
     * @throws Exception if a file cannot be read or MD5 is unavailable
     */
    public static void main(String args[]) throws Exception {
        String rootPath = (args != null && args.length > 0) ? args[0] : DEFAULT_ROOT;
        File root = new File(rootPath);

        // listFiles() returns null (not an empty array) when the path is not a directory
        // or cannot be read, so it must be checked before iterating.
        File[] entries = root.listFiles();
        if (entries == null) {
            throw new IllegalArgumentException("Not a readable directory: " + root.getPath());
        }

        for (File dir : entries) {
            if (dir.isDirectory()) {
                removeDuplicates(dir);
            }
        }
    }

    /** Deletes every regular file in {@code dir} whose checksum matches an earlier file in it. */
    private static void removeDuplicates(File dir) throws Exception {
        File[] candidates = dir.listFiles();
        if (candidates == null) {
            return;
        }

        // Keyed by checksum so the "already seen" test is a hash lookup; the value records
        // the name of the file that was kept.
        HashMap<String, String> keptByChecksum = new HashMap<String, String>();
        for (File candidate : candidates) {
            // Directories cannot be opened as a stream; only regular files are hashed.
            if (!candidate.isFile()) {
                continue;
            }
            String checksum = getMD5Checksum(candidate.getPath());
            if (keptByChecksum.containsKey(checksum)) {
                if (!candidate.delete()) {
                    System.err.println("Could not delete duplicate: " + candidate.getPath());
                }
            } else {
                keptByChecksum.put(checksum, candidate.getName());
            }
        }
    }

    /**
     * Computes the MD5 digest of a file's contents.
     *
     * @param filename path of the file to read
     * @return the 16-byte MD5 digest
     * @throws Exception if the file cannot be opened or read, or MD5 is unavailable
     */
    public static byte[] createChecksum(String filename) throws Exception {
        MessageDigest digest = MessageDigest.getInstance("MD5");
        byte[] buffer = new byte[8192];
        // try-with-resources closes the stream even when read() throws.
        try (InputStream in = new FileInputStream(filename)) {
            int numRead;
            while ((numRead = in.read(buffer)) != -1) {
                digest.update(buffer, 0, numRead);
            }
        }
        return digest.digest();
    }

    /**
     * Returns the MD5 checksum of a file as a lower-case hexadecimal string.
     *
     * @param filename path of the file to read
     * @return two hex characters per digest byte
     * @throws Exception if the checksum cannot be computed
     */
    public static String getMD5Checksum(String filename) throws Exception {
        byte[] digest = createChecksum(filename);
        StringBuilder hex = new StringBuilder(digest.length * 2);
        for (byte b : digest) {
            hex.append(HEX_DIGITS[(b >> 4) & 0x0f]).append(HEX_DIGITS[b & 0x0f]);
        }
        return hex.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
############################################################################# | ||
############################################################################# | ||
# | ||
# | ||
# The script aims to remove duplicated files within each directory. | ||
# | ||
# | ||
# Environment Requirement: Java 7+, Eclipse | ||
# Date of Last Modified: 06/19/2020 | ||
# Author: Yingfei(Jeremy) Xiang | ||
# | ||
# Steps:
# 1. Make a backup copy of the original dataset (duplicate files are deleted in place)
# 2. Open and run the script in a Java 7+ environment via Eclipse
# 3. Change the directory path in the script accordingly
# 4. Duplicate files will be removed within each directory
# | ||
# | ||
############################################################################# | ||
############################################################################# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
############################################################################# | ||
############################################################################# | ||
# | ||
# | ||
# The script aims to remove non-text characters in each sample. | ||
# | ||
# | ||
# Environment Requirement: Python 3.5, Anaconda3(with nltk installed) | ||
# Date of Last Modified: 06/19/2020 | ||
# Author: Yingfei(Jeremy) Xiang | ||
# | ||
# Steps:
# 1. Open and run the script in a Python 3.5 environment via Anaconda 3
# 2. Pass the path of the sample directory as the first command-line argument ('argv')
# 3. Each .txt sample is overwritten in place with non-text characters removed;
#    a sample that ends up empty is deleted
# | ||
# | ||
############################################################################# | ||
############################################################################# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#################################################################################################################################################### | ||
# | ||
# The script aims to remove non-text characters in each sample. | ||
# | ||
# Environment Requirement: Python 3.5, Anaconda3 | ||
# Date of Last Modified: 06/19/2020 | ||
# Author: Yingfei(Jeremy) Xiang | ||
# | ||
#################################################################################################################################################### | ||
|
||
import re | ||
from sgmllib import SGMLParser | ||
import sys,os | ||
import nltk | ||
from nltk.parse.stanford import StanfordDependencyParser, StanfordNeuralDependencyParser | ||
from langdetect import detect | ||
|
||
def main(argv):
    """Clean every ``.txt`` sample in a directory, in place.

    For each ``.txt`` file directly inside ``argv[0]`` the text is stripped of
    SGML/HTML tags, URLs, non-ASCII characters and the literal token
    ``urlLink``. It is then split into sentences, and only sentences that the
    dependency parser accepts and that ``langdetect`` classifies as English are
    kept. The file is overwritten with the surviving sentences joined by single
    spaces, or deleted when nothing survives.

    Args:
        argv: command-line arguments without the program name; ``argv[0]`` is
            the directory that holds the samples.

    Raises:
        SystemExit: if no directory argument is supplied.
    """
    if not argv:
        sys.exit('Usage: pass the directory containing the .txt samples as the first argument')
    indir = argv[0]

    # Filter once so the progress total and the work list cannot disagree.
    txt_files = [name for name in os.listdir(indir) if name.endswith('.txt')]
    numfiles = len(txt_files)
    if numfiles == 0:
        return

    # NOTE(review): sgmllib is a Python 2 module -- confirm the target interpreter.
    class TextExtracter(SGMLParser):
        """Collects the character data of a document, discarding the markup."""

        def __init__(self):
            self.text = []
            SGMLParser.__init__(self)

        def handle_data(self, data):
            self.text.append(data)

        def getvalue(self):
            # Read this instance's own buffer rather than an outer variable.
            return ''.join(self.text)

    # Loop-invariant helpers are built once instead of once per file.
    url_re = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    dep_parser = StanfordNeuralDependencyParser()

    for cnt, filename in enumerate(txt_files, 1):
        print('(%d/%d) - Processing %s' % (cnt, numfiles, filename))
        fname = os.path.join(indir, filename)
        # The context manager closes the handle even if a later step raises.
        with open(fname, "r") as text_file:
            text = text_file.read()

        #-----remove html tags-----
        ex = TextExtracter()
        ex.feed(text)
        text = ex.getvalue()

        #-----remove urls-----------------------
        text = url_re.sub('', text)

        #------remove non-ascii characters------
        text = re.sub(r'[^\x00-\x7f]', r'', text)

        #-----remove urlLink--------------------
        text = text.replace("urlLink", "")

        # #-------remove emoji--------------------
        # emoji_RE=re.compile(r'\*\s[a-z]+\s\*')
        # emoj_list = (':)',': )',':~',':-)',': - )',':-(',': - (',':(',': (',':B',':|','8-)',':<',':$',':X',': X',':Z',':\'(',':-|',': - |',':@',':P',': P',':D',': D',':O',':+','Cb',':Q',':T',',@P',',@-D',':d',',@o',':g','|-)',':!',':L',':>',',@f',':-S',',@x',',@@',',@!','xx','&-(','B-)','<@','@>',':-O',': - O','>-|','P-(',':\'|','X-)',':*','@x','8*','pd','<W>','@)','jj','@@','lvu','<L>','<O>','/[]','#-0','/[]','<&','&>','oY')
        # text=emoji_RE.sub('',text)
        # for i in xrange(len(emoj_list)):
        #     text=text.replace(emoj_list[i],'')

        # parse sentences and get valid sentences
        sents = nltk.sent_tokenize(text)
        valid_sents = []
        invalid_sents = 0
        for sent in sents:
            try:
                # The parse result is not used; a sentence that makes the
                # parser raise is counted as invalid below.
                dep_parser.raw_parse(sent)
                # Retain only english sentences
                if detect(sent) == 'en':
                    valid_sents.append(sent)
                else:
                    invalid_sents = invalid_sents + 1
            except Exception:
                # Exception (not a bare except) so Ctrl-C and SystemExit
                # still stop the run.
                invalid_sents = invalid_sents + 1
        print('%d/%d sentences were valid' % (len(sents) - invalid_sents, len(sents)))
        text = ' '.join(valid_sents)

        if len(text) > 0:
            with open(fname, "w") as text_file:
                text_file.write(text)
        else:
            print('Deleting %s' % fname)
            os.remove(fname)
|
||
# Script entry point: forward the command-line arguments, minus the program
# name, to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)