
Commit

Add files via upload
SuperXiang authored Jun 19, 2020
1 parent 5a8c562 commit 454519a
Showing 4 changed files with 203 additions and 0 deletions.
65 changes: 65 additions & 0 deletions Preprocessing/Remove duplication/deduplication.java
@@ -0,0 +1,65 @@
/**
 * This script removes duplicated files within each directory.
 *
 * Environment Requirement: Java 7+, Eclipse
 * Last Modified: 06/19/2020
 * Author: Yingfei (Jeremy) Xiang
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.security.MessageDigest;
import java.util.HashMap;

public class deduplication {
    public static void main(String args[]) throws Exception {
        // Change this path before running ("~" is not expanded by Java,
        // so use an absolute path).
        File root = new File("~\\test");
        for (File dir : root.listFiles()) {
            // Only descend into sub-directories; listFiles() would return
            // null for a plain file.
            if (!dir.isDirectory()) {
                continue;
            }
            // Maps each kept file name to its MD5 checksum; a later file
            // with an already-seen checksum is a duplicate and is deleted.
            HashMap<String, String> seen = new HashMap<String, String>();
            for (File f : dir.listFiles()) {
                String checksum = getMD5Checksum(f.getPath());
                if (!seen.containsValue(checksum)) {
                    seen.put(f.getName(), checksum);
                } else {
                    f.delete();
                }
            }
        }
    }
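
    // Illustration (hypothetical layout): if test\blogA holds a.txt and b.txt
    // with byte-identical contents, whichever file listFiles() returns first
    // is kept and the other is deleted.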


    public static byte[] createChecksum(String filename) throws Exception {
        MessageDigest complete = MessageDigest.getInstance("MD5");
        // Stream the file in 1 KB chunks so large files need not fit in
        // memory; try-with-resources (Java 7+) closes the stream on all paths.
        try (InputStream fis = new FileInputStream(filename)) {
            byte[] buffer = new byte[1024];
            int numRead;
            while ((numRead = fis.read(buffer)) != -1) {
                complete.update(buffer, 0, numRead);
            }
        }
        return complete.digest();
    }

    // Convert the MD5 digest to a lower-case hex string.
    public static String getMD5Checksum(String filename) throws Exception {
        byte[] b = createChecksum(filename);
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < b.length; i++) {
            // (b[i] & 0xff) + 0x100 yields a three-hex-digit value whose
            // last two digits are the zero-padded byte.
            result.append(Integer.toString((b[i] & 0xff) + 0x100, 16).substring(1));
        }
        return result.toString();
    }
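
    // e.g. getMD5Checksum on an empty file returns
    // "d41d8cd98f00b204e9800998ecf8427e" (the well-known MD5 of empty input).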

}
20 changes: 20 additions & 0 deletions Preprocessing/Remove duplication/readme.txt
@@ -0,0 +1,20 @@
#############################################################################
#############################################################################
#
#
# The script aims to remove duplicated files within each directory.
#
#
# Environment Requirement: Java 7+, Eclipse
# Last Modified: 06/19/2020
# Author: Yingfei (Jeremy) Xiang
#
# Steps:
#   1. Copy the original dataset (duplicated files are deleted in place)
#   2. Change the directory path in deduplication.java accordingly
#   3. Open and run the script in a Java 7+ environment via Eclipse
#   4. Duplicated files within each directory will be removed (see the
#      example run below)
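#
# Example run from the script's folder (a sketch; assumes the hard-coded
# path inside deduplication.java has already been edited):
#   javac deduplication.java
#   java deduplication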
#
#
#############################################################################
#############################################################################
19 changes: 19 additions & 0 deletions Preprocessing/Remove non-text/readme.txt
@@ -0,0 +1,19 @@
#############################################################################
#############################################################################
#
#
# The script aims to remove non-text characters in each sample.
#
#
# Environment Requirement: Python 3.5, Anaconda3 (with nltk installed)
# Last Modified: 06/19/2020
# Author: Yingfei (Jeremy) Xiang
#
# Steps:
#   1. Change into the script's directory and pass the path of the sample
#      directory as the command-line argument ('argv')
#   2. Run the script in a Python 3.5 environment via Anaconda 3
#   3. Each sample is saved back with non-text characters removed; samples
#      left empty are deleted (see the example invocation below)
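#
# Example invocation (a sketch; the directory path is hypothetical):
#   python remove_non_text.py /path/to/samples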
#
#
#############################################################################
#############################################################################
99 changes: 99 additions & 0 deletions Preprocessing/Remove non-text/remove_non_text.py
@@ -0,0 +1,99 @@
####################################################################################################################################################
#
# The script aims to remove non-text characters in each sample.
#
# Environment Requirement: Python 3.5, Anaconda3
# Last Modified: 06/19/2020
# Author: Yingfei (Jeremy) Xiang
#
####################################################################################################################################################

import re
import sys, os
# sgmllib was removed in Python 3; html.parser provides the same
# handle_data-based extraction hook.
from html.parser import HTMLParser
import nltk
from nltk.parse.stanford import StanfordNeuralDependencyParser
from langdetect import detect

def main(argv):
    indir = argv[0]
    files = os.listdir(indir)

    # Count the .txt files up front so progress can be shown as (i/N).
    numfiles = 0
    for filename in files:
        if filename.endswith('.txt'):
            numfiles = numfiles + 1

    # The neural dependency parser needs the Stanford CoreNLP jars visible to
    # NLTK (typically via the CLASSPATH environment variable); build it once
    # here rather than once per file.
    dep_parser = StanfordNeuralDependencyParser()

    cnt = 0
    for filename in files:
        if not filename.endswith('.txt'):
            continue
        cnt = cnt + 1
        print('(%d/%d) - Processing %s' % (cnt, numfiles, filename))
        fname = os.path.join(indir, filename)
        text_file = open(fname, "r+")
        text = text_file.read()

        # ----- remove html tags -----
        class TextExtracter(HTMLParser):
            def __init__(self):
                HTMLParser.__init__(self)
                self.text = []
            def handle_data(self, data):
                self.text.append(data)
            def getvalue(self):
                # Join the text fragments collected between tags.
                return ''.join(self.text)
        ex = TextExtracter()
        ex.feed(text)
        text = ex.getvalue()
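        # e.g. (illustration): '<p>Hello <b>world</b>!</p>' -> 'Hello world!'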

        # ----- remove urls -----
        URL_RE = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
        text = URL_RE.sub('', text)
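        # e.g. (illustration): 'see https://example.com for details' -> 'see  for details'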

        # ----- remove non-ascii characters -----
        text = re.sub(r'[^\x00-\x7f]', r'', text)
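        # e.g. (illustration): 'café au lait' -> 'caf au lait'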

        # ----- remove leftover "urlLink" markers -----
        text = text.replace("urlLink", "")

        # # ----- remove emoji -----
        # emoji_RE = re.compile(r'\*\s[a-z]+\s\*')
        # emoj_list = (':)',': )',':~',':-)',': - )',':-(',': - (',':(',': (',':B',':|','8-)',':<',':$',':X',': X',':Z',':\'(',':-|',': - |',':@',':P',': P',':D',': D',':O',':+','Cb',':Q',':T',',@P',',@-D',':d',',@o',':g','|-)',':!',':L',':>',',@f',':-S',',@x',',@@',',@!','xx','&-(','B-)','<@','@>',':-O',': - O','>-|','P-(',':\'|','X-)',':*','@x','8*','pd','<W>','@)','jj','@@','lvu','<L>','<O>','/[]','#-0','/[]','<&','&>','oY')
        # text = emoji_RE.sub('', text)
        # for i in range(len(emoj_list)):
        #     text = text.replace(emoj_list[i], '')

        # Parse each sentence and keep only those that both parse successfully
        # and are detected as English.
        sents = nltk.sent_tokenize(text)
        valid_sents = []
        invalid_sents = 0
        for sent in sents:
            try:
                parsed = dep_parser.raw_parse(sent)
                # Retain only English sentences
                if detect(sent) == 'en':
                    valid_sents.append(sent)
                else:
                    invalid_sents = invalid_sents + 1
            except Exception:
                # Unparseable (or undetectable) sentences are dropped.
                invalid_sents = invalid_sents + 1
        print('%d/%d sentences were valid' % (len(sents) - invalid_sents, len(sents)))
        text = ' '.join(valid_sents)

        text_file.close()
        if len(text) > 0:
            # Overwrite the sample with its cleaned text.
            text_file = open(fname, "w")
            text_file.write(text)
            text_file.close()
        else:
            # Nothing survived cleaning; drop the sample entirely.
            print('Deleting %s' % fname)
            os.remove(fname)

if __name__ == "__main__":
    main(sys.argv[1:])
