fix chunking issue #3

Open · wants to merge 2 commits into base: master
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
*.pyc

# Windows image file caches
Thumbs.db
ehthumbs.db
80 changes: 1 addition & 79 deletions createDictionaries.py
@@ -11,9 +11,9 @@
import cPickle
import re
from twokenize import tokenize
from utils import process_line

from random import seed

seed(50)

#optimization that is currently not used
@@ -40,15 +40,6 @@ def read_random_line(f, chunk_size=128):
    return f_handle.readline()



def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False


def diff_times_in_seconds(t1,t2,date1,date2):
    t1 = t1.split(':')
    t2 = t2.split(':')
@@ -69,75 +60,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
    return t2_secs - t1_secs


def is_url(s):
    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') or s.startswith('ftps://') or s.startswith('smb://')


def replace_sentence(text):
    if isinstance(text,basestring) == False:
        return text
    words = nltk.word_tokenize(text)
    sent = nltk.pos_tag(words)
    chunks = nltk.ne_chunk(sent, binary=False)
    sentence = []
    nodelist = ['PERSON','ORGANIZATION','GPE','LOCATION','FACILITY','GSP']
    for c,word in zip(chunks, words):
        changed = False
        if hasattr(c, 'node'):
            if c.node in nodelist:
                sentence.append("__%s__" % c.node)
                changed = True
        if not changed:
            if is_url(word):
                sentence.append("__URL__")
            elif is_number(word):
                sentence.append("__NUMBER__")
            elif os.path.isabs(word):
                sentence.append("__PATH__")
            else:
                sentence.append(word)
    return " ".join(sentence)

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"`", " ` ", string)
    string = re.sub(r",", " , ", string)
    string = string.replace('</s>', '__EOS__')
    return string.strip()

def process_token(c, word):
    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
    if hasattr(c, 'label'):
        if c.label() in nodelist:
            return "__%s__" % c.label()
    if is_url(word):
        return "__URL__"
    elif is_number(word):
        return "__NUMBER__"
    elif os.path.isabs(word):
        return "__PATH__"
    return word

def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    #return [process_token(None,token).lower() for token in tokens]
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]


class CreateDataset:

    def __init__(self,path):
59 changes: 1 addition & 58 deletions createdataset.py
@@ -15,22 +15,10 @@
import cPickle
import re
from twokenize import tokenize
from utils import process_line

seed(500)


def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_url(s):
    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
        or s.startswith('ftps://') or s.startswith('smb://')


def diff_times_in_seconds(t1,t2,date1,date2):
    """
    Returns the difference in time (in seconds) between two dates
@@ -55,51 +43,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
    return t2_secs - t1_secs


def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"`", " ` ", string)
    string = re.sub(r",", " , ", string)
    return string.strip()

def process_token(c, word):
    """
    Use NLTK to replace named entities with generic tags.
    Also replace URLs, numbers, and paths.
    """
    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
    if hasattr(c, 'label'):
        if c.label() in nodelist:
            return "__%s__" % c.label()
    if is_url(word):
        return "__URL__"
    elif is_number(word):
        return "__NUMBER__"
    elif os.path.isabs(word):
        return "__PATH__"
    return word

def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]


class CreateDataset:

    def __init__(self,path):
58 changes: 1 addition & 57 deletions find_testfiles.py
@@ -8,63 +8,7 @@
from twokenize import tokenize
import nltk
from sklearn.externals import joblib



def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_url(s):
    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
        or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"`", " ` ", string)
    string = re.sub(r",", " , ", string)
    return string.strip()

def process_token(c, word):
    """
    Use NLTK to replace named entities with generic tags.
    Also replace URLs, numbers, and paths.
    """
    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
    if hasattr(c, 'label'):
        if c.label() in nodelist:
            return "__%s__" % c.label()
    if is_url(word):
        return "__URL__"
    elif is_number(word):
        return "__NUMBER__"
    elif os.path.isabs(word):
        return "__PATH__"
    return word

def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
from utils import process_line

def writeFiles(csvname, data, listbool=False, overwrite=False):
"""
58 changes: 1 addition & 57 deletions find_testfiles2.py
@@ -8,63 +8,7 @@
from twokenize import tokenize
import nltk
from sklearn.externals import joblib



def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_url(s):
    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
        or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"`", " ` ", string)
    string = re.sub(r",", " , ", string)
    return string.strip()

def process_token(c, word):
    """
    Use NLTK to replace named entities with generic tags.
    Also replace URLs, numbers, and paths.
    """
    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
    if hasattr(c, 'label'):
        if c.label() in nodelist:
            return "__%s__" % c.label()
    if is_url(word):
        return "__URL__"
    elif is_number(word):
        return "__NUMBER__"
    elif os.path.isabs(word):
        return "__PATH__"
    return word

def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
from utils import process_line

def writeFiles(csvname, data, listbool=False, overwrite=False):
"""
72 changes: 72 additions & 0 deletions utils.py
@@ -0,0 +1,72 @@
import itertools
import nltk
import os
import re
from twokenize import tokenize

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_url(s):
    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
        or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"`", " ` ", string)
    string = re.sub(r",", " , ", string)
    return string.strip()

def process_token(c):
    """
    Use NLTK to replace named entities with generic tags.
    Also replace URLs, numbers, and paths.
    """
    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
    if hasattr(c, 'label'):
        if c.label() in nodelist:
            return "__%s__" % c.label()
    word = ' '.join([t[0] for t in c.leaves()]) if isinstance(c, nltk.tree.Tree) else c[0]
    if is_url(word):
        return "__URL__"
    elif is_number(word):
        return "__NUMBER__"
    elif os.path.isabs(word):
        return "__PATH__"
    return word

def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c).lower().encode('UTF-8') for c in chunks]

def test():
    s='''
    hi, please some1 can help me with my driver in ubuntu :( its a intel GM965 i tried compiz, but give me the error, Checking for Xgl: not present. Blacklisted PCIID '8086:2a02' found aborting and using fallback: /usr/bin/metacity some1 can help me please :( what kind of video card are you running? if you're not sure exactly, lspci | grep -i vga will tell you nickrud 00:02.0 VGA compatible controller: Intel Corporation Mobile GM965/GL960 Integrated Graphics Controller (rev 03) http://wiki.compiz-fusion.org/Hardware/Blacklist nickrud ty i go try it
    '''

    print process_line(s)

if __name__ == '__main__':
    test()
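
A note on the chunking fix for context: the copies of process_line deleted above paired NE-chunker output with the raw token list positionally (map(None, chunks, tokens) in these scripts, zip(chunks, words) in the old replace_sentence), while the new utils.process_token(c) walks the chunk list alone and recovers a multi-token entity's surface form from c.leaves(). The sketch below is a hypothetical illustration of that misalignment, not code from this PR: the sentence, the hand-built chunk list standing in for nltk.ne_chunk output, and the surface() helper are all invented for the example. It is Python 2 to match the rest of the repository and needs only NLTK's Tree class, no tagger or chunker models.

# Hypothetical demonstration (not part of the PR). Only nltk.tree.Tree is used;
# no NLTK corpora or models are required.
from nltk.tree import Tree

# Six tokens, but ne_chunk-style output folds the two-token entity into a
# single subtree, so there are only five chunks.
tokens = ['i', 'live', 'in', 'New', 'York', 'now']
chunks = [('i', 'PRP'), ('live', 'VBP'), ('in', 'IN'),
          Tree('GPE', [('New', 'NNP'), ('York', 'NNP')]), ('now', 'RB')]

# Old behaviour (deleted above): pair chunks with tokens positionally.
# The GPE tree lines up with 'New', the chunk ('now', 'RB') lines up with
# 'York', and the token 'now' ends up paired with None.
old_pairs = map(None, chunks, tokens)   # Python 2 padding zip
for c, token in old_pairs:
    print (c, token)

# New behaviour (utils.process_token): iterate the chunks alone and recover
# the surface form from the tree leaves when the chunk is a named entity.
def surface(c):
    if isinstance(c, Tree):
        return "__%s__" % c.label()
    return c[0]

print [surface(c) for c in chunks]   # ['i', 'live', 'in', '__GPE__', 'now']

Keeping the shared helpers only in utils.py also means the four scripts can no longer drift apart the way the deleted copies already had (createDictionaries.py's clean_str, for example, additionally mapped '</s>' to '__EOS__').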