import re

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Load the small English spaCy model once at module import.
EN = spacy.load('en_core_web_sm')
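
# Setup note: the NLTK stopwords corpus and the spaCy English model must be
# available locally before use, e.g.:
#   python -c "import nltk; nltk.download('stopwords')"
#   python -m spacy download en_core_web_sm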


def tokenize_text(text):
    """Apply tokenization using spaCy to docstrings."""
    tokens = EN.tokenizer(text)
    # Lowercase each token and drop whitespace-only tokens.
    return [token.text.lower() for token in tokens if not token.is_space]


def to_lowercase(words):
    """Convert all characters to lowercase in a list of tokenized words."""
    new_words = []
    for word in words:
        new_words.append(word.lower())
    return new_words


def remove_punctuation(words):
    """Remove punctuation from a list of tokenized words."""
    new_words = []
    for word in words:
        # Strip every character that is not alphanumeric, underscore, or whitespace.
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words."""
    # Build the stop word set once instead of re-reading the corpus per token.
    stop_words = set(stopwords.words('english'))
    new_words = []
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words


def stem_words(words):
    """Stem each word in a list of tokenized words with the Lancaster stemmer."""
    stemmer = nltk.stem.lancaster.LancasterStemmer()
    stems = []
    for word in words:
        stems.append(stemmer.stem(word))
    return stems


def normalize(words):
    """Run the full normalization pipeline over a list of tokenized words."""
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    return words


def tokenize_code(text):
    """A very basic procedure for tokenizing code strings."""
    # Uses RegexpTokenizer from nltk.tokenize (imported above); splits on word characters.
    return RegexpTokenizer(r'\w+').tokenize(text)


def preprocess_text(text):
    """Tokenize and normalize a docstring, returning a single space-joined string."""
    return ' '.join(normalize(tokenize_text(text)))
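

# Illustrative usage sketch (the sample strings below are made up; exact output
# depends on the installed spaCy model, stopword list, and stemmer versions):
if __name__ == '__main__':
    docstring = 'Returns the cached results, or None if the cache is empty.'
    print(preprocess_text(docstring))
    print(tokenize_code('def add(a, b):\n    return a + b'))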