-
Notifications
You must be signed in to change notification settings - Fork 8
/
17_nltk2_tokenize2.py
30 lines (23 loc) · 1008 Bytes
/
17_nltk2_tokenize2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# The Natural Language Toolkit Library contains many tools
# that will be useful to us over the course of this class
import nltk
# Read the full text of the book. A `with` block guarantees the file is
# closed even if read() raises, unlike a manual open()/close() pair.
with open("holmes.txt", "r", encoding="utf8") as textfile:
    holmesstring = textfile.read()

# Strip the Project Gutenberg boilerplate by slicing between the
# START and END markers. NOTE: str.find returns -1 when a marker is
# absent; we fall back to the full text in that case rather than
# silently producing a wrong slice.
startpoint = holmesstring.find('*** START OF THIS PROJECT GUTENBERG EBOOK')
endpoint = holmesstring.find('*** END OF THIS PROJECT GUTENBERG EBOOK')
if startpoint != -1 and endpoint != -1:
    holmesstring = holmesstring[startpoint:endpoint]

# Combined sentence and word tokenization:
# first create a dummy list to contain results
sentencesWordsTokens = []
# First break into sentences:
sentences = nltk.sent_tokenize(holmesstring)
# Then break each sentence into words, collecting a list of
# word-token lists (one inner list per sentence):
for sentence in sentences:
    tokenizedSentence = nltk.word_tokenize(sentence)
    sentencesWordsTokens.append(tokenizedSentence)
print(sentencesWordsTokens[1000])
# We can use list comprehensions to condense this code into a
# single line (same result as the loop above):
sWT = [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(holmesstring)]
print(sWT[1000])