-
Notifications
You must be signed in to change notification settings - Fork 0
/
bigram_model.py
65 lines (51 loc) · 2.05 KB
/
bigram_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import random
from collections import defaultdict
import requests
import spacy
# Load the small English spaCy pipeline; it is used below to tokenize the text.
nlp = spacy.load("en_core_web_sm")

# URL of the text file for "Alice's Adventures in Wonderland"
url = 'https://www.gutenberg.org/files/11/11-0.txt'

# Fetch the text from the URL.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being tokenized as if
# it were the book.
response = requests.get(url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8-sig'  # Decode as UTF-8 and drop a leading BOM if present
text = response.text

# Clean the text: keep only the span between the two markers.
# str.find() returns -1 when a marker is absent; used directly as a slice
# bound, -1 would silently cut the text in the wrong place, so fall back to
# the corresponding end of the document instead.
start_index = text.find("CHAPTER I")
if start_index == -1:
    start_index = 0
end_index = text.find("End of Project Gutenberg's Alice's Adventures in Wonderland")
if end_index == -1:
    end_index = len(text)
text = text[start_index:end_index]

# Use spaCy to tokenize the text
doc = nlp(text)

# Extract token strings, skipping tokens that consist only of whitespace
tokens = [token.text for token in doc if not token.is_space]

# Count the number of tokens
num_tokens = len(tokens)
print(f"Number of tokens: {num_tokens}")
# Pair every token with its immediate successor to form the bigram list.
bigrams = list(zip(tokens, tokens[1:]))

# Nested mapping: first word -> following word -> count (turned into a
# probability by the normalisation pass below).
bigram_model = defaultdict(lambda: defaultdict(int))

# Tally how often each word is followed by each other word.
for first, second in bigrams:
    bigram_model[first][second] += 1

# Divide each follower count by the total for its first word, so the values
# under every first word sum to 1.
for followers in bigram_model.values():
    follower_total = sum(followers.values())
    for follower in followers:
        followers[follower] /= follower_total
# Function to generate text using the bigram model
def generate_text(start_word: str, model, num_words: int = 50, threshold: float = 0.01) -> str:
    """Generate a space-joined word sequence by sampling from a bigram model.

    Starting at ``start_word``, repeatedly pick a follower of the current
    word at random, weighted by its probability, considering only followers
    whose probability is at least ``threshold``.

    Args:
        start_word: First word of the output; it is always emitted.
        model: Mapping of word -> {next word -> probability}. It is only
            read, never modified.
        num_words: Maximum number of words in the output, counting
            ``start_word``.
        threshold: Minimum probability a follower needs to be eligible.

    Returns:
        The generated words joined by single spaces. The result is shorter
        than ``num_words`` when the current word has no eligible follower.
    """
    current_word = start_word
    result = [current_word]
    for _ in range(num_words - 1):
        # Use .get() rather than model[...]: indexing a defaultdict would
        # insert an empty entry for every unseen word (mutating the caller's
        # model), and indexing a plain dict would raise KeyError.
        followers = model.get(current_word, {})
        candidates = {
            word: prob for word, prob in followers.items() if prob >= threshold
        }
        if not candidates:
            break  # Stop if there are no next words above the threshold
        next_word = random.choices(
            list(candidates), weights=list(candidates.values())
        )[0]
        result.append(next_word)
        current_word = next_word
    return ' '.join(result)
# Sample up to 50 words from the bigram model, beginning at "Alice", and
# print the resulting sentence.
generated_text = generate_text(
    start_word="Alice",
    model=bigram_model,
    num_words=50,
    threshold=0.01,
)
print(generated_text)