# tweet.py

# -*- coding: utf-8 -*-
"""Tweet Emotion Recognition.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1o_u3O0JJrtFW_IoRbA9wioGFPvsbtLRK

## Tweet Emotion Recognition: Natural Language Processing with TensorFlow

---

Manuel Arambula Gonzalez

Dataset: [Tweet Emotion Dataset](https://github.com/dair-ai/emotion_dataset)

Guided project [Tweet Emotion Recognition with TensorFlow](https://www.coursera.org/learn/tweet-emotion-tensorflow)

---

## Task 1: Setup and Imports

1. Installing Hugging Face's nlp package
2. Importing libraries
"""

# Notebook shell magic; commented out so this file parses as plain Python
# (same convention as the `%matplotlib inline` magic below).
# Install the package beforehand with: pip install nlp
# !pip install nlp

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random


def show_history(h):
    """Plot training and validation curves side by side.

    The left panel shows accuracy (y-axis fixed to [0, 1]); the right panel
    shows loss. The number of epochs is taken from the length of
    ``h.history['loss']``.

    Args:
        h: Object exposing a ``history`` mapping with the keys ``'loss'``,
            ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``.
    """
    history = h.history
    epoch_axis = range(0, len(history['loss']))

    # (subplot position, training key, validation key, y label, y limits)
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Accuracy', [0., 1.]),
        (2, 'loss', 'val_loss', 'Loss', None),
    )

    plt.figure(figsize=(16, 6))
    for position, train_key, val_key, y_label, y_limits in panels:
        plt.subplot(1, 2, position)
        plt.plot(epoch_axis, history.get(train_key), label='Training')
        plt.plot(epoch_axis, history.get(val_key), label='Validation')
        if y_limits is not None:
            plt.ylim(y_limits)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
    plt.show()


def show_confusion_matrix(y_true, y_pred, classes):
    """Display a row-normalized confusion matrix as a colored grid.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices, aligned with ``y_true``.
        classes: Tick labels, one per class, in class-index order. Any
            number of classes is supported; the tick positions are derived
            from the number of labels given.
    """
    from sklearn.metrics import confusion_matrix

    # normalize='true' divides each row by the number of true samples of
    # that class, so every row sums to 1.
    cm = confusion_matrix(y_true, y_pred, normalize='true')

    # Materialize once so any iterable of labels can be measured and reused.
    classes = list(classes)
    tick_positions = list(range(len(classes)))

    plt.figure(figsize=(8, 8))
    sp = plt.subplot(1, 1, 1)
    ctx = sp.matshow(cm)
    plt.xticks(tick_positions, labels=classes)
    plt.yticks(tick_positions, labels=classes)
    plt.colorbar(ctx)
    plt.show()


# Report which TensorFlow build is active, to make runs easier to reproduce.
print('Using TensorFlow version', tf.__version__)

"""## Task 2: Importing Data

1. Importing the Tweet Emotion dataset
2. Creating train, validation and test sets
3. Extracting tweets and labels from the examples
"""

# Notebook shell magic; commented out so this file parses as plain Python.
# Install the package beforehand with: pip install datasets
# !pip install datasets
import datasets

# Load the dataset published under the name 'emotion'.
dataset = datasets.load_dataset('emotion')

# Bare expression: displays the dataset in a notebook, no effect in a script.
dataset

# The three splits used below for training, validation and testing.
train = dataset['train']
val = dataset['validation']
test = dataset['test']

def get_tweets(data):
    """Split an iterable of examples into parallel text and label lists.

    The input is traversed exactly once, so one-shot iterables (generators,
    streams) work and row decoding is not paid for twice.

    Args:
        data: Iterable of mappings, each with a ``'text'`` and a ``'label'``
            key.

    Returns:
        A ``(tweets, labels)`` tuple of two lists of equal length, in input
        order.

    Raises:
        KeyError: If an example lacks the ``'text'`` or ``'label'`` key.
    """
    tweets = []
    labels = []
    for example in data:
        tweets.append(example['text'])
        labels.append(example['label'])
    return tweets, labels

# Training texts and their labels; both names are reused by later cells
# (tokenizer fitting, length histogram, label preparation).
tweets, labels = get_tweets(train)

# Bare expression: shows the first example in a notebook, no effect in a script.
tweets[0], labels[0]

"""## Task 3: Tokenizer

1. Tokenizing the tweets
"""

from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenizer configured with num_words=10000 and '<UNK>' as the
# out-of-vocabulary token.
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

# The vocabulary is fitted on the training tweets only.
tokenizer.fit_on_texts(tweets)

# Sanity check: print the integer sequence produced for the first training tweet.
print(tokenizer.texts_to_sequences([tweets[0]]))

"""## Task 4: Padding and Truncating Sequences

1. Checking length of the tweets
2. Creating padded sequences
"""

# Token count per training tweet, splitting on single spaces only.
lengths = list(map(len, (t.split(' ') for t in tweets)))

# One histogram bin per distinct length value.
plt.hist(lengths, bins=len(set(lengths)))
plt.show()

from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_sequences(tokenizer, tweets, maxlen=50):
    """Convert texts to fixed-length integer sequences.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        tweets: List of texts to convert.
        maxlen: Length every sequence is padded or truncated to. Defaults
            to 50, the value previously hard-coded here; it must match the
            input length the model is built with.

    Returns:
        The result of ``pad_sequences``: one row per text, padded and
        truncated at the end (``'post'``) to ``maxlen`` entries.
    """
    sequences = tokenizer.texts_to_sequences(tweets)
    return pad_sequences(
        sequences,
        maxlen=maxlen,
        padding='post',
        truncating='post',
    )

# Padded/truncated integer sequences for the training tweets; used as the
# model's training input further down.
padded_train_sequences = get_sequences(tokenizer, tweets)

# Bare expression: shows the first sequence in a notebook, no effect in a script.
padded_train_sequences[0]

"""## Task 5: Preparing the Labels

1. Creating classes to index and index to classes dictionaries
2. Converting text labels to numeric labels
"""

# Sorted so the class -> index assignment is identical on every run; iterating
# a bare set gives no ordering guarantee (and varies between runs for strings).
classes = sorted(set(labels))
print(classes)

plt.hist(labels, bins=11)
plt.show()

# Forward and reverse lookup tables between raw labels and model class indices.
classes_to_index = {c: i for i, c in enumerate(classes)}
index_to_classes = {i: c for c, i in classes_to_index.items()}

# Bare expressions: display the tables in a notebook, no effect in a script.
classes_to_index

index_to_classes


def names_to_ids(labels):
    """Map raw labels to their class indices.

    Args:
        labels: Iterable of raw labels as they appear in the dataset.

    Returns:
        A NumPy array holding the class index of each label, in input order.

    Raises:
        KeyError: If a label is not present in ``classes_to_index``. Failing
            here is deliberate: a silent ``None`` would turn the result into
            an object array and surface much later as an obscure training
            error.
    """
    return np.array([classes_to_index[x] for x in labels])


train_labels = names_to_ids(labels)
print(train_labels[0])

"""## Task 6: Creating the Model

1. Creating the model
2. Compiling the model
"""

# Layers are added one at a time, in the same order as the stacked definition:
# embedding -> two bidirectional LSTMs -> softmax classifier.
model = tf.keras.models.Sequential()

# 10000 and 50 mirror the tokenizer's num_words and the padding length used
# earlier in this file.
model.add(tf.keras.layers.Embedding(10000, 16, input_length=50))

# The first recurrent layer returns the full sequence so the second one has
# a sequence to consume.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)))

# Six softmax outputs.
model.add(tf.keras.layers.Dense(6, activation='softmax'))

# The sparse loss is used because the labels are integer class indices.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

"""## Task 7: Training the Model

1. Preparing a validation set
2. Training the model
"""

# Validation split, prepared with the same tokenizer and label mapping as the
# training data.
val_tweets, val_labels = get_tweets(val)
val_labels = names_to_ids(val_labels)
val_sequences = get_sequences(tokenizer, val_tweets)

# Bare expression: shows the first validation example in a notebook.
val_tweets[0], val_labels[0]

# Train for up to 20 epochs; the callback monitors validation accuracy with
# a patience of 2.
h = model.fit(
    x=padded_train_sequences,
    y=train_labels,
    epochs=20,
    validation_data=(val_sequences, val_labels),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_accuracy'),
    ],
)

"""## Task 8: Evaluating the Model

1. Visualizing training history
2. Preparing a test set
3. A look at individual predictions on the test set
4. A look at all predictions on the test set
"""

# Accuracy and loss curves recorded during training.
show_history(h)

# Test split, prepared with the same tokenizer and label mapping as the
# training data.
test_tweets, test_labels = get_tweets(test)
test_labels = names_to_ids(test_labels)
test_sequences = get_sequences(tokenizer, test_tweets)

# Return value is discarded.
_ = model.evaluate(x=test_sequences, y=test_labels)

# Emotion names come from the dataset's own label metadata rather than a
# hand-typed table, so index -> name cannot drift from the data.
# NOTE(review): assumes the model's class indices equal the dataset's integer
# label ids (labels are ints mapped to themselves during label preparation)
# -- confirm if the label preparation changes.
label_names = train.features['label'].names
index_to_classes = dict(enumerate(label_names))
classes_to_index = {name: idx for idx, name in index_to_classes.items()}

# Pick one random test example to inspect.
i = random.randint(0, len(test_labels) - 1)

print('Sentence:', test_tweets[i])
print('Emotion:', index_to_classes[test_labels[i]])

# Use predict and argmax to get predicted class
predictions = model.predict(np.expand_dims(test_sequences[i], axis=0))
p = np.argmax(predictions)

# Looked up only after `p` has been computed; doing it earlier raised a
# NameError.
predicted_emotion = index_to_classes.get(p)

print('Predicted Emotion:', predicted_emotion)

# Use predict to get the predicted probabilities for each class.
# This runs once: the notebook contained the same cell twice, which repeated
# inference over the whole test set for an identical result.
preds = model.predict(test_sequences)
# Get the class with the highest probability
preds = np.argmax(preds, axis=1)

# Bare expression: shows both shapes in a notebook, no effect in a script.
preds.shape, test_labels.shape

show_confusion_matrix(test_labels, preds, list(classes))

"""Label ids as defined by the dataset: 'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5"""