import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
# Define path for file with sonnets
SONNETS_FILE = './sonnets.txt'
# Read the data
with open(SONNETS_FILE) as f:
    data = f.read()
# Convert to lower case and save as a list
corpus = data.lower().split("\n")
print(f"There are {len(corpus)} lines of sonnets\n")
print(f"The first 5 lines look like this:\n")
for i in range(5):
print(corpus[i])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
When converting the text into sequences, you can use the texts_to_sequences method as you have done throughout this course.
It is important to keep in mind that the way you feed the data into this method affects the result. Check the following example to make this clearer.
The first example of the corpus is a string and looks like this:
corpus[0]
If you pass this text directly into the texts_to_sequences method you will get an unexpected result:
tokenizer.texts_to_sequences(corpus[0])
This happens because texts_to_sequences expects a list and you are providing a string. However, a string is still an iterable in Python, so you will get the word index of every character in the string.
Instead, you need to place the example within a list before passing it to the method:
tokenizer.texts_to_sequences([corpus[0]])
Notice that the sequence comes back wrapped inside a list, so to get only the desired sequence you need to explicitly take the first item of the list, like this:
tokenizer.texts_to_sequences([corpus[0]])[0]
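To see the difference end to end, here is a minimal sketch with a toy tokenizer (the corpus ["a b c"] and its word indices are made up for illustration only):

toy_tokenizer = Tokenizer()
toy_tokenizer.fit_on_texts(["a b c"])  # toy corpus, word_index becomes {'a': 1, 'b': 2, 'c': 3}
# Passing the raw string makes the tokenizer treat every character as a separate text
print(toy_tokenizer.texts_to_sequences("a b c"))       # [[1], [], [2], [], [3]]
# Wrapping it in a list tokenizes the whole line as one text
print(toy_tokenizer.texts_to_sequences(["a b c"]))     # [[1, 2, 3]]
# Taking the first item gives the sequence itself
print(toy_tokenizer.texts_to_sequences(["a b c"])[0])  # [1, 2, 3]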
This function receives the fitted tokenizer and the corpus (which is a list of strings) and should return a list containing the n_gram sequences for each line in the corpus:
def n_gram_seqs(corpus, tokenizer):
    """
    Generates a list of n-gram sequences

    Args:
        corpus (list of string): lines of text to generate n-grams for
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary

    Returns:
        input_sequences (list of list of int): the n-gram sequences for each line in the corpus
    """
    input_sequences = []

    # Loop over every line
    for line in corpus:
        # Tokenize the current line
        token_list = tokenizer.texts_to_sequences([line])[0]

        # Loop over the tokenized line to generate the subphrases
        for i in range(1, len(token_list)):
            # Generate the subphrase (the first i+1 tokens)
            n_gram_sequence = token_list[:i+1]

            # Append the subphrase to the sequences list
            input_sequences.append(n_gram_sequence)

    return input_sequences
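To make the slicing in the inner loop concrete, here is a minimal sketch with a made-up token list (the ids are hypothetical, not taken from the real tokenizer):

toy_token_list = [34, 417, 877, 166, 213]  # hypothetical token ids for a 5-word line
print([toy_token_list[:i+1] for i in range(1, len(toy_token_list))])
# [[34, 417], [34, 417, 877], [34, 417, 877, 166], [34, 417, 877, 166, 213]]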
# Test your function with one example
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)
print("n_gram sequences for first example look like this:\n")
first_example_sequence
# Test your function with a bigger corpus
next_3_examples_sequence = n_gram_seqs(corpus[1:4], tokenizer)
print("n_gram sequences for next 3 examples look like this:\n")
next_3_examples_sequence
Apply the n_gram_seqs transformation to the whole corpus and save the maximum sequence length to use it later:
# Apply the n_gram_seqs transformation to the whole corpus
input_sequences = n_gram_seqs(corpus, tokenizer)
# Save max length
max_sequence_len = max([len(x) for x in input_sequences])
print(f"n_grams of input_sequences have length: {len(input_sequences)}")
print(f"maximum length of sequences is: {max_sequence_len}")
Now you will code the pad_seqs function, which pads any given sequences to the desired maximum length. Notice that this function receives a list of sequences and should return a numpy array with the padded sequences:
def pad_seqs(input_sequences, maxlen):
    """
    Pads tokenized sequences to the same length

    Args:
        input_sequences (list of list of int): tokenized sequences to pad
        maxlen (int): maximum length of the token sequences

    Returns:
        padded_sequences (array of int): tokenized sequences padded to the same length
    """
    padded_sequences = np.array(pad_sequences(input_sequences, maxlen=maxlen, padding='pre'))

    return padded_sequences
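As a quick sanity check before running it on the real data, here is a minimal sketch with made-up sequences showing how 'pre' padding fills zeros on the left:

toy_sequences = [[5, 7], [5, 7, 12], [5, 7, 12, 3]]  # hypothetical token sequences
print(pad_seqs(toy_sequences, maxlen=4))
# [[ 0  0  5  7]
#  [ 0  5  7 12]
#  [ 5  7 12  3]]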
# Test your function with the n-gram sequences of the first example
first_padded_seq = pad_seqs(first_example_sequence, max([len(s) for s in first_example_sequence]))
first_padded_seq
# Test your function with the n-gram sequences of the next 3 examples
next_3_padded_seq = pad_seqs(next_3_examples_sequence, max([len(s) for s in next_3_examples_sequence]))
next_3_padded_seq
# Pad the whole corpus
input_sequences = pad_seqs(input_sequences, max_sequence_len)
print(f"padded corpus has shape: {input_sequences.shape}")
Before feeding the data into the neural network, you should split it into features and labels. In this case, the features will be the padded n-gram sequences with the last word removed, and the labels will be the removed word.
The function below expects the padded n-gram sequences as input and should return a tuple containing the features and the one-hot encoded labels.
def features_and_labels(input_sequences, total_words):
    """
    Generates features and labels from n-grams

    Args:
        input_sequences (array of int): padded sequences to split into features and labels
        total_words (int): vocabulary size

    Returns:
        features, one_hot_labels (array of int, array of int): arrays of features and one-hot encoded labels
    """
    # All tokens except the last one are the features
    features = input_sequences[:, :-1]
    # The last token of every sequence is the label
    labels = input_sequences[:, -1]
    # One-hot encode the labels over the whole vocabulary
    one_hot_labels = to_categorical(labels, num_classes=total_words)

    return features, one_hot_labels
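To see the split in isolation, here is a minimal sketch with made-up padded sequences and a tiny vocabulary size (both values are hypothetical):

toy_padded = np.array([[0, 0, 5, 7], [0, 5, 7, 3]])  # hypothetical padded n-grams
toy_features, toy_labels = features_and_labels(toy_padded, total_words=10)
print(toy_features)      # rows are [0 0 5] and [0 5 7]: every token except the last one
print(toy_labels.shape)  # (2, 10): the removed tokens 7 and 3, one-hot encoded over 10 classes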
# Test your function with the padded n-gram sequences of the first example
first_features, first_labels = features_and_labels(first_padded_seq, total_words)
print(f"labels have shape: {first_labels.shape}")
print("\nfeatures look like this:\n")
first_features
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Split the whole corpus into features and labels
features, labels = features_and_labels(input_sequences, total_words)

# Train the model
history = model.fit(features, labels, epochs=50, verbose=1)
# Take a look at the training curves of your model
acc = history.history['accuracy']
loss = history.history['loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.title('Training accuracy')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
plt.title('Training loss')
plt.legend()

plt.show()
After all your work, it is finally time to see your model generating text.
Run the cell below to generate the next 100 words of a seed text.
seed_text = "Help me Obi Wan Kenobi, you're my only hope"
next_words = 100
for _ in range(next_words):
    # Convert the text into sequences
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad the sequences
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Get the probabilities of predicting a word
    predicted = model.predict(token_list, verbose=0)
    # Choose the next word based on the maximum probability
    predicted = np.argmax(predicted, axis=-1).item()
    # Get the actual word from the word index
    output_word = tokenizer.index_word[predicted]
    # Append to the current text
    seed_text += " " + output_word

print(seed_text)