import io
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
You probably remember the structure of the CSV:
with open("./bbc-text.csv", 'r') as csvfile:
print(f"First line (header) looks like this:\n\n{csvfile.readline()}")
print(f"Each data point looks like this:\n\n{csvfile.readline()}")
As you can see, each data point is composed of the category of the news article followed by a comma and then the actual text of the article.
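Note that an article's text can itself contain commas, so the raw line cannot simply be split on ','; csv.reader respects CSV quoting and returns clean [category, text] pairs. A quick illustration with a made-up row (not from the dataset):

# Made-up row: the quoted field keeps its internal comma intact
row = next(csv.reader(io.StringIO('tech,"some made-up article text, with a comma"')))
print(row)  # ['tech', 'some made-up article text, with a comma']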
Define some global variables used throughout the assignment:

- NUM_WORDS: the maximum number of words to keep, based on word frequency. Defaults to 1000.
- EMBEDDING_DIM: dimension of the dense embedding, used in the embedding layer of the model. Defaults to 16.
- MAXLEN: maximum length of all sequences. Defaults to 120.
- PADDING: padding strategy (pad either before or after each sequence). Defaults to 'post'.
- OOV_TOKEN: token to replace out-of-vocabulary words during text_to_sequence calls. Defaults to "<OOV>".
- TRAINING_SPLIT: proportion of data used for training. Defaults to 0.8.

NUM_WORDS = 1000
EMBEDDING_DIM = 16
MAXLEN = 120
PADDING = 'post'
OOV_TOKEN = "<OOV>"
TRAINING_SPLIT = .8
def remove_stopwords(sentence):
    """
    Removes a list of stopwords

    Args:
        sentence (string): sentence to remove the stopwords from

    Returns:
        sentence (string): lowercase sentence without the stopwords
    """
    # List of stopwords
    stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]
    # Sentence converted to lowercase-only
    sentence = sentence.lower()
    words = sentence.split()
    no_words = [w for w in words if w not in stopwords]
    sentence = " ".join(no_words)
    return sentence
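To quickly sanity-check the function, you can run it on a made-up sentence (not from the dataset):

# Example (made up): stopwords are dropped and the result is lowercased
print(remove_stopwords("I am about to go to the store and get any snack"))
# go store get snack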
filename = "./bbc-text.csv"
sentences = []
labels = []
with open(filename, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the header row
    for row in reader:
        labels.append(row[0])
        sentence = row[1]
        sentence = remove_stopwords(sentence)
        sentences.append(sentence)
print(f"There are {len(sentences)} sentences in the dataset.\n")
print(f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 5 labels are {labels[:5]}")
# Compute the number of sentences that will be used for training (should be an integer)
train_size = int(len(sentences) * TRAINING_SPLIT)
# Split the sentences and labels into train/validation splits
train_sentences = sentences[0:train_size]
train_labels = labels[0:train_size]
val_sentences = sentences[train_size:]
val_labels = labels[train_size:]
print(f"There are {len(train_sentences)} sentences for training.\n")
print(f"There are {len(train_labels)} labels for training.\n")
print(f"There are {len(val_sentences)} sentences for validation.\n")
print(f"There are {len(val_labels)} labels for validation.")
Now that we have sets for training and validation, it is time for you to begin the tokenization process.
# Instantiate the Tokenizer class, passing in the correct values for num_words and oov_token
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token=OOV_TOKEN)
# Fit the tokenizer to the training sentences
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
print(f"Vocabulary contains {len(word_index)} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")
Now that the tokenizer has been fitted to the training data, we need to convert each text data point into its padded sequence representation:
# Convert sentences to sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)
# Pad the sequences using the correct padding and maxlen
train_padded_seq = pad_sequences(train_sequences, maxlen=MAXLEN, padding=PADDING)
val_padded_seq = pad_sequences(val_sequences, maxlen=MAXLEN, padding=PADDING)
print(f"Padded training sequences have shape: {train_padded_seq.shape}\n")
print(f"Padded validation sequences have shape: {val_padded_seq.shape}")
Finally, we need to tokenize the labels:
# Instantiate the Tokenizer (no additional arguments needed)
label_tokenizer = Tokenizer()
# Fit the tokenizer on all the labels
label_tokenizer.fit_on_texts(labels)
# Convert the labels of each split to sequences and then to a numpy array.
# Don't forget to subtract 1 from every entry, since the Tokenizer starts
# counting at 1 but the loss function expects class ids starting at 0!
train_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels)) - 1
val_label_seq = np.array(label_tokenizer.texts_to_sequences(val_labels)) - 1
print(f"First 5 labels of the training set should look like this:\n{train_label_seq[:5]}\n")
print(f"First 5 labels of the validation set should look like this:\n{val_label_seq[:5]}\n")
print(f"Tokenized labels of the training set have shape: {train_label_seq.shape}\n")
print(f"Tokenized labels of the validation set have shape: {val_label_seq.shape}\n")
tf.random.set_seed(123)
model = tf.keras.Sequential([
    # Embedding sized to the tokenizer vocabulary and padded sequence length
    tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_DIM, input_length=MAXLEN),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(65, activation='relu'),
    # 5 output units, one per news category
    tf.keras.layers.Dense(5, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(train_padded_seq, train_label_seq,
                    epochs=30,
                    validation_data=(val_padded_seq, val_label_seq))
Once training has finished, you can run the following cell to check the training and validation accuracy achieved at the end of each epoch.
def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, f'val_{metric}'])
    plt.show()
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")