import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
Begin by looking at the structure of the csv that contains the data:
with open("./data/bbc-text.csv", 'r') as csvfile:
print(f"First line (header) looks like this:\n\n{csvfile.readline()}")
print(f"Each data point looks like this:\n\n{csvfile.readline()}")
As you can see, each data point is composed of the category of the news article followed by a comma and then the actual text of the article.
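For instance, a row of the form "category,article text" is split into those two fields by csv.reader (the row below is made up for illustration, not taken from the actual dataset; csv is already imported at the top):
from io import StringIO

# Hypothetical row, only to illustrate the category,text layout described above
example_row = "tech,broadband speeds keep rising across the country"
category, text = next(csv.reader(StringIO(example_row)))
print(category)  # tech
print(text)      # broadband speeds keep rising across the country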
Let's create a function that removes the stopwords:
def remove_stopwords(sentence):
    """
    Removes the stopwords from a sentence

    Args:
        sentence (string): sentence to remove the stopwords from

    Returns:
        sentence (string): lowercase sentence without the stopwords
    """
    # List of stopwords
    stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]

    # Sentence converted to lowercase-only
    sentence = sentence.lower()

    # Keep only the words that are not in the stopword list
    words = sentence.split()
    filtered_sentence = []
    for w in words:
        if w not in stopwords:
            filtered_sentence.append(w)
    sentence = ' '.join(filtered_sentence)

    return sentence
# Test our function
remove_stopwords("I am about to go to the store and get any snack")
Now we will read the data from the csv file. A couple of things to note: the first line of the file contains the column headers and should be skipped, and the remove_stopwords function is applied to each sentence.
filename = "./data/bbc-text.csv"
sentences = []
labels = []
with open(filename, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    line = 0
    for row in reader:
        # Skip the first line since it contains the headers
        # (we could also use next(reader) instead)
        if line == 0:
            line += 1
        else:
            labels.append(row[0])
            sentences.append(remove_stopwords(str(row[1])))
            line += 1
Let's check our data:
print("ORIGINAL DATASET:\n")
print(f"There are {len(sentences)} sentences in the dataset.\n")
print(f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 5 labels are {labels[:5]}\n\n")
# Check with a miniature version of the dataset that contains only the first 5 rows
mini_sentences, mini_labels = parse_data_from_file("./data/bbc-text-minimal.csv")
print("MINIATURE DATASET:\n")
print(f"There are {len(mini_sentences)} sentences in the miniature dataset.\n")
print(f"First sentence has {len(mini_sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(mini_labels)} labels in the miniature dataset.\n")
print(f"The first 5 labels are {mini_labels[:5]}")
Now it is time to tokenize the sentences of the dataset:
# Instantiate the Tokenizer class by passing in the oov_token argument
tokenizer = Tokenizer(oov_token="<OOV>")
# Fit on the sentences
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(f"Vocabulary contains {len(word_index)} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")
Here is what we are doing: text -> sentences -> words -> tokens -> sequences -> padded sequences.
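As a quick illustration of these steps on two made-up sentences (not from the dataset), here is a small sketch; the exact integer ids depend on word frequencies, but it shows how unseen words map to the <OOV> token and how padding produces a rectangular array:
toy_tokenizer = Tokenizer(oov_token="<OOV>")
toy_tokenizer.fit_on_texts(["deep learning models learn patterns",
                            "learning patterns takes data"])

# Words that were never seen during fit_on_texts map to the <OOV> token (index 1 by default)
toy_sequences = toy_tokenizer.texts_to_sequences(["deep models need data"])
print(toy_tokenizer.word_index)
print(toy_sequences)

# pad_sequences turns the variable-length sequences into a 2D numpy array
print(pad_sequences(toy_sequences, padding='post', maxlen=6))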
# Convert sentences to sequences
sequences = tokenizer.texts_to_sequences(sentences)
# Pad the sequences using the post padding strategy
padded_sequences = pad_sequences(sequences, padding='post')
print(f"First padded sequence looks like this: \n\n{padded_sequences[0]}\n")
print(f"Numpy array of all sequences has shape: {padded_sequences.shape}\n")
print(f"This means there are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}")
# Instantiate the Tokenizer class
# No need to pass additional arguments since you will be tokenizing the labels
label_tokenizer = Tokenizer()
# Fit the tokenizer to the labels
label_tokenizer.fit_on_texts(labels)
# Save the word index
label_word_index = label_tokenizer.word_index
# Save the sequences
label_sequences = label_tokenizer.texts_to_sequences(labels)
print(f"Vocabulary of labels looks like this {label_word_index}\n")
print(f"First ten sequences {label_sequences[:10]}\n")