import os
import zipfile
import random
import shutil
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
import matplotlib.pyplot as plt
The images are now stored in the C:\Users\User\Downloads\PetImages directory. There is a subdirectory for each class: one for dogs and one for cats.
source_path = r'C:\Users\User\Downloads\PetImages'
source_path_dogs = os.path.join(source_path, 'Dog')
source_path_cats = os.path.join(source_path, 'Cat')
# os.listdir returns a list containing all files under the given path
print(f"There are {len(os.listdir(source_path_dogs))} images of dogs.")
print(f"There are {len(os.listdir(source_path_cats))} images of cats.")
You will need a directory for cats-v-dogs, and subdirectories for training and validation. These in turn will need subdirectories for 'cats' and 'dogs':
# Define root directory
root_dir = r'C:\Users\User\Downloads\PetImages\cats-v-dogs'
# Empty the directory to prevent a FileExistsError if the function is run several times
if os.path.exists(root_dir):
    shutil.rmtree(root_dir)

try:
    traindir = "training"
    valdir = "validation"
    catsdir = "cats"
    dogsdir = "dogs"

    # Create training and validation directories along with their subdirectories using makedirs
    training = os.path.join(root_dir, traindir)
    validation = os.path.join(root_dir, valdir)
    os.makedirs(root_dir)
    os.makedirs(training)
    os.makedirs(validation)
    os.makedirs(os.path.join(training, catsdir))
    os.makedirs(os.path.join(training, dogsdir))
    os.makedirs(os.path.join(validation, catsdir))
    os.makedirs(os.path.join(validation, dogsdir))
except FileExistsError:
    print("You should not be seeing this since the root directory is removed beforehand")
Let's check our structure:
for rootdir, dirs, files in os.walk(root_dir):
for subdir in dirs:
print(os.path.join(rootdir, subdir))
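For the directories defined above, the walk should print the two split directories followed by their class subdirectories, along these lines:

C:\Users\User\Downloads\PetImages\cats-v-dogs\training
C:\Users\User\Downloads\PetImages\cats-v-dogs\validation
C:\Users\User\Downloads\PetImages\cats-v-dogs\training\cats
C:\Users\User\Downloads\PetImages\cats-v-dogs\training\dogs
C:\Users\User\Downloads\PetImages\cats-v-dogs\validation\cats
C:\Users\User\Downloads\PetImages\cats-v-dogs\validation\dogs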
The files will be randomized, so that the training set is a random sample of the files, and the validation set is made up of the remaining files.
split_size will be the proportion of the images that goes to the training set.
We'll check the images before the split: if a file has zero length, it will be omitted from the copying process and a message such as "filename is zero length, so ignoring." will be printed. Performing this check before the split ensures that only non-zero images are considered when computing the actual split.
Let's create a function to do so:
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE):
    """
    Splits the data into train and validation sets

    Args:
      SOURCE_DIR (string): directory path containing the images
      TRAINING_DIR (string): directory path to be used for training
      VALIDATION_DIR (string): directory path to be used for validation
      SPLIT_SIZE (float): proportion of the dataset to be used for training

    Returns:
      None
    """
    # Discard zero-length files up front so that only valid images
    # take part in the split
    images = []
    for filename in os.listdir(SOURCE_DIR):
        if os.path.getsize(os.path.join(SOURCE_DIR, filename)) == 0:
            print(f"{filename} is zero length, so ignoring.")
        else:
            images.append(filename)

    # Shuffle the valid images, then cut the list at the split point
    shuffled_images = random.sample(images, len(images))
    training_count = int(len(shuffled_images) * SPLIT_SIZE)
    training_images = shuffled_images[:training_count]
    validation_images = shuffled_images[training_count:]

    # Copy every selected image into its destination directory
    for filename in training_images:
        copyfile(os.path.join(SOURCE_DIR, filename), os.path.join(TRAINING_DIR, filename))
    for filename in validation_images:
        copyfile(os.path.join(SOURCE_DIR, filename), os.path.join(VALIDATION_DIR, filename))
# Define paths
CAT_SOURCE_DIR = r'C:\Users\User\Downloads\PetImages\Cat'
DOG_SOURCE_DIR = r'C:\Users\User\Downloads\PetImages\Dog'
TRAINING_DIR = r'C:\Users\User\Downloads\PetImages\cats-v-dogs\training'
VALIDATION_DIR = r'C:\Users\User\Downloads\PetImages\cats-v-dogs\validation'
TRAINING_CATS_DIR = os.path.join(TRAINING_DIR, "cats/")
VALIDATION_CATS_DIR = os.path.join(VALIDATION_DIR, "cats/")
TRAINING_DOGS_DIR = os.path.join(TRAINING_DIR, "dogs/")
VALIDATION_DOGS_DIR = os.path.join(VALIDATION_DIR, "dogs/")
# Empty the destination directories in case we run this cell multiple times
for directory in [TRAINING_CATS_DIR, TRAINING_DOGS_DIR,
                  VALIDATION_CATS_DIR, VALIDATION_DOGS_DIR]:
    if len(os.listdir(directory)) > 0:
        for file in os.scandir(directory):
            os.remove(file.path)
# Define proportion of images used for training
split_size = .9
# Run the function
split_data(CAT_SOURCE_DIR, TRAINING_CATS_DIR, VALIDATION_CATS_DIR, split_size)
split_data(DOG_SOURCE_DIR, TRAINING_DOGS_DIR, VALIDATION_DOGS_DIR, split_size)
# Check the number of images in each directory
print(f"\n\nThere are {len(os.listdir(TRAINING_CATS_DIR))} images of cats for training")
print(f"There are {len(os.listdir(TRAINING_DOGS_DIR))} images of dogs for training")
print(f"There are {len(os.listdir(VALIDATION_CATS_DIR))} images of cats for validation")
print(f"There are {len(os.listdir(VALIDATION_DOGS_DIR))} images of dogs for validation")
Now that we have successfully organized the data in a way that can be easily fed to Keras' ImageDataGenerator, it is time to code the generators that will yield batches of images, both for training and validation.
The flow_from_directory method lets us standardize the image resolution: its target_size argument takes a tuple, and every image is resized to that resolution as it is loaded. We will use a target_size of (150, 150).
# Instantiate the ImageDataGenerator class and rescale for training
train_datagen = ImageDataGenerator( rescale = 1.0/255. )
train_generator = train_datagen.flow_from_directory(directory=TRAINING_DIR,
batch_size=32,
class_mode='binary',
target_size=(150, 150))
# Instantiate the ImageDataGenerator class and rescale for validation
validation_datagen = ImageDataGenerator( rescale = 1.0/255. )
validation_generator = validation_datagen.flow_from_directory(directory=VALIDATION_DIR,
batch_size=32,
class_mode='binary',
target_size=(150, 150))
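To confirm the generators behave as expected, we can pull a single batch and inspect its shape; this is a minimal sketch, relying on the iterator interface returned by flow_from_directory:

# Pull one batch from the training generator and inspect it
sample_images, sample_labels = next(train_generator)
print(sample_images.shape)  # expected: (32, 150, 150, 3)
print(sample_labels.shape)  # expected: (32,), one binary label per image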
from tensorflow.keras.optimizers import RMSprop
model = tf.keras.models.Sequential([
tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(150, 150, 3)),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Conv2D(32, (3,3), activation='relu'),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(512, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer=RMSprop(learning_rate=0.001),
loss='binary_crossentropy',
metrics=['accuracy'])
model.summary()
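As a cross-check on the summary: each Conv2D here uses the default 'valid' padding, so a 3x3 kernel shrinks each spatial dimension by 2, and each MaxPooling2D halves it (rounding down). The feature maps should therefore progress as:

150 -> conv -> 148 -> pool -> 74
 74 -> conv ->  72 -> pool -> 36
 36 -> conv ->  34 -> pool -> 17
 17 -> conv ->  15 -> pool ->  7
  7 -> conv ->   5 -> pool ->  2

so the Flatten layer outputs 2 * 2 * 64 = 256 features feeding the 512-unit Dense layer.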
# Train the model
history = model.fit(train_generator,
epochs=15,
verbose=1,
validation_data=validation_generator)
Let's run the cell below to check the training and validation accuracy achieved at the end of each epoch.
#-----------------------------------------------------------
# Retrieve the results on the training and validation
# data sets for each training epoch
#-----------------------------------------------------------
acc=history.history['accuracy']
val_acc=history.history['val_accuracy']
loss=history.history['loss']
val_loss=history.history['val_loss']
epochs=range(len(acc)) # Get number of epochs
#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and validation accuracy')
plt.legend()
plt.show()
print("")
#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title('Training and validation loss')
plt.legend()
plt.show()
You will probably find that the model is overfitting, which means that it is doing a great job at classifying the images in the training set but struggles with new data.
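A common first remedy is data augmentation: applying random transformations to the training images so the model sees more variety. As a hedged sketch, the training ImageDataGenerator above could be replaced with an augmenting one like this (the parameter values are illustrative, not tuned):

# Sketch: augmenting generator for the training set (values are illustrative)
train_datagen = ImageDataGenerator(rescale=1.0/255.,
                                   rotation_range=40,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True,
                                   fill_mode='nearest')
# The validation generator should keep only the rescaling: augmenting
# validation data would distort the metric we are trying to measure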