Cats vs Dogs Dataset - Using CNN

In [1]:
import os
import zipfile
import random
import shutil
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
import matplotlib.pyplot as plt

Now the images are stored within the C:\Users\User\Downloads\PetImages directory. There is a subdirectory for each class, so one for dogs and one for cats.

In [2]:
# Root of the extracted dataset; each class lives in its own subdirectory.
source_path = r'C:\Users\User\Downloads\PetImages'

source_path_dogs = os.path.join(source_path, 'Dog')
source_path_cats = os.path.join(source_path, 'Cat')

# os.listdir returns a list containing all files under the given path,
# so its length is the number of images per class.
for species, folder in (("dogs", source_path_dogs), ("cats", source_path_cats)):
    print(f"There are {len(os.listdir(folder))} images of {species}.")
There are 12501 images of dogs.
There are 12501 images of cats.

You will need a directory for cats-v-dogs, and subdirectories for training and validation. These in turn will need subdirectories for 'cats' and 'dogs':

In [11]:
# Define root directory
root_dir = r'C:\Users\User\Downloads\PetImages\cats-v-dogs'

# Empty the directory to prevent FileExistsError if the cell is run several times
if os.path.exists(root_dir):
  shutil.rmtree(root_dir)

# Create training and validation directories, each with cats/dogs subdirectories.
# os.makedirs creates all intermediate directories, so root_dir, the split
# directories and the class directories are created in one call each.
# (No try/except is needed: the tree was just removed above, so none of these
# paths can already exist.)
for split_name in ("training", "validation"):
  for class_name in ("cats", "dogs"):
    os.makedirs(os.path.join(root_dir, split_name, class_name))

Let's check our structure:

In [12]:
# Walk the tree under root_dir and print every subdirectory that was created
for parent, subdirs, _files in os.walk(root_dir):
    for name in subdirs:
        print(os.path.join(parent, name))
C:\Users\User\Downloads\PetImages\cats-v-dogs\training
C:\Users\User\Downloads\PetImages\cats-v-dogs\validation
C:\Users\User\Downloads\PetImages\cats-v-dogs\training\cats
C:\Users\User\Downloads\PetImages\cats-v-dogs\training\dogs
C:\Users\User\Downloads\PetImages\cats-v-dogs\validation\cats
C:\Users\User\Downloads\PetImages\cats-v-dogs\validation\dogs

Data splitting and preparation

The files will be randomized, so that the training set is a random sample of the files, and the validation set is made up of the remaining files.

split_size will be the percentage of the training set

We'll check the images before the copy, so that any with a zero file length are omitted from the copying process. If this is the case, we will print out a message such as "filename is zero length, so ignoring.". We will perform this check before the split so that only non-zero images are considered when doing the actual split.

Let's create a function to do so:

In [14]:
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE):
  """
  Splits the data into train and validation sets.

  Zero-length files are reported and discarded BEFORE the split is computed,
  so only usable images are considered when sizing each set.

  Args:
    SOURCE_DIR (string): directory path containing the images
    TRAINING_DIR (string): directory path to be used for training
    VALIDATION_DIR (string): directory path to be used for validation
    SPLIT_SIZE (float): proportion of the dataset to be used for training

  Returns:
    None
  """
  # Filter out empty files first, reporting each one we skip.
  valid_images = []
  for filename in os.listdir(SOURCE_DIR):
    if os.path.getsize(os.path.join(SOURCE_DIR, filename)) == 0:
      print(f"{filename} is zero length, so ignoring.")
    else:
      valid_images.append(filename)

  # Random sample of SPLIT_SIZE of the valid images goes to training;
  # the remaining images form the validation set.
  training_count = int(len(valid_images) * SPLIT_SIZE)
  training_images = random.sample(valid_images, training_count)
  training_set = set(training_images)  # O(1) membership test instead of list removal
  validation_images = [img for img in valid_images if img not in training_set]

  for filename in training_images:
    copyfile(os.path.join(SOURCE_DIR, filename),
             os.path.join(TRAINING_DIR, filename))
  for filename in validation_images:
    copyfile(os.path.join(SOURCE_DIR, filename),
             os.path.join(VALIDATION_DIR, filename))
In [15]:
# Define paths
CAT_SOURCE_DIR = r'C:\Users\User\Downloads\PetImages\Cat'
DOG_SOURCE_DIR = r'C:\Users\User\Downloads\PetImages\Dog'

TRAINING_DIR = r'C:\Users\User\Downloads\PetImages\cats-v-dogs\training'
VALIDATION_DIR = r'C:\Users\User\Downloads\PetImages\cats-v-dogs\validation'

TRAINING_CATS_DIR = os.path.join(TRAINING_DIR, "cats/")
VALIDATION_CATS_DIR = os.path.join(VALIDATION_DIR, "cats/")

TRAINING_DOGS_DIR = os.path.join(TRAINING_DIR, "dogs/")
VALIDATION_DOGS_DIR = os.path.join(VALIDATION_DIR, "dogs/")

# Empty the four target directories in case we run this cell multiple times.
# (Iterating an empty directory is a no-op, so no emptiness guard is needed.)
for directory in (TRAINING_CATS_DIR, TRAINING_DOGS_DIR,
                  VALIDATION_CATS_DIR, VALIDATION_DOGS_DIR):
  for entry in os.scandir(directory):
    os.remove(entry.path)

# Define proportion of images used for training
split_size = .9

# Run the function
split_data(CAT_SOURCE_DIR, TRAINING_CATS_DIR, VALIDATION_CATS_DIR, split_size)
split_data(DOG_SOURCE_DIR, TRAINING_DOGS_DIR, VALIDATION_DOGS_DIR, split_size)

# Check the number of images copied into each split
print(f"\n\nThere are {len(os.listdir(TRAINING_CATS_DIR))} images of cats for training")
print(f"There are {len(os.listdir(TRAINING_DOGS_DIR))} images of dogs for training")
print(f"There are {len(os.listdir(VALIDATION_CATS_DIR))} images of cats for validation")
print(f"There are {len(os.listdir(VALIDATION_DOGS_DIR))} images of dogs for validation")
666.jpg is zero length, so ignoring.
11702.jpg is zero length, so ignoring.


There are 11250 images of cats for training
There are 11251 images of dogs for training
There are 1250 images of cats for validation
There are 1249 images of dogs for validation

Now that we have successfully organized the data in a way that can be easily fed to Keras' ImageDataGenerator, it is time to code the generators that will yield batches of images, both for training and validation.

The flow_from_directory method allows us to standardize the image sizes by defining a tuple called target_size that will be used to convert each image to this target resolution. We will use a target_size of (150, 150).

In [16]:
# Both generators rescale pixel values from [0, 255] to [0, 1] and yield
# batches of 32 images resized to 150x150 with binary (cat vs dog) labels.
train_datagen = ImageDataGenerator(rescale=1.0 / 255.)

train_generator = train_datagen.flow_from_directory(
    directory=TRAINING_DIR,
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
)

validation_datagen = ImageDataGenerator(rescale=1.0 / 255.)

validation_generator = validation_datagen.flow_from_directory(
    directory=VALIDATION_DIR,
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
)
Found 22499 images belonging to 2 classes.
Found 2499 images belonging to 2 classes.

Creating our model

In [17]:
from tensorflow.keras.optimizers import RMSprop

# Five Conv2D/MaxPooling2D stages progressively shrink the 150x150x3 input,
# then a dense classifier ends in a single sigmoid unit for binary output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(16, (3, 3), activation='relu',
                                 input_shape=(150, 150, 3)))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer=RMSprop(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

Let's check our architecture:

In [18]:
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d (Conv2D)             (None, 148, 148, 16)      448       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 74, 74, 16)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 72, 72, 32)        4640      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 36, 36, 32)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 34, 34, 64)        18496     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 17, 17, 64)       0         
 2D)                                                             
                                                                 
 conv2d_3 (Conv2D)           (None, 15, 15, 64)        36928     
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 7, 7, 64)         0         
 2D)                                                             
                                                                 
 conv2d_4 (Conv2D)           (None, 5, 5, 64)          36928     
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 2, 2, 64)         0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 512)               131584    
                                                                 
 dense_1 (Dense)             (None, 1)                 513       
                                                                 
=================================================================
Total params: 229,537
Trainable params: 229,537
Non-trainable params: 0
_________________________________________________________________

Now, let's train the model

In [19]:
# Train for 15 epochs, validating on the held-out generator after each epoch;
# verbose=1 shows the per-batch progress bar.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=15,
    verbose=1,
)
Epoch 1/15
137/704 [====>.........................] - ETA: 4:13 - loss: 0.6885 - accuracy: 0.5602
C:\Users\User\anaconda3\lib\site-packages\PIL\TiffImagePlugin.py:819: UserWarning: Truncated File Read
  warnings.warn(str(msg))
704/704 [==============================] - 347s 490ms/step - loss: 0.6158 - accuracy: 0.6547 - val_loss: 0.5130 - val_accuracy: 0.7315
Epoch 2/15
704/704 [==============================] - 316s 449ms/step - loss: 0.4992 - accuracy: 0.7600 - val_loss: 0.5260 - val_accuracy: 0.7359
Epoch 3/15
704/704 [==============================] - 315s 448ms/step - loss: 0.4102 - accuracy: 0.8161 - val_loss: 0.3753 - val_accuracy: 0.8215
Epoch 4/15
704/704 [==============================] - 322s 457ms/step - loss: 0.3374 - accuracy: 0.8511 - val_loss: 0.3525 - val_accuracy: 0.8403
Epoch 5/15
704/704 [==============================] - 353s 502ms/step - loss: 0.2799 - accuracy: 0.8802 - val_loss: 0.2865 - val_accuracy: 0.8683
Epoch 6/15
704/704 [==============================] - 355s 504ms/step - loss: 0.2396 - accuracy: 0.9008 - val_loss: 0.3416 - val_accuracy: 0.8699
Epoch 7/15
704/704 [==============================] - 325s 461ms/step - loss: 0.2109 - accuracy: 0.9124 - val_loss: 0.2570 - val_accuracy: 0.8900
Epoch 8/15
704/704 [==============================] - 323s 459ms/step - loss: 0.1877 - accuracy: 0.9220 - val_loss: 0.2660 - val_accuracy: 0.8976
Epoch 9/15
704/704 [==============================] - 338s 480ms/step - loss: 0.1698 - accuracy: 0.9318 - val_loss: 0.2658 - val_accuracy: 0.8984
Epoch 10/15
704/704 [==============================] - 333s 473ms/step - loss: 0.1555 - accuracy: 0.9381 - val_loss: 0.3021 - val_accuracy: 0.8932
Epoch 11/15
704/704 [==============================] - 332s 471ms/step - loss: 0.1424 - accuracy: 0.9464 - val_loss: 0.4271 - val_accuracy: 0.8359
Epoch 12/15
704/704 [==============================] - 323s 458ms/step - loss: 0.1336 - accuracy: 0.9494 - val_loss: 1.5644 - val_accuracy: 0.7163
Epoch 13/15
704/704 [==============================] - 308s 437ms/step - loss: 0.1354 - accuracy: 0.9532 - val_loss: 0.2527 - val_accuracy: 0.9040
Epoch 14/15
704/704 [==============================] - 318s 451ms/step - loss: 0.1234 - accuracy: 0.9565 - val_loss: 0.2924 - val_accuracy: 0.8944
Epoch 15/15
704/704 [==============================] - 333s 473ms/step - loss: 0.1204 - accuracy: 0.9580 - val_loss: 0.3340 - val_accuracy: 0.8735

Let's run the below cell to check the training and validation accuracy achieved at the end of each epoch.

In [20]:
#-----------------------------------------------------------
# Retrieve a list of list results on training and test data
# sets for each training epoch
#-----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))  # Get number of epochs

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
# Series names must be passed via the `label=` keyword — a bare string after
# the format string is parsed as another format string, not a label — and
# plt.legend() is needed to actually display them.
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and validation accuracy')
plt.legend()
plt.show()
print("")

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title('Training and validation loss')
plt.legend()
plt.show()

You will probably encounter that the model is overfitting, which means that it is doing a great job at classifying the images in the training set but struggles with new data.