import os
import zipfile
import random
import shutil
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
import matplotlib.pyplot as plt
Download the dataset from its original source by running the cell below.
Note that the zip file that contains the images is unzipped under the /tmp directory.
# If the URL doesn't work, visit https://www.microsoft.com/en-us/download/confirmation.aspx?id=54765
# And right click on the 'Download Manually' link to get a new URL to the dataset
# Note: This is a very large dataset and will take some time to download
!wget --no-check-certificate \
"https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip" \
-O "/tmp/cats-and-dogs.zip"
local_zip = '/tmp/cats-and-dogs.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('/tmp')
Now the images are stored within the /tmp/PetImages directory. There is a subdirectory for each class, so one for dogs and one for cats.
source_path = '/tmp/PetImages'
source_path_dogs = os.path.join(source_path, 'Dog')
source_path_cats = os.path.join(source_path, 'Cat')
# os.listdir returns a list containing all files under the given path
print(f"There are {len(os.listdir(source_path_dogs))} images of dogs.")
print(f"There are {len(os.listdir(source_path_cats))} images of cats.")
Expected Output:
There are 12501 images of dogs.
There are 12501 images of cats.
You will need a directory for cats-v-dogs, with subdirectories for training and validation. These in turn will need subdirectories for 'cats' and 'dogs'. The cell below builds this directory tree:
# Define root directory
root_dir = '/tmp/cats-v-dogs'

# Empty the directory to prevent a FileExistsError if the cell is run several times
if os.path.exists(root_dir):
    shutil.rmtree(root_dir)

try:
    traindir = "training"
    valdir = "validation"
    catsdir = "cats"
    dogsdir = "dogs"
    training = os.path.join(root_dir, traindir)
    validation = os.path.join(root_dir, valdir)
    os.makedirs(root_dir)
    os.makedirs(training)
    os.makedirs(validation)
    os.makedirs(os.path.join(training, catsdir))
    os.makedirs(os.path.join(training, dogsdir))
    os.makedirs(os.path.join(validation, catsdir))
    os.makedirs(os.path.join(validation, dogsdir))
except FileExistsError:
    print("You should not be seeing this since the root directory is removed beforehand")
# Check your folder structure
for rootdir, dirs, files in os.walk(root_dir):
    for subdir in dirs:
        print(os.path.join(rootdir, subdir))
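Expected Output (the ordering of the directories may vary):
/tmp/cats-v-dogs/training
/tmp/cats-v-dogs/validation
/tmp/cats-v-dogs/training/cats
/tmp/cats-v-dogs/training/dogs
/tmp/cats-v-dogs/validation/cats
/tmp/cats-v-dogs/validation/dogs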
The files will be randomized, so that the training set is a random sample of the files and the validation set is made up of the remaining files. split_size will be the proportion of images that goes to the training set.
The images are also checked as they are copied: if a file has zero length, it is skipped and a message such as "filename is zero length, so ignoring." is printed, so only valid images end up in the training and validation directories.
# GRADED FUNCTION: split_data
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE):
    """
    Splits the data into train and validation sets

    Args:
        SOURCE_DIR (string): directory path containing the images
        TRAINING_DIR (string): directory path to be used for training
        VALIDATION_DIR (string): directory path to be used for validation
        SPLIT_SIZE (float): proportion of the dataset to be used for training

    Returns:
        None
    """
    images = os.listdir(SOURCE_DIR)

    # Round the training count up so validation receives the remainder
    training_count = int(len(images) * SPLIT_SIZE) + 1

    # Take a random sample for training; everything else goes to validation
    training_images = random.sample(images, training_count)
    training_set = set(training_images)
    validation_images = [image for image in images if image not in training_set]

    # Copy the files, skipping (and reporting) any with zero length
    for image in training_images:
        source = os.path.join(SOURCE_DIR, image)
        if os.path.getsize(source) == 0:
            print(f"{image} is zero length, so ignoring.")
        else:
            copyfile(source, os.path.join(TRAINING_DIR, image))

    for image in validation_images:
        source = os.path.join(SOURCE_DIR, image)
        if os.path.getsize(source) == 0:
            print(f"{image} is zero length, so ignoring.")
        else:
            copyfile(source, os.path.join(VALIDATION_DIR, image))
# Define paths
CAT_SOURCE_DIR = "/tmp/PetImages/Cat/"
DOG_SOURCE_DIR = "/tmp/PetImages/Dog/"
TRAINING_DIR = "/tmp/cats-v-dogs/training/"
VALIDATION_DIR = "/tmp/cats-v-dogs/validation/"
TRAINING_CATS_DIR = os.path.join(TRAINING_DIR, "cats/")
VALIDATION_CATS_DIR = os.path.join(VALIDATION_DIR, "cats/")
TRAINING_DOGS_DIR = os.path.join(TRAINING_DIR, "dogs/")
VALIDATION_DOGS_DIR = os.path.join(VALIDATION_DIR, "dogs/")
# Empty the directories in case you run this cell multiple times
for dir_path in [TRAINING_CATS_DIR, TRAINING_DOGS_DIR,
                 VALIDATION_CATS_DIR, VALIDATION_DOGS_DIR]:
    for file in os.scandir(dir_path):
        os.remove(file.path)
# Define proportion of images used for training
split_size = .9
# Run the function
split_data(CAT_SOURCE_DIR, TRAINING_CATS_DIR, VALIDATION_CATS_DIR, split_size)
split_data(DOG_SOURCE_DIR, TRAINING_DOGS_DIR, VALIDATION_DOGS_DIR, split_size)
# Check the number of images in each directory after the split
print(f"\n\nThere are {len(os.listdir(TRAINING_CATS_DIR))} images of cats for training")
print(f"There are {len(os.listdir(TRAINING_DOGS_DIR))} images of dogs for training")
print(f"There are {len(os.listdir(VALIDATION_CATS_DIR))} images of cats for validation")
print(f"There are {len(os.listdir(VALIDATION_DOGS_DIR))} images of dogs for validation")
Now that the data is organized in a way that can be easily fed to Keras' ImageDataGenerator, it is time to code the generators that will yield batches of images, both for training and validation.
The flow_from_directory method allows us to standardize the image sizes via its target_size argument, a tuple giving the resolution every image is converted to. We will use a target_size of (150, 150).
# Instantiate the ImageDataGenerator class, setting the arguments to augment the images
train_datagen = ImageDataGenerator(rescale=1.0/255,
                                   rotation_range=40,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True,
                                   fill_mode='nearest')

train_generator = train_datagen.flow_from_directory(directory=TRAINING_DIR,
                                                    batch_size=32,
                                                    class_mode='binary',
                                                    target_size=(150, 150))

# Instantiate the ImageDataGenerator class for validation (rescaling only, no augmentation)
validation_datagen = ImageDataGenerator(rescale=1.0/255)

validation_generator = validation_datagen.flow_from_directory(directory=VALIDATION_DIR,
                                                              batch_size=32,
                                                              class_mode='binary',
                                                              target_size=(150, 150))
Expected Output:
Found 22498 images belonging to 2 classes.
Found 2500 images belonging to 2 classes.
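As an optional sanity check (not part of the original assignment), you can pull one batch from the training generator to confirm it yields what the settings promise:
sample_images, sample_labels = next(train_generator)
# With the settings above, images should be (32, 150, 150, 3)
# and labels a 1-D array of 0s and 1s (class_mode='binary')
print(sample_images.shape)
print(sample_labels[:10])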
# GRADED FUNCTION: create_model
from tensorflow.keras.optimizers import RMSprop

def create_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(150, 150, 3)),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(32, (3,3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=RMSprop(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

model = create_model()
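Before training, it can help to confirm the architecture with Keras' built-in summary; the final Dense layer should report an output shape of (None, 1) for binary classification:
# Print layer output shapes and parameter counts
model.summary()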
history = model.fit(train_generator,
                    epochs=15,
                    verbose=1,
                    validation_data=validation_generator)
Retrieve the accuracy and loss recorded on the training and validation sets for each training epoch, then plot the results.
#-----------------------------------------------------------
# Retrieve a list of results on training and validation data
# sets for each training epoch
#-----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc))  # Get number of epochs
#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.plot(epochs, acc, 'r', label='Training Accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation Accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.show()
print("")
#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
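As an optional follow-up (not part of the original notebook), the same history lists can be used to report the best validation accuracy reached during training:
# val_acc is a plain Python list, so max() and index() work directly
best_epoch = val_acc.index(max(val_acc))
print(f"Best validation accuracy: {max(val_acc):.4f} at epoch {best_epoch + 1}")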