Source code for flowws_keras_experimental.images.ImagenetDirectory

import functools
import hashlib
import os
import random

import flowws
from flowws import Argument as Arg
import numpy as np
from tensorflow import keras
import PIL

# Registry mapping an augmentation's name to the callable implementing it.
AUGMENTATIONS = {}


def augmentation(f):
    """Register `f` in `AUGMENTATIONS` under its ``__name__`` and return it.

    Meant to be used as a decorator. The callable is handed back unchanged,
    so the decorated name still refers to the original function. Registering
    a second callable with the same name replaces the earlier entry.
    """
    registered_name = f.__name__
    AUGMENTATIONS[registered_name] = f
    return f

@augmentation
def null(img):
    """Return `img` unchanged (the no-op augmentation)."""
    return img

# Make the no-op augmentation available under the name 'id' as well.
AUGMENTATIONS['id'] = null

@augmentation
def scale(img, low=256, high=480):
    """Resize `img` so that its shorter side gets a randomly chosen length.

    The new length of the shorter side is a random integer drawn from
    ``[low, high]`` (both ends inclusive). The longer side is scaled by the
    same factor and truncated to an integer. For a square image both sides
    end up with the drawn length.

    Returns the result of ``img.resize`` with the computed (width, height).
    """
    width, height = img.size
    short_side = random.randint(low, high)

    if height <= width:
        # Landscape or square: the height is the shorter side.
        new_dims = (int(width*short_side/height), short_side)
    else:
        # Portrait: the width is the shorter side.
        new_dims = (short_side, int(height*short_side/width))

    return img.resize(new_dims)

# Alias whose name spells out the default range of `scale`.
AUGMENTATIONS['scale_256_480'] = scale

@augmentation
def crop(img, target=256):
    """Cut a randomly positioned `target` x `target` square out of `img`.

    The position of the square is chosen with two ``random.randint`` draws
    (horizontal offset first, then vertical offset) such that the square
    lies entirely inside the image.

    Raises:
        ValueError: if either side of `img` is shorter than `target`. The
            original code raised the same exception type from inside
            ``random.randint``, but with an unhelpful "empty range" message.
    """
    (w, h) = img.size
    if w < target or h < target:
        raise ValueError(
            'Cannot take a {0}x{0} crop from a {1}x{2} image; rescale the '
            'image first or use a smaller target'.format(target, w, h))

    # PIL puts the origin in the top-left corner and expects boxes as
    # (left, upper, right, lower).
    left = random.randint(0, w - target)
    upper = random.randint(0, h - target)
    return img.crop((left, upper, left + target, upper + target))

# Fixed-size variants, selectable by name.
AUGMENTATIONS['crop_224'] = functools.partial(crop, target=224)
AUGMENTATIONS['crop_256'] = functools.partial(crop, target=256)

@augmentation
def keras_preprocess(img):
    """Convert `img` to a numpy array and apply Keras' ImageNet preprocessing.

    The array produced by ``np.asarray(img)`` is passed to
    ``keras.applications.imagenet_utils.preprocess_input`` with no further
    arguments, i.e. with that function's default settings, and its return
    value is returned as-is.
    """
    preprocess = keras.applications.imagenet_utils.preprocess_input
    return preprocess(np.asarray(img))

@augmentation
def maybe_flip(img):
    """Randomly mirror `img` left-to-right, or return it untouched.

    One ``random.random()`` draw decides: a value of at least 0.5 yields the
    horizontally flipped image, anything below returns `img` itself.
    """
    if random.random() < .5:
        return img
    return img.transpose(PIL.Image.FLIP_LEFT_RIGHT)

def split_filenames(directory, label_names, validation_fraction=.3, base=int(1e21)):
    """Deterministically split the files below `directory` into two lists.

    For every label in `label_names`, the entries of ``directory/label`` are
    listed and each one is assigned to a split based on the SHA-1 hash of the
    label name followed by the file name. The assignment therefore depends
    only on those two names, not on any random state, and a given file always
    lands in the same split.

    Args:
        directory: directory containing one sub-directory per label.
        label_names: names of the label sub-directories to scan.
        validation_fraction: approximate fraction of files sent to the second
            (validation) list. A negative value sends every file to the
            first list, because the reduced hash is never negative.
        base: modulus applied to the hash before comparing it against
            ``int(validation_fraction*base)``.

    Returns:
        A ``(train, val)`` tuple of lists of ``(label_name, filename)``
        pairs. Within each label the file names appear in sorted order.
    """
    train, val = [], []
    thresh = int(validation_fraction*base)
    for label_name in label_names:
        # Hash the label once; each file continues from a copy of this state.
        label_hash = hashlib.sha1(label_name.encode())
        dirname = os.path.join(directory, label_name)
        # os.listdir returns entries in arbitrary order. Sorting keeps the
        # order of the returned lists identical across filesystems, so that
        # seeded sampling from them is reproducible. Membership of the two
        # splits is unaffected.
        for fname in sorted(os.listdir(dirname)):
            hasher = label_hash.copy()
            hasher.update(fname.encode())
            bucket = int(hasher.hexdigest(), base=16) % base
            if bucket > thresh:
                train.append((label_name, fname))
            else:
                val.append((label_name, fname))
    return train, val

def batch_generator(directory_base, labeled_files, label_map, batch_size=32,
                    augmentations=()):
    """Yield an endless stream of randomly sampled (images, labels) batches.

    Every batch is assembled by drawing `batch_size` entries from
    `labeled_files` with ``random.choices`` (i.e. with replacement), loading
    each image as RGB and passing it through `augmentations` in order.

    Args:
        directory_base: directory containing one sub-directory per label.
        labeled_files: sequence of ``(label_name, filename)`` pairs.
        label_map: mapping from label name to the value emitted as label.
        batch_size: number of samples per batch.
        augmentations: iterable of callables, each applied to the output of
            the previous one. It is iterated once per image, so it must be
            re-iterable (a list or tuple, not a one-shot iterator).

    Yields:
        ``(xs, ys)`` where both are built with ``np.array`` from the
        per-sample images and labels.
        NOTE(review): stacking presumably requires the augmentations to
        produce equally shaped outputs; confirm against the callers.
    """
    # NOTE(review): the `PIL.Image` lookup below needs the PIL.Image
    # submodule to be loaded; a bare `import PIL` does not load it by
    # itself. Confirm something else in the process imports it first.
    while True:
        xs, ys = [], []
        for (label_name, filename) in random.choices(labeled_files, k=batch_size):
            path = os.path.join(directory_base, label_name, filename)
            ys.append(label_map[label_name])
            # convert() returns an independent image, so the file can be
            # closed deterministically as soon as the conversion is done.
            with PIL.Image.open(path) as source:
                x = source.convert('RGB')
            for aug in augmentations:
                x = aug(x)
            pixels = np.asarray(x)
            if pixels.ndim < 3:
                # Diagnostic output for samples that lost their channel axis.
                print(label_name, path)
            xs.append(pixels)

        yield np.array(xs), np.array(ys)

@flowws.add_stage_arguments
class ImagenetDirectory(flowws.Stage):
    """Load ImageNet images from a specified directory.

    The base directory is expected to contain a ``train`` and a ``val``
    sub-directory, each holding one sub-directory per label. Files under
    ``train`` are split into a training and a validation set; files under
    ``val`` are used as the test set. Label names are the sorted entries of
    the ``train`` directory.
    """

    ARGS = [
        Arg('base', '-b', str,
            help='Base directory storing images'),
        Arg('validation_fraction', '-v', float, .3,
            help='Fraction of files to be used in validation set'),
        Arg('augmentations', '-a', [str],
            help='Names of augmentations to perform on each image '
                 '(use "null" for none)'),
        Arg('batch_size', None, int, 32,
            help='Batch size for training and validation'),
        Arg('train_epoch_scaling', None, float, 1.,
            help='Factor to scale the number of batches considered to be '
                 'part of an epoch by (train set)'),
        Arg('val_epoch_scaling', None, float, 1.,
            help='Factor to scale the number of batches considered to be '
                 'part of an epoch by (validation set)'),
        Arg('test_epoch_scaling', None, float, 1.,
            help='Factor to scale the number of batches considered to be '
                 'part of an epoch by (test set)'),
    ]

    def run(self, scope, storage):
        """Create the data generators and publish them in `scope`.

        Sets the following `scope` entries: ``label_names``, ``label_map``,
        ``train_generator``, ``generator_train_steps``,
        ``validation_generator``, ``generator_val_steps``,
        ``test_generator``, ``generator_test_steps``, ``loss`` and
        ``num_classes``. `storage` is not used.
        """
        batch_size = self.arguments['batch_size']
        train_dir = os.path.join(self.arguments['base'], 'train')
        test_dir = os.path.join(self.arguments['base'], 'val')

        label_names = sorted(os.listdir(train_dir))
        label_map = {label: i for (i, label) in enumerate(label_names)}

        train_files, val_files = split_filenames(
            train_dir, label_names, self.arguments['validation_fraction'])
        # A validation fraction of -1 keeps every file in the first list.
        test_files, _ = split_filenames(test_dir, label_names, -1)

        # An empty/unset augmentation list selects the default pipeline.
        augmentation_names = (self.arguments['augmentations'] or
                              ['scale', 'crop', 'maybe_flip', 'keras_preprocess'])
        augmentations = [AUGMENTATIONS[name] for name in augmentation_names]

        train_generator = batch_generator(
            train_dir, train_files, label_map, batch_size, augmentations)
        # Validation files live in the train directory as well.
        val_generator = batch_generator(
            train_dir, val_files, label_map, batch_size, augmentations)
        test_generator = batch_generator(
            test_dir, test_files, label_map, batch_size, augmentations)

        # The scaling factors are floats, so each product is a float; all
        # three step counts are truncated to int so they are whole numbers.
        steps_per_epoch = int(len(train_files)//batch_size*
                              self.arguments['train_epoch_scaling'])
        validation_steps = int(len(val_files)//batch_size*
                               self.arguments['val_epoch_scaling'])
        test_steps = int(len(test_files)//batch_size*
                         self.arguments['test_epoch_scaling'])

        scope['label_names'] = label_names
        scope['label_map'] = label_map
        scope['train_generator'] = train_generator
        scope['generator_train_steps'] = steps_per_epoch
        scope['validation_generator'] = val_generator
        scope['generator_val_steps'] = validation_steps
        scope['test_generator'] = test_generator
        scope['generator_test_steps'] = test_steps
        scope['loss'] = 'sparse_categorical_crossentropy'
        scope['num_classes'] = len(label_names)