Added script for converting a folder-based dataset to an iNat style dataset

This commit is contained in:
Marcel Simon 2019-03-08 00:25:35 +00:00
Parent 847a1cd5da
Commit 6b61819e71
1 changed files with 145 additions and 0 deletions

View file

@@ -0,0 +1,145 @@
# This script was used to create the extended iNat dataset.
import json
import numpy as np
import PIL.Image
import random
import glob
import os
import tqdm
import sys
import shutil
import argparse
def main():
    """Convert a folder-per-class image dataset into iNat 2017 style output.

    Parses command-line arguments, creates the output directory, delegates the
    actual conversion to ``add_new_categories``, and finally writes
    ``training.json`` and ``testing.json`` into the output directory.
    """
    parser = argparse.ArgumentParser('Script for converting a folder-based classification dataset into iNat 2017 format. ' + \
        'Your dataset should be organized by folder, e.g. you need a separate subfolder for each class and '+\
        'within this folder should be all the images of the corresponding class.')
    parser.add_argument("input_directory", type=str, metavar='IMAGE_ROOT',
                        help='Path to the root of your image collection. Each subfolder is treated as a separate class. ' + \
                             'For example, if you set this to "/path/to/images", then images of class "myfirstclass" should be in the ' + \
                             'folder "path/to/images/myfirstclass/", images of class "mysecondclass" should be in the folder ' + \
                             '"/path/to/images/mysecondclass" and so on')
    parser.add_argument("output_directory", type=str, metavar='OUTPUT_PATH',
                        help='The folder where you want to output be written to.')
    parser.add_argument("--test_proportion", type=float, default=0.3, metavar='0.3',
                        help='Proportion of images used for testing.')
    args = parser.parse_args()

    print('\n-----------------------')
    print('IMPORTANT: please make sure that there are only images in your folder tree.')
    print('-----------------------\n')

    OUTPUT_DIR = args.output_directory
    # Converted images end up in OUTPUT_DIR/<supercategory>/<classname>/ --
    # see add_new_categories.
    OUTPUT_TRAIN_JSON = os.path.join(OUTPUT_DIR, 'training.json')
    OUTPUT_VAL_JSON = os.path.join(OUTPUT_DIR, 'testing.json')

    # Create the output directory; warn (but continue) if it already exists.
    # Only FileExistsError is swallowed -- permission or path errors still
    # propagate instead of being hidden by a bare except.
    try:
        os.makedirs(OUTPUT_DIR)
    except FileExistsError:
        print('Output folder {} already exists.'.format(OUTPUT_DIR))

    # Empty iNat2017-style json skeletons; filled in place below.
    train_json = dict(info='', images=[], categories=[], annotations=[], licenses=[])
    val_json = dict(info='', images=[], categories=[], annotations=[], licenses=[])
    add_new_categories(train_json, val_json, os.path.join(args.input_directory, '*'), args.test_proportion, OUTPUT_DIR)

    # Finished, writing json
    with open(OUTPUT_TRAIN_JSON, 'wt') as tj:
        json.dump(train_json, tj)
    with open(OUTPUT_VAL_JSON, 'wt') as tj:
        json.dump(val_json, tj)
def add_new_categories(train_js, val_js, folder, val_prob, OUTPUT_DIR):
    """Add all classes found under ``folder`` to the train/val json structures.

    Parameters
    ----------
    train_js, val_js : dict
        iNat2017-style json structures (keys: info, images, categories,
        annotations, licenses). Both are modified in place and must be
        identical except for the fields 'images' and 'annotations'.
    folder : str
        Glob pattern, e.g. 'myfolder/*'. Class names are derived from all
        immediate subdirectories matched by the pattern, and all files
        located in these subdirectories are used as class images.
    val_prob : float
        Probability that an image is assigned to the validation (testing)
        json; all other images are added to the training json.
    OUTPUT_DIR : str
        Root folder to which images are copied, as
        OUTPUT_DIR/<supercategory>/<classname>/<filename>.
    """
    random.seed(0)  # deterministic train/val split across runs
    print('Working on {}'.format(folder))

    # Reuse the 'unknown' license if already registered, otherwise add it with
    # a fresh id to both json structures.
    if 'unknown' in [lic['name'] for lic in train_js['licenses']]:
        lic_id = [lic['id'] for lic in train_js['licenses'] if lic['name'] == 'unknown'][0]
    else:
        lic_id = max([-1] + [lic['id'] for lic in train_js['licenses']]) + 1
        train_js['licenses'].append(dict(url='unknown', id=lic_id, name='unknown'))
        val_js['licenses'].append(train_js['licenses'][-1])

    # Every immediate subdirectory becomes a category (unless already known).
    new_classes_dirs = sorted(glob.glob(folder))
    new_classes_dirs = [d for d in new_classes_dirs if os.path.isdir(d)]
    classnames = [os.path.split(longpath)[-1] for longpath in new_classes_dirs]
    next_cat_id = max([-1] + [cat['id'] for cat in train_js['categories']]) + 1
    existing_classes = set(cat['name'] for cat in train_js['categories'])
    for new_cat_name in classnames:
        if new_cat_name not in existing_classes:
            # Flat hierarchy: every class is filed under one supercategory.
            target_superclass = 'Entity'
            train_js['categories'].append({'id': next_cat_id,
                                           'name': new_cat_name,
                                           'supercategory': target_superclass})
            val_js['categories'].append(train_js['categories'][-1])
            next_cat_id = next_cat_id + 1

    # Add images and annotations. Ids continue after the current maximum over
    # BOTH json structures, so this function can be called repeatedly.
    # This assumes all annotations in train_js and val_js are equivalent
    # except for the field 'images'.
    max_im_id = max([-1] + [im['id'] for im in train_js['images']] + [im['id'] for im in val_js['images']])
    max_an_id = max([-1] + [an['id'] for an in train_js['annotations']] + [an['id'] for an in val_js['annotations']])
    cat_to_id = {cat['name']: cat['id'] for cat in train_js['categories']}
    for classname, classfolder in tqdm.tqdm(list(zip(classnames, new_classes_dirs))):
        cat_id = cat_to_id[classname]
        classimages = sorted(glob.glob(os.path.join(classfolder, '*')))
        target_class = os.path.split(classfolder)[1]
        target_superclass = 'Entity'
        target_folder = os.path.join(OUTPUT_DIR, target_superclass, target_class)
        os.makedirs(target_folder, exist_ok=True)
        for classimage in classimages:
            try:
                # Open the image exactly once: the previous version called
                # PIL.Image.open up to three times per file, leaking handles.
                with PIL.Image.open(classimage) as im:
                    width, height = im.size
                    if im.mode != 'RGB':
                        # NOTE: overwrites the *source* file with an RGB copy
                        # before it is copied into the output tree.
                        im.convert('RGB').save(classimage)
                # Reading the image comes first so a failed load does not
                # mess up the IDs.
                max_im_id = max_im_id + 1
                max_an_id = max_an_id + 1
                next_im_id = max_im_id
                next_an_id = max_an_id
                # Random assignment to the validation or training split.
                js = val_js if random.random() < val_prob else train_js
                # Copy the image into the output folder (skip if present).
                target_file = os.path.join(target_folder, os.path.split(classimage)[1])
                if not os.path.exists(target_file):
                    shutil.copy(os.path.abspath(classimage), target_file)
                else:
                    print('File / Symlink already exists: '+ target_file)
                js['images'].append(dict(id=next_im_id,
                                         width=width,
                                         height=height,
                                         file_name=target_file,
                                         license=lic_id,
                                         rights_holder=''))
                js['annotations'].append(dict(id=next_an_id,
                                              image_id=next_im_id,
                                              category_id=cat_id))
            except IOError:
                print('Cannot read image {}'.format(classimage))
                # WARNING: destructive -- deletes the unreadable source file.
                os.remove(classimage)
            except Exception:
                # Anything else is unexpected: report the file and re-raise.
                print('Cannot read image {}'.format(classimage))
                raise
# Script entry point: only run the conversion when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()