From d98e4c6c0ddfac38e7f3b99a4b6a9030259fcf4d Mon Sep 17 00:00:00 2001 From: Manan Shah Date: Wed, 8 Mar 2023 10:04:57 +0530 Subject: [PATCH 1/2] added option to symlink (`ln -s`) --- README.md | 7 +++++-- splitfolders/cli.py | 13 +++++++++++-- splitfolders/split.py | 42 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 4da3df1..be9c076 100644 --- a/README.md +++ b/README.md @@ -97,8 +97,10 @@ Occasionally, you may have things that comprise more than a single file (e.g. pi Set `group_prefix` to the length of the group (e.g. `2`). But now _all_ files should be part of groups. -Set `move=True` if you want to move the files instead of copying. - +Set +- `move=True` or `move='move'` if you want to move the files instead of copying. +- `move=False` or `move='copy'` if you want to copy the files. (default behavior) +- `move='symlink'` if you want to symlink the files (i.e. create shortcuts, like `ln -s`) instead of copying. ### CLI ``` @@ -114,6 +116,7 @@ Options: --oversample enable oversampling of imbalanced datasets, works only with --fixed. --group_prefix split files into equally-sized groups based on their prefix --move move the files instead of copying + --symlink symlink(create shortcut) the files instead of copying Example: splitfolders --ratio .8 .1 .1 -- folder_with_images ``` diff --git a/splitfolders/cli.py b/splitfolders/cli.py index 2645660..4b9f0cf 100644 --- a/splitfolders/cli.py +++ b/splitfolders/cli.py @@ -5,7 +5,7 @@ def run(): parser = argparse.ArgumentParser( - description="Split folders with files (e.g. images) into training, validation and test(dataset) folders." + description="Split folders with files (e.g. images) by copying them into training, validation and test(dataset) folders." 
) parser.add_argument( "--output", @@ -41,11 +41,17 @@ def run(): default=None, help="split files into equally-sized groups based on their prefix", ) - parser.add_argument( + group = parser.add_mutually_exclusive_group() + group.add_argument( "--move", action="store_true", help="move the files instead of copying", ) + group.add_argument( + "--symlink", + action="store_true", + help="symlink(create shortcut) the files instead of copying", + ) parser.add_argument( "input", help="directory with the input data. The directory needs to have the labels as sub-directories. In those sub-directories are then the actual files that gets split.", @@ -53,6 +59,9 @@ def run(): args = parser.parse_args() + if args.symlink: + args.move = 'symlink' + if args.ratio: ratio( args.input, args.output, args.seed, args.ratio, args.group_prefix, args.move diff --git a/splitfolders/split.py b/splitfolders/split.py index fef8972..bfda1c2 100644 --- a/splitfolders/split.py +++ b/splitfolders/split.py @@ -35,7 +35,7 @@ from pathlib import Path import random import shutil -from os import path +from os import path, symlink from .utils import list_dirs, list_files @@ -150,8 +150,22 @@ def fixed( if use_tqdm: iteration = tqdm(iteration, desc="Oversampling", unit=" classes") - copy_fun = shutil.move if move else shutil.copy2 + if move == 'move' or move is True: + copy_fun = shutil.move + elif move == 'copy' or move is False: + copy_fun = shutil.copy2 + else: + copy_fun = symlink + def copyer(f_orig, f_dest): + if isinstance(move, bool) or move == 'move' or move == 'copy': + copy_fun(str(f_orig), str(f_dest)) + else: + try: + copy_fun(f_orig.resolve(), f_dest.resolve()) + except FileExistsError: + pass + for num_items, class_dir in iteration: class_name = path.split(class_dir)[1] full_path = path.join(output, "train", class_name) @@ -169,7 +183,8 @@ def fixed( for f_orig in f_chosen: new_name = f_orig.stem + "_" + str(i) + f_orig.suffix f_dest = f_orig.with_name(new_name) - copy_fun(str(f_orig), 
str(f_dest)) + print(f'{f_orig}, {f_orig.stem}, {f_orig.suffix}, {f_dest}') + copyer(f_orig, f_dest) def group_by_prefix(files, len_pairs): @@ -296,8 +311,23 @@ def copy_files(files_type, class_dir, output, prog_bar, move): Copies the files from the input folder to the output folder """ - copy_fun = shutil.move if move else shutil.copy2 + if move == 'move' or move is True: + copy_fun = shutil.move + elif move == 'copy' or move is False: + copy_fun = shutil.copy2 + else: + copy_fun = symlink + def copyer(base_file, full_path): + if isinstance(move, bool) or move == 'move' or move == 'copy': + copy_fun(str(base_file), str(full_path)) + else: + try: + copy_fun(base_file.resolve(), path.join(full_path, + path.split(Path(base_file))[1])) + except FileExistsError: + pass + # get the last part within the file class_name = path.split(class_dir)[1] for (files, folder_type) in files_type: @@ -309,6 +339,6 @@ def copy_files(files_type, class_dir, output, prog_bar, move): prog_bar.update() if type(f) == tuple: for x in f: - copy_fun(str(x), str(full_path)) + copyer(x, full_path) else: - copy_fun(str(f), str(full_path)) + copyer(f, full_path) From d00b3a2c9542bc00c23ef8f3e021366c596fbaa3 Mon Sep 17 00:00:00 2001 From: Manan Shah Date: Wed, 8 Mar 2023 10:37:05 +0530 Subject: [PATCH 2/2] remove debugging print statement --- splitfolders/split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/splitfolders/split.py b/splitfolders/split.py index bfda1c2..ceeb9c2 100644 --- a/splitfolders/split.py +++ b/splitfolders/split.py @@ -183,7 +183,6 @@ def copyer(f_orig, f_dest): for f_orig in f_chosen: new_name = f_orig.stem + "_" + str(i) + f_orig.suffix f_dest = f_orig.with_name(new_name) - print(f'{f_orig}, {f_orig.stem}, {f_orig.suffix}, {f_dest}') copyer(f_orig, f_dest)