diff --git a/tensorflow_datasets/scripts/cli/convert_format_utils.py b/tensorflow_datasets/scripts/cli/convert_format_utils.py
index 8c03e00500f..d5afad23fdc 100644
--- a/tensorflow_datasets/scripts/cli/convert_format_utils.py
+++ b/tensorflow_datasets/scripts/cli/convert_format_utils.py
@@ -280,6 +280,49 @@ def _remove_incomplete_files(path: epath.Path) -> None:
   logging.info('Removed %d incomplete files.', num_incomplete_files)
 
 
+def _get_info_for_dirs_to_convert(
+    from_dir: epath.Path,
+    to_dir: epath.Path,
+    out_file_format: file_adapters.FileFormat,
+    overwrite: bool,
+) -> dataset_info.DatasetInfo | None:
+  """Returns the dataset info, or None if conversion should be skipped."""
+  builder = read_only_builder_lib.builder_from_directory(from_dir)
+  if out_file_format == builder.info.file_format:
+    raise ValueError(
+        f'The file format of the dataset ({builder.info.file_format}) is the'
+        f' same as the specified out file format! ({out_file_format})'
+    )
+  if out_file_format in builder.info.alternative_file_formats:
+    if overwrite:
+      logging.warning(
+          'The file format to convert to (%s) is already an alternative file'
+          ' format. Overwriting the shards!',
+          out_file_format.value,
+      )
+      return builder.info
+    elif os.fspath(from_dir) == os.fspath(to_dir):
+      logging.info(
+          'The file format to convert to (%s) is already an alternative file'
+          ' format of the dataset in %s. Skipping conversion.',
+          out_file_format.value,
+          os.fspath(from_dir),
+      )
+      # TODO(weide) add check whether data files are actually present.
+      return None
+    else:
+      logging.warning(
+          'The file format to convert to (%s) is already an alternative file'
+          ' format, but the converted output is being written to a different'
+          ' folder, so the shards will be converted anyway. From: %s, to: %s',
+          out_file_format.value,
+          os.fspath(from_dir),
+          os.fspath(to_dir),
+      )
+      return builder.info
+  return builder.info
+
+
 def _convert_dataset_dirs(
     from_to_dirs: Mapping[epath.Path, epath.Path],
     out_file_format: file_adapters.FileFormat,
@@ -303,36 +345,16 @@ def _convert_dataset_dirs(
   logging.info('Converting %d datasets.', len(from_to_dirs))
 
   found_dataset_versions: dict[epath.Path, dataset_info.DatasetInfo] = {}
+  # TODO(weide) parallelize this, because it's slow for dirs with many datasets.
   for from_dir, to_dir in from_to_dirs.items():
-    builder = read_only_builder_lib.builder_from_directory(from_dir)
-    if out_file_format == builder.info.file_format:
-      raise ValueError(
-          f'The file format of the dataset ({builder.info.file_format}) is the'
-          f' same as the specified out file format! ({out_file_format})'
-      )
-    if out_file_format in builder.info.alternative_file_formats:
-      if overwrite:
-        logging.warning(
-            'The file format to convert to (%s) is already an alternative file'
-            ' format. Overwriting the shards!',
-            out_file_format.value,
-        )
-      elif from_dir == to_dir:
-        logging.info(
-            'The file format to convert to (%s) is already an alternative file'
-            ' format of the dataset in %s. Skipping conversion.',
-            os.fspath(from_dir),
-            out_file_format.value,
-        )
-        continue
-      else:
-        logging.warning(
-            'The file format to convert to (%s) is already an alternative file'
-            ' format, but the converted output is being written to a different'
-            ' folder, so the shards will be converted anyway.',
-            out_file_format.value,
-        )
-    found_dataset_versions[from_dir] = builder.info
+    info = _get_info_for_dirs_to_convert(
+        from_dir=from_dir,
+        to_dir=to_dir,
+        out_file_format=out_file_format,
+        overwrite=overwrite,
+    )
+    if info is not None:
+      found_dataset_versions[from_dir] = info
 
   convert_dataset_fn = functools.partial(
       _convert_dataset,
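
For reference, the refactored helper encodes a small decision table: requesting the dataset's primary format is an error; a format already present as an alternative is overwritten in place, skipped, or re-converted depending on overwrite and on whether source and destination directories coincide; any other format is simply converted. A minimal, self-contained Python sketch of that table follows; should_convert and its boolean parameters are hypothetical stand-ins for the real builder and path checks, not part of the diff:

# Hypothetical stand-in mirroring the branches of _get_info_for_dirs_to_convert.
# Returns True when shards should be (re)written and False when conversion is
# skipped; raises for the invalid same-format case.
def should_convert(
    same_format: bool,          # out_file_format == builder.info.file_format
    already_alternative: bool,  # out_file_format in alternative_file_formats
    overwrite: bool,
    same_dir: bool,             # os.fspath(from_dir) == os.fspath(to_dir)
) -> bool:
  if same_format:
    raise ValueError('Output format equals the dataset file format.')
  if already_alternative:
    if overwrite:
      return True   # overwrite the existing alternative shards
    elif same_dir:
      return False  # shards are already in place; skip conversion
    else:
      return True   # different output folder; convert anyway
  return True        # format not present yet; convert (the fallthrough return)


assert should_convert(False, True, overwrite=False, same_dir=True) is False
assert should_convert(False, True, overwrite=True, same_dir=True) is True
assert should_convert(False, False, overwrite=False, same_dir=False) is True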