Add more detailed logging in convert_format_utils for when from and to dirs are the same #5661

Merged 1 commit on Oct 15, 2024
tensorflow_datasets/scripts/cli/convert_format_utils.py (80 changes: 51 additions & 29 deletions)

@@ -280,6 +280,48 @@ def _remove_incomplete_files(path: epath.Path) -> None:
   logging.info('Removed %d incomplete files.', num_incomplete_files)
 
 
+def _get_info_for_dirs_to_convert(
+    from_dir: epath.Path,
+    to_dir: epath.Path,
+    out_file_format: file_adapters.FileFormat,
+    overwrite: bool,
+) -> dataset_info.DatasetInfo | None:
+  """Returns the dataset info for the given dataset dirs."""
+  builder = read_only_builder_lib.builder_from_directory(from_dir)
+  if out_file_format == builder.info.file_format:
+    raise ValueError(
+        f'The file format of the dataset ({builder.info.file_format}) is the'
+        f' same as the specified out file format! ({out_file_format})'
+    )
+  if out_file_format in builder.info.alternative_file_formats:
+    if overwrite:
+      logging.warning(
+          'The file format to convert to (%s) is already an alternative file'
+          ' format. Overwriting the shards!',
+          out_file_format.value,
+      )
+      return builder.info
+    elif os.fspath(from_dir) == os.fspath(to_dir):
+      logging.info(
+          'The file format to convert to (%s) is already an alternative file'
+          ' format of the dataset in %s. Skipping conversion.',
+          out_file_format.value,
+          os.fspath(from_dir),
+      )
+      # TODO(weide) add check whether data files are actually present.
+      return None
+    else:
+      logging.warning(
+          'The file format to convert to (%s) is already an alternative file'
+          ' format, but the converted output is being written to a different'
+          ' folder, so the shards will be converted anyway. From: %s, to: %s',
+          out_file_format.value,
+          os.fspath(from_dir),
+          os.fspath(to_dir),
+      )
+  return builder.info
+
+
 def _convert_dataset_dirs(
     from_to_dirs: Mapping[epath.Path, epath.Path],
     out_file_format: file_adapters.FileFormat,
@@ -303,36 +345,16 @@ def _convert_dataset_dirs(
   logging.info('Converting %d datasets.', len(from_to_dirs))
 
   found_dataset_versions: dict[epath.Path, dataset_info.DatasetInfo] = {}
+  # TODO(weide) parallelize this, because it's slow for dirs with many datasets.
   for from_dir, to_dir in from_to_dirs.items():
-    builder = read_only_builder_lib.builder_from_directory(from_dir)
-    if out_file_format == builder.info.file_format:
-      raise ValueError(
-          f'The file format of the dataset ({builder.info.file_format}) is the'
-          f' same as the specified out file format! ({out_file_format})'
-      )
-    if out_file_format in builder.info.alternative_file_formats:
-      if overwrite:
-        logging.warning(
-            'The file format to convert to (%s) is already an alternative file'
-            ' format. Overwriting the shards!',
-            out_file_format.value,
-        )
-      elif from_dir == to_dir:
-        logging.info(
-            'The file format to convert to (%s) is already an alternative file'
-            ' format of the dataset in %s. Skipping conversion.',
-            os.fspath(from_dir),
-            out_file_format.value,
-        )
-        continue
-      else:
-        logging.warning(
-            'The file format to convert to (%s) is already an alternative file'
-            ' format, but the converted output is being written to a different'
-            ' folder, so the shards will be converted anyway.',
-            out_file_format.value,
-        )
-    found_dataset_versions[from_dir] = builder.info
+    info = _get_info_for_dirs_to_convert(
+        from_dir=from_dir,
+        to_dir=to_dir,
+        out_file_format=out_file_format,
+        overwrite=overwrite,
+    )
+    if info is not None:
+      found_dataset_versions[from_dir] = info
 
   convert_dataset_fn = functools.partial(
       _convert_dataset,
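
For readers who want to see the decision this PR factors out in isolation, below is a minimal, standalone sketch that mirrors the control flow of the new `_get_info_for_dirs_to_convert` helper. It is an illustration only, not library code: it does not depend on TFDS, the function name `decide_conversion` and its string return values are made up for this sketch, and `current_format` / `alternative_formats` stand in for `builder.info.file_format` / `builder.info.alternative_file_formats`.

from __future__ import annotations

import logging
import os


def decide_conversion(
    from_dir: str,
    to_dir: str,
    out_format: str,
    current_format: str,
    alternative_formats: set[str],
    overwrite: bool,
) -> str:
  """Mirrors the helper's branches; returns 'convert', 'overwrite' or 'skip'."""
  if out_format == current_format:
    # Corresponds to the ValueError raised by the real helper.
    raise ValueError(f'Dataset is already stored as {out_format}.')
  if out_format in alternative_formats:
    if overwrite:
      logging.warning('%s shards already exist; overwriting them.', out_format)
      return 'overwrite'
    elif os.fspath(from_dir) == os.fspath(to_dir):
      # The case this PR adds detailed logging for: converting in place when
      # the target format is already registered as an alternative format.
      logging.info('%s already available in %s; skipping conversion.',
                   out_format, from_dir)
      return 'skip'
    else:
      logging.warning('%s already exists, but output goes to %s; converting.',
                      out_format, to_dir)
  return 'convert'


# Example: an in-place conversion request where the format already exists.
print(decide_conversion(
    from_dir='/data/my_dataset/1.0.0',
    to_dir='/data/my_dataset/1.0.0',
    out_format='array_record',
    current_format='tfrecord',
    alternative_formats={'array_record'},
    overwrite=False,
))  # -> 'skip'

The string result stands in for the real helper returning either `builder.info` or `None`; as the diff above shows, `_convert_dataset_dirs` only records a dataset for conversion when the helper returns a non-None info.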