Skip to content

Commit

Permalink
Cleaned and extended function that extracts datetimes from paths (#2181)
Browse files Browse the repository at this point in the history
Co-authored-by: Valeriu Predoi <[email protected]>
  • Loading branch information
schlunma and valeriupredoi authored Oct 5, 2023
1 parent 13a444e commit 9b323aa
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 104 deletions.
2 changes: 1 addition & 1 deletion esmvalcore/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,7 @@ def _update_timerange(self):
dataset.facets.pop('timerange')
dataset.supplementaries = []
check.data_availability(dataset)
intervals = [_get_start_end_date(f.name) for f in dataset.files]
intervals = [_get_start_end_date(f) for f in dataset.files]

min_date = min(interval[0] for interval in intervals)
max_date = max(interval[1] for interval in intervals)
Expand Down
2 changes: 1 addition & 1 deletion esmvalcore/esgf/_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def select_by_time(files, timerange):
for file in files:
start_date, end_date = _parse_period(timerange)
try:
start, end = _get_start_end_date(file.name)
start, end = _get_start_end_date(file)
except ValueError:
# If start and end year cannot be read from the filename
# just select everything.
Expand Down
162 changes: 74 additions & 88 deletions esmvalcore/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import re
from glob import glob
from pathlib import Path
from typing import Any, Union
from typing import TYPE_CHECKING, Any, Union

import iris
import isodate
Expand All @@ -17,17 +17,19 @@
from .exceptions import RecipeError
from .typing import Facets, FacetValue

if TYPE_CHECKING:
from .esgf import ESGFFile

logger = logging.getLogger(__name__)


def _get_from_pattern(pattern, date_range_pattern, stem, group):
"""Get time, date or datetime from date range patterns in file names."""
#
# Next string allows to test that there is an allowed delimiter (or
# string start or end) close to date range (or to single date)
start_point = end_point = None
context = r"(?:^|[-_]|$)"
#

# First check for a block of two potential dates
date_range_pattern_with_context = context + date_range_pattern + context
daterange = re.search(date_range_pattern_with_context, stem)
Expand All @@ -37,6 +39,7 @@ def _get_from_pattern(pattern, date_range_pattern, stem, group):
date_range_pattern_with_context = (context + date_range_pattern +
context)
daterange = re.search(date_range_pattern_with_context, stem)

if daterange:
start_point = daterange.group(group)
end_group = '_'.join([group, 'end'])
Expand All @@ -59,41 +62,72 @@ def _get_from_pattern(pattern, date_range_pattern, stem, group):
return start_point, end_point


def _get_start_end_date(filename):
def _get_start_end_date(
file: str | Path | LocalFile | ESGFFile
) -> tuple[str, str]:
"""Get the start and end dates as a string from a file name.
Examples of allowed dates : 1980, 198001, 19801231,
1980123123, 19801231T23, 19801231T2359, 19801231T235959,
19801231T235959Z (ISO 8601).
Examples of allowed dates: 1980, 198001, 1980-01, 19801231, 1980-12-31,
1980123123, 19801231T23, 19801231T2359, 19801231T235959, 19801231T235959Z
(ISO 8601).
Dates must be surrounded by '-', '_' or '.' (the latter is used by CMIP3
data), or string start or string end (after removing filename suffix).
Look first for two dates separated by '-', '_' or '_cat_' (the latter is
used by CMIP3 data), then for one single date, and if there are multiple,
for one date at start or end.
Parameters
----------
file:
The file to read the start and end date from.
Returns
-------
tuple[str, str]
The start and end date.
Dates must be surrounded by - or _ or string start or string end
(after removing filename suffix).
Raises
------
ValueError
Start or end date cannot be determined.
Look first for two dates separated by - or _, then for one single
date, and if they are multiple, for one date at start or end.
"""
stem = Path(filename).stem
if hasattr(file, 'name'): # Path, LocalFile, ESGFFile
stem = Path(file.name).stem
else: # str
stem = Path(file).stem

start_date = end_date = None
#

# Build regex
time_pattern = (r"(?P<hour>[0-2][0-9]"
r"(?P<minute>[0-5][0-9]"
r"(?P<second>[0-5][0-9])?)?Z?)")
date_pattern = (r"(?P<year>[0-9]{4})"
r"(?P<month>[01][0-9]"
r"(?P<day>[0-3][0-9]"
r"(?P<month>-?[01][0-9]"
r"(?P<day>-?[0-3][0-9]"
rf"(T?{time_pattern})?)?)?")
datetime_pattern = (rf"(?P<datetime>{date_pattern})")
#
end_datetime_pattern = datetime_pattern.replace(">", "_end>")
date_range_pattern = datetime_pattern + r"[-_]" + end_datetime_pattern

# Dates can either be delimited by '-', '_', or '_cat_' (the latter for
# CMIP3)
date_range_pattern = (
datetime_pattern + r"[-_](?:cat_)?" + end_datetime_pattern
)

# Find dates using the regex
start_date, end_date = _get_from_pattern(datetime_pattern,
date_range_pattern, stem,
'datetime')

# As final resort, try to get the dates from the file contents
if (start_date is None or end_date is None) and Path(filename).exists():
logger.debug("Must load file %s for daterange ", filename)
cubes = iris.load(filename)
if ((start_date is None or end_date is None) and
isinstance(file, (str, Path)) and Path(file).exists()):
logger.debug("Must load file %s for daterange ", file)
cubes = iris.load(file)

for cube in cubes:
logger.debug(cube)
Expand All @@ -109,12 +143,30 @@ def _get_start_end_date(filename):
break

if start_date is None or end_date is None:
raise ValueError(f'File {filename} dates do not match a recognized '
'pattern and time can not be read from the file')
raise ValueError(
f"File {file} datetimes do not match a recognized pattern and "
f"time coordinate can not be read from the file"
)

# Remove potential '-' characters from datetimes
start_date = start_date.replace('-', '')
end_date = end_date.replace('-', '')

return start_date, end_date


def _get_start_end_year(
    file: str | Path | LocalFile | ESGFFile
) -> tuple[int, int]:
    """Return the start and end year of a file as integers.

    The years are taken from the first four characters of the date strings
    returned by :func:`_get_start_end_date`; see that function for details.
    """
    first_date, last_date = _get_start_end_date(file)
    first_year = int(first_date[:4])
    last_year = int(last_date[:4])
    return first_year, last_year


def _dates_to_timerange(start_date, end_date):
"""Convert ``start_date`` and ``end_date`` to ``timerange``.
Expand Down Expand Up @@ -162,72 +214,6 @@ def _replace_years_with_timerange(variable):
variable.pop('end_year', None)


def _get_start_end_year(file):
    """Extract the first and last year covered by a file.

    The years are read from the stem of ``file.name``. Accepted date
    formats are, for example: 1980, 198001, 19801231, 1980123123,
    19801231T23, 19801231T2359, 19801231T235959, 19801231T235959Z
    (ISO 8601).

    A date has to be delimited by - or _ or by the start or end of the
    string (the filename suffix is removed first).

    A range of two dates joined by - or _ is searched for first, then a
    single date, and if there are several, a date at the start or end.

    If the name yields no years and ``file`` is an existing
    :class:`~pathlib.Path`, the file is loaded and the years are taken from
    the first cube that has a ``time`` coordinate.

    Parameters
    ----------
    file: LocalFile or esmvalcore.esgf.ESGFFile
        The file to read the start and end year from.

    Returns
    -------
    tuple[int, int]
        The start and end year.

    Raises
    ------
    ValueError
        When start or end year cannot be determined.
    """
    # Regular expression pieces: an optional time of day nested inside an
    # optional day, nested inside an optional month, after a 4-digit year.
    time_pattern = (r"(?P<hour>[0-2][0-9]"
                    r"(?P<minute>[0-5][0-9]"
                    r"(?P<second>[0-5][0-9])?)?Z?)")
    date_pattern = (r"(?P<year>[0-9]{4})"
                    r"(?P<month>[01][0-9]"
                    r"(?P<day>[0-3][0-9]"
                    rf"(T?{time_pattern})?)?)?")

    # The end date reuses the same pattern with '_end' appended to every
    # group name so both dates can appear in a single expression.
    end_date_pattern = date_pattern.replace(">", "_end>")
    date_range_pattern = date_pattern + r"[-_]" + end_date_pattern

    stem = Path(file.name).stem
    first_year, last_year = _get_from_pattern(
        date_pattern, date_range_pattern, stem, 'year')

    # As final resort, try to get the dates from the file contents
    if first_year is None or last_year is None:
        if isinstance(file, Path) and file.exists():
            logger.debug("Must load file %s for daterange ", file)
            for cube in iris.load(file):
                logger.debug(cube)
                try:
                    time_coord = cube.coord('time')
                except iris.exceptions.CoordinateNotFoundError:
                    # No time axis on this cube; try the next one.
                    continue
                first_year = time_coord.cell(0).point.year
                last_year = time_coord.cell(-1).point.year
                break

    if first_year is None or last_year is None:
        raise ValueError(f'File {file} dates do not match a recognized '
                         'pattern and time can not be read from the file')

    return int(first_year), int(last_year)


def _parse_period(timerange):
"""Parse `timerange` values given as duration periods.
Expand Down
Loading

0 comments on commit 9b323aa

Please sign in to comment.