Skip to content

Commit

Permalink
Merge pull request #44 from cedadev/qc-flag-checks
Browse files Browse the repository at this point in the history
QC flag checks
  • Loading branch information
joshua-hampton authored Mar 1, 2024
2 parents f68c7fa + 2f5dc3b commit e618d83
Show file tree
Hide file tree
Showing 54 changed files with 573 additions and 524 deletions.
31 changes: 19 additions & 12 deletions checksit/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
def _get_bounds_var_ids(dct):
return [var_id for var_id in dct["variables"] if (
var_id.startswith("bounds_") or var_id.startswith("bnds_") or
var_id.endswith("_bounds") or var_id.endswith("_bnds"))]
var_id.endswith("_bounds") or var_id.endswith("_bnds"))]


def one_spelling_mistake(word):
Expand All @@ -29,7 +29,7 @@ def one_spelling_mistake(word):
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)

def two_spelling_mistakes(word):
def two_spelling_mistakes(word):
"""
All edits that are two edits away from `word`.
From https://norvig.com/spell-correct.html
Expand All @@ -56,14 +56,14 @@ def check_var_attrs(dct, defined_attrs, ignore_bounds=True, skip_spellcheck=Fals
bounds_vars = _get_bounds_var_ids(dct)

for var_id, var_dict in dct["variables"].items():
if var_id in bounds_vars: continue
if var_id in bounds_vars: continue

for attr in defined_attrs:
if is_undefined(var_dict.get(attr)):
errors.append(f"[variable**************:{var_id}]: Attribute '{attr}' must have a valid definition.")

return errors, warnings


def check_global_attrs(dct, defined_attrs=None, vocab_attrs=None, regex_attrs=None, rules_attrs=None, skip_spellcheck=False):
"""
Expand Down Expand Up @@ -112,7 +112,7 @@ def check_global_attrs(dct, defined_attrs=None, vocab_attrs=None, regex_attrs=No
errors.append(
f"[global-attributes:******:{attr}]: '{dct['global_attributes'].get(attr, UNDEFINED)}' "
f"does not match regex pattern '{regex_attrs[attr]}'."
)
)

for attr in rules_attrs:
if attr not in dct['global_attributes']:
Expand Down Expand Up @@ -182,10 +182,10 @@ def check_dim_exists(dct, dimensions, skip_spellcheck=False):
f"{search_close_match(dim, dct['dimensions'].keys()) if not skip_spellcheck else ''}"
)

return errors, warnings
return errors, warnings


def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
def check_var(dct, variable, defined_attrs, attr_rules=[], skip_spellcheck=False):
"""
Check variable exists and has attributes defined.
"""
Expand Down Expand Up @@ -220,7 +220,7 @@ def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
attr_value = attr_value.strip(',')
attr_value = [ int(i.strip('b')) for i in attr_value.split(',') ]
attr_value = np.array(attr_value, dtype=np.int8)
if not np.all(dct["variables"][variable].get(attr_key) == attr_value):
if not ((len(dct["variables"][variable].get(attr_key)) == len(attr_value)) and np.all(dct["variables"][variable].get(attr_key) == attr_value)):
errors.append(
f"[variable**************:{variable}]: Attribute '{attr_key}' must have definition '{attr_value}', "
f"not '{dct['variables'][variable].get(attr_key)}'."
Expand All @@ -233,6 +233,13 @@ def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
f"[variable**************:{variable}]: Attribute '{attr_key}' must have definition {attr_value}, "
f"not {dct['variables'][variable].get(attr_key).encode('unicode_escape').decode('utf-8')}."
)
for rule_to_check in attr_rules:
if rule_to_check == "rule-func:check-qc-flags":
rule_errors, rule_warnings = rules.check(rule_to_check, dct['variables'][variable].get("flag_values"), context=dct['variables'][variable].get("flag_meanings"), label=f"[variable******:{variable}]: ")
errors.extend(rule_errors)
warnings.extend(rule_warnings)


else:
if variable not in dct["variables"].keys():
errors.append(
Expand All @@ -242,7 +249,7 @@ def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
else:
for attr in defined_attrs:
attr_key = attr.split(':')[0]
attr_value = ':'.join(attr.split(':')[1:])
attr_value = ':'.join(attr.split(':')[1:])
if attr_key not in dct["variables"][variable]:
errors.append(
f"[variable**************:{variable}]: Attribute '{attr_key}' does not exist. "
Expand Down Expand Up @@ -286,10 +293,10 @@ def check_file_name(file_name, vocab_checks=None, **kwargs):
else:
msg = "No platform vocab defined in specs"
raise KeyError(msg)

# check date format
# could be yyyy, yyyymm, yyyymmdd, yyyymmdd-HH, yyyymmdd-HHMM, yyyymmdd-HHMMSS
# first checks format, then date validity
# first checks format, then date validity
if not date_regex.match(file_name_parts[2]):
errors.append(f"[file name]: Invalid file name format - bad date format {file_name_parts[2]}")
else:
Expand All @@ -301,7 +308,7 @@ def check_file_name(file_name, vocab_checks=None, **kwargs):
valid_date_found = True
break
except ValueError:
pass
pass
if not valid_date_found:
errors.append(f"[file name]: Invalid file name format - invalid date in file name {file_name_parts[2]}")

Expand Down
38 changes: 22 additions & 16 deletions checksit/make_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@ def map_data_type(dtype):
'byte':'byte',
}
return data_map[dtype]



# main function

def make_amof_specs(version_number):
def make_amof_specs(version_number):
###############
# DIRECTORIES #
###############
cvs_dir = f"./checksit/vocabs/AMF_CVs/{version_number}"
out_dir = f"./specs/groups/ncas-amof-{version_number}"

cvs_dir = f"vocabs/AMF_CVs/{version_number}"
out_dir = f"../specs/groups/ncas-amof-{version_number}"


################
Expand Down Expand Up @@ -103,7 +103,7 @@ def make_amof_specs(version_number):
f.write(' rules_attrs:\n')
for attr, rule in attr_rules.items():
if rule.split(':')[0] in ['regex','regex-rule','type-rule','rule-func']:
f.write(f' {attr}: {rule}\n')
f.write(f' {attr}: {rule}\n')

####################
# DEPLOYMENT MODES #
Expand All @@ -125,7 +125,7 @@ def make_amof_specs(version_number):
if attr == 'type':
attr_value = map_data_type(attr_value)
deploy_vars[variable].append(f'{attr}:{attr_value}')



spec_file_name = f'{out_dir}/amof-common-{mode}.yml'
Expand All @@ -145,7 +145,7 @@ def make_amof_specs(version_number):
' params:\n dimensions:\n'))
for dim in deploy_dims:
f.write(f' - {dim}\n')

##############
## PRODUCTS ##
##############
Expand Down Expand Up @@ -183,11 +183,11 @@ def make_amof_specs(version_number):
else:
prod_dims_exist = False


if exists(f'{cvs_dir}/AMF_product_{product}_global-attributes.json'):
with open(f'{cvs_dir}/AMF_product_{product}_global-attributes.json') as f:
data = json.load(f)[f'product_{product}_global-attributes']

attr_rules = {}

for attr in data.keys():
Expand Down Expand Up @@ -240,21 +240,27 @@ def make_amof_specs(version_number):
else:
prod_attrs_exist = False



spec_file_name = f'{out_dir}/amof-{product}.yml'
with open(spec_file_name, 'w') as f:
if prod_vars_exist:
for i, var in enumerate(product_info.items()):
qc_flags = False
f.write(f'var-requires{i}:\n')
f.write((' func: checksit.generic.check_var\n'
' params:\n variable:\n'
f' - {var[0]}:__OPTIONAL__\n defined_attrs:\n'))
for attr in var[1]:
attr_key = attr.split(':')[0]
attr_value = ':'.join(attr.split(':')[1:])
f.write(f' - {attr_key}:{attr_value}\n')

if attr_key not in ["flag_values", "flag_meanings"]:
f.write(f' - {attr_key}:{attr_value}\n')
else:
qc_flags = True
if qc_flags:
f.write(f' attr_rules:\n - rule-func:check-qc-flags\n')

if prod_dims_exist:
f.write(('dims-requires:\n func: checksit.generic.check_dim_exists\n'
' params:\n dimensions:\n'))
Expand All @@ -270,10 +276,10 @@ def make_amof_specs(version_number):
f.write(' rules_attrs:\n')
for attr, rule in attr_rules.items():
if rule.split(':')[0] in ['regex','regex-rule','type-rule','rule-func']:
f.write(f' {attr}: {rule}\n')
f.write(f' {attr}: {rule}\n')


if __name__ == "__main__":
import sys
version_number = sys.argv[1]
make_amof_specs(version_number)
make_amof_specs(version_number)
58 changes: 46 additions & 12 deletions checksit/rules/rule_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime
import requests
from urllib.request import urlopen
import numpy as np

from . import processors
from ..config import get_config
Expand Down Expand Up @@ -86,7 +87,7 @@ def string_of_length(value, context, extras=None, label=""):
errors.append(f"{label} '{value}' must be exactly {min_length} characters")

return errors


def validate_image_date_time(value, context, extras=None, label=""):
"""
Expand All @@ -104,7 +105,7 @@ def validate_image_date_time(value, context, extras=None, label=""):

if not match:
errors.append(f"{label} '{value}' needs to be of the format YYYY:MM:DD hh:mm:ss or YYYY:MM:DD hh:mm:ss.s")

return errors


Expand All @@ -113,24 +114,24 @@ def validate_orcid_ID(value, context, extras=None, label=""):
A function to verify the format of an orcid ID
"""
orcid_string = "https://orcid.org/" # required format of start of the string

errors = []

PI_orcid_digits = value[-19:]
PI_orcid_digits_only = PI_orcid_digits.replace("-", "")

# Check that total the length is correct
if len(value) != 37:
if len(value) != 37:
errors.append(f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX")

# Check the start of the string (first 18 characters)
elif (value[0:18] != orcid_string or

# Check that the "-" are in the correct places
value[22] != "-" or
value[27] != "-" or
value[32] != "-" or

# Check that the last characters contain only "-" and digits (plus 'X' for last digit)
not (
PI_orcid_digits_only.isdigit() or (PI_orcid_digits_only[0:15].isdigit() and PI_orcid_digits_only[15] == "X")
Expand All @@ -157,7 +158,7 @@ def list_of_names(value, context, extras=None, label=""):
warnings.append(f"{label} '{value}' should be of the format <last name>, <first name> <middle initials(s)> or <last name>, <first name> <middle name(s)> where appropriate")
if not re.fullmatch(character_name_pattern, i):
warnings.append(f"{label} '{value}' - please use characters A-Z, a-z, À-ÿ where appropriate")

if type(value) == str:
if not re.fullmatch(name_pattern, value):
warnings.append(f"{label} '{value}' should be of the format <last name>, <first name> <middle initials(s)> or <last name>, <first name> <middle name(s)> where appropriate")
Expand Down Expand Up @@ -225,7 +226,7 @@ def relation_url_checker(value, context, extras=None, label=""):
A function to check if Relation field is in the correct format, and that the url exists
"""
errors = []

if " " not in value:
errors.append(f"{label} '{value}' should contain a space before the url")
else:
Expand All @@ -241,7 +242,7 @@ def latitude(value, context, extras=None, label=""):
A function to check if the latitude is within -90 and +90
"""
errors = []

latitude = re.findall(r'[0-9]+', value)
int_latitude = int(latitude[0])
dec_latitude = int(latitude[1])
Expand All @@ -257,7 +258,7 @@ def longitude(value, context, extras=None, label=""):
A function to check if the longitude is within -180 and +180
"""
errors = []

longitude = re.findall(r'[0-9]+', value)
int_longitude = int(longitude[0])
dec_longitude = int(longitude[1])
Expand All @@ -266,3 +267,36 @@ def longitude(value, context, extras=None, label=""):
errors.append(f"{label} '{value}' must be within -180 and +180 ")

return errors


def check_qc_flags(value, context, extras=None, label=""):
    """
    A function to check flag_values and flag_meanings
    value - flag_values (expected: array or tuple of byte values)
    context - flag_meanings (expected: space-separated string of names)
    extras - unused, kept for rule-function signature compatibility
    label - prefix for error messages
    Returns a list of error strings (empty when all checks pass).
    """
    errors = []

    # flag_meanings is a blank-separated string; split() with no argument
    # also tolerates repeated whitespace between meanings
    meanings = context.split()

    # check flag_values are correctly formatted (should be array of bytes)
    if not isinstance(value, (np.ndarray, tuple)):
        errors.append(f"{label} QC flag_values must be an array or tuple of byte values, not {type(value)}.")

    # check there are at least two values and they start with 0 and 1
    # (original used `not len(value) > 2`, which wrongly rejected exactly
    # two values, contradicting both the comment and the error message)
    if len(value) < 2:
        errors.append(f"{label} There must be at least two QC flag values.")
    elif list(value[:2]) != [0, 1]:
        # list(...) gives one comparison that works for both ndarray and tuple
        errors.append(f"{label} First two QC flag_values must be '[0, 1]'.")

    # check there are at least two meanings and the first two are correct
    if len(meanings) < 2:
        errors.append(f"{label} There must be at least two QC flag meanings (space separated).")
    elif meanings[:2] != ["not_used", "good_data"]:
        errors.append(f"{label} First two QC flag_meanings must be 'not_used' and 'good_data'.")

    # check number of values is same as number of meanings
    if len(value) != len(meanings):
        errors.append(f"{label} Number of flag_values must equal number of flag_meanings.")

    return errors
Loading

0 comments on commit e618d83

Please sign in to comment.