Skip to content

Commit

Permalink
Merge pull request #44 from cedadev/qc-flag-checks
Browse files Browse the repository at this point in the history
QC flag checks
  • Loading branch information
joshua-hampton authored Mar 1, 2024
2 parents f68c7fa + 2f5dc3b commit e618d83
Show file tree
Hide file tree
Showing 54 changed files with 573 additions and 524 deletions.
31 changes: 19 additions & 12 deletions checksit/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
def _get_bounds_var_ids(dct):
return [var_id for var_id in dct["variables"] if (
var_id.startswith("bounds_") or var_id.startswith("bnds_") or
var_id.endswith("_bounds") or var_id.endswith("_bnds"))]
var_id.endswith("_bounds") or var_id.endswith("_bnds"))]


def one_spelling_mistake(word):
Expand All @@ -29,7 +29,7 @@ def one_spelling_mistake(word):
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)

def two_spelling_mistakes(word):
def two_spelling_mistakes(word):
"""
All edits that are two edits away from `word`.
From https://norvig.com/spell-correct.html
Expand All @@ -56,14 +56,14 @@ def check_var_attrs(dct, defined_attrs, ignore_bounds=True, skip_spellcheck=Fals
bounds_vars = _get_bounds_var_ids(dct)

for var_id, var_dict in dct["variables"].items():
if var_id in bounds_vars: continue
if var_id in bounds_vars: continue

for attr in defined_attrs:
if is_undefined(var_dict.get(attr)):
errors.append(f"[variable**************:{var_id}]: Attribute '{attr}' must have a valid definition.")

return errors, warnings


def check_global_attrs(dct, defined_attrs=None, vocab_attrs=None, regex_attrs=None, rules_attrs=None, skip_spellcheck=False):
"""
Expand Down Expand Up @@ -112,7 +112,7 @@ def check_global_attrs(dct, defined_attrs=None, vocab_attrs=None, regex_attrs=No
errors.append(
f"[global-attributes:******:{attr}]: '{dct['global_attributes'].get(attr, UNDEFINED)}' "
f"does not match regex pattern '{regex_attrs[attr]}'."
)
)

for attr in rules_attrs:
if attr not in dct['global_attributes']:
Expand Down Expand Up @@ -182,10 +182,10 @@ def check_dim_exists(dct, dimensions, skip_spellcheck=False):
f"{search_close_match(dim, dct['dimensions'].keys()) if not skip_spellcheck else ''}"
)

return errors, warnings
return errors, warnings


def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
def check_var(dct, variable, defined_attrs, attr_rules=[], skip_spellcheck=False):
"""
Check variable exists and has attributes defined.
"""
Expand Down Expand Up @@ -220,7 +220,7 @@ def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
attr_value = attr_value.strip(',')
attr_value = [ int(i.strip('b')) for i in attr_value.split(',') ]
attr_value = np.array(attr_value, dtype=np.int8)
if not np.all(dct["variables"][variable].get(attr_key) == attr_value):
if not ((len(dct["variables"][variable].get(attr_key)) == len(attr_value)) and np.all(dct["variables"][variable].get(attr_key) == attr_value)):
errors.append(
f"[variable**************:{variable}]: Attribute '{attr_key}' must have definition '{attr_value}', "
f"not '{dct['variables'][variable].get(attr_key)}'."
Expand All @@ -233,6 +233,13 @@ def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
f"[variable**************:{variable}]: Attribute '{attr_key}' must have definition {attr_value}, "
f"not {dct['variables'][variable].get(attr_key).encode('unicode_escape').decode('utf-8')}."
)
for rule_to_check in attr_rules:
if rule_to_check == "rule-func:check-qc-flags":
rule_errors, rule_warnings = rules.check(rule_to_check, dct['variables'][variable].get("flag_values"), context=dct['variables'][variable].get("flag_meanings"), label=f"[variable******:{variable}]: ")
errors.extend(rule_errors)
warnings.extend(rule_warnings)


else:
if variable not in dct["variables"].keys():
errors.append(
Expand All @@ -242,7 +249,7 @@ def check_var(dct, variable, defined_attrs, skip_spellcheck=False):
else:
for attr in defined_attrs:
attr_key = attr.split(':')[0]
attr_value = ':'.join(attr.split(':')[1:])
attr_value = ':'.join(attr.split(':')[1:])
if attr_key not in dct["variables"][variable]:
errors.append(
f"[variable**************:{variable}]: Attribute '{attr_key}' does not exist. "
Expand Down Expand Up @@ -286,10 +293,10 @@ def check_file_name(file_name, vocab_checks=None, **kwargs):
else:
msg = "No platform vocab defined in specs"
raise KeyError(msg)

# check date format
# could be yyyy, yyyymm, yyyymmdd, yyyymmdd-HH, yyyymmdd-HHMM, yyyymmdd-HHMMSS
# first checks format, then date validity
# first checks format, then date validity
if not date_regex.match(file_name_parts[2]):
errors.append(f"[file name]: Invalid file name format - bad date format {file_name_parts[2]}")
else:
Expand All @@ -301,7 +308,7 @@ def check_file_name(file_name, vocab_checks=None, **kwargs):
valid_date_found = True
break
except ValueError:
pass
pass
if not valid_date_found:
errors.append(f"[file name]: Invalid file name format - invalid date in file name {file_name_parts[2]}")

Expand Down
38 changes: 22 additions & 16 deletions checksit/make_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@ def map_data_type(dtype):
'byte':'byte',
}
return data_map[dtype]



# main function

def make_amof_specs(version_number):
def make_amof_specs(version_number):
###############
# DIRECTORIES #
###############
cvs_dir = f"./checksit/vocabs/AMF_CVs/{version_number}"
out_dir = f"./specs/groups/ncas-amof-{version_number}"

cvs_dir = f"vocabs/AMF_CVs/{version_number}"
out_dir = f"../specs/groups/ncas-amof-{version_number}"


################
Expand Down Expand Up @@ -103,7 +103,7 @@ def make_amof_specs(version_number):
f.write(' rules_attrs:\n')
for attr, rule in attr_rules.items():
if rule.split(':')[0] in ['regex','regex-rule','type-rule','rule-func']:
f.write(f' {attr}: {rule}\n')
f.write(f' {attr}: {rule}\n')

####################
# DEPLOYMENT MODES #
Expand All @@ -125,7 +125,7 @@ def make_amof_specs(version_number):
if attr == 'type':
attr_value = map_data_type(attr_value)
deploy_vars[variable].append(f'{attr}:{attr_value}')



spec_file_name = f'{out_dir}/amof-common-{mode}.yml'
Expand All @@ -145,7 +145,7 @@ def make_amof_specs(version_number):
' params:\n dimensions:\n'))
for dim in deploy_dims:
f.write(f' - {dim}\n')

##############
## PRODUCTS ##
##############
Expand Down Expand Up @@ -183,11 +183,11 @@ def make_amof_specs(version_number):
else:
prod_dims_exist = False


if exists(f'{cvs_dir}/AMF_product_{product}_global-attributes.json'):
with open(f'{cvs_dir}/AMF_product_{product}_global-attributes.json') as f:
data = json.load(f)[f'product_{product}_global-attributes']

attr_rules = {}

for attr in data.keys():
Expand Down Expand Up @@ -240,21 +240,27 @@ def make_amof_specs(version_number):
else:
prod_attrs_exist = False



spec_file_name = f'{out_dir}/amof-{product}.yml'
with open(spec_file_name, 'w') as f:
if prod_vars_exist:
for i, var in enumerate(product_info.items()):
qc_flags = False
f.write(f'var-requires{i}:\n')
f.write((' func: checksit.generic.check_var\n'
' params:\n variable:\n'
f' - {var[0]}:__OPTIONAL__\n defined_attrs:\n'))
for attr in var[1]:
attr_key = attr.split(':')[0]
attr_value = ':'.join(attr.split(':')[1:])
f.write(f' - {attr_key}:{attr_value}\n')

if attr_key not in ["flag_values", "flag_meanings"]:
f.write(f' - {attr_key}:{attr_value}\n')
else:
qc_flags = True
if qc_flags:
f.write(f' attr_rules:\n - rule-func:check-qc-flags\n')

if prod_dims_exist:
f.write(('dims-requires:\n func: checksit.generic.check_dim_exists\n'
' params:\n dimensions:\n'))
Expand All @@ -270,10 +276,10 @@ def make_amof_specs(version_number):
f.write(' rules_attrs:\n')
for attr, rule in attr_rules.items():
if rule.split(':')[0] in ['regex','regex-rule','type-rule','rule-func']:
f.write(f' {attr}: {rule}\n')
f.write(f' {attr}: {rule}\n')


if __name__ == "__main__":
import sys
version_number = sys.argv[1]
make_amof_specs(version_number)
make_amof_specs(version_number)
58 changes: 46 additions & 12 deletions checksit/rules/rule_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime
import requests
from urllib.request import urlopen
import numpy as np

from . import processors
from ..config import get_config
Expand Down Expand Up @@ -86,7 +87,7 @@ def string_of_length(value, context, extras=None, label=""):
errors.append(f"{label} '{value}' must be exactly {min_length} characters")

return errors


def validate_image_date_time(value, context, extras=None, label=""):
"""
Expand All @@ -104,7 +105,7 @@ def validate_image_date_time(value, context, extras=None, label=""):

if not match:
errors.append(f"{label} '{value}' needs to be of the format YYYY:MM:DD hh:mm:ss or YYYY:MM:DD hh:mm:ss.s")

return errors


Expand All @@ -113,24 +114,24 @@ def validate_orcid_ID(value, context, extras=None, label=""):
A function to verify the format of an orcid ID
"""
orcid_string = "https://orcid.org/" # required format of start of the string

errors = []

PI_orcid_digits = value[-19:]
PI_orcid_digits_only = PI_orcid_digits.replace("-", "")

# Check that total the length is correct
if len(value) != 37:
if len(value) != 37:
errors.append(f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX")

# Check the start of the string (first 18 characters)
elif (value[0:18] != orcid_string or

# Check that the "-" are in the correct places
value[22] != "-" or
value[27] != "-" or
value[32] != "-" or

# Check that the last characters contain only "-" and digits (plus 'X' for last digit)
not (
PI_orcid_digits_only.isdigit() or (PI_orcid_digits_only[0:15].isdigit() and PI_orcid_digits_only[15] == "X")
Expand All @@ -157,7 +158,7 @@ def list_of_names(value, context, extras=None, label=""):
warnings.append(f"{label} '{value}' should be of the format <last name>, <first name> <middle initials(s)> or <last name>, <first name> <middle name(s)> where appropriate")
if not re.fullmatch(character_name_pattern, i):
warnings.append(f"{label} '{value}' - please use characters A-Z, a-z, À-ÿ where appropriate")

if type(value) == str:
if not re.fullmatch(name_pattern, value):
warnings.append(f"{label} '{value}' should be of the format <last name>, <first name> <middle initials(s)> or <last name>, <first name> <middle name(s)> where appropriate")
Expand Down Expand Up @@ -225,7 +226,7 @@ def relation_url_checker(value, context, extras=None, label=""):
A function to check if Relation field is in the correct format, and that the url exists
"""
errors = []

if " " not in value:
errors.append(f"{label} '{value}' should contain a space before the url")
else:
Expand All @@ -241,7 +242,7 @@ def latitude(value, context, extras=None, label=""):
A function to check if the latitude is within -90 and +90
"""
errors = []

latitude = re.findall(r'[0-9]+', value)
int_latitude = int(latitude[0])
dec_latitude = int(latitude[1])
Expand All @@ -257,7 +258,7 @@ def longitude(value, context, extras=None, label=""):
A function to check if the longitude is within -180 and +180
"""
errors = []

longitude = re.findall(r'[0-9]+', value)
int_longitude = int(longitude[0])
dec_longitude = int(longitude[1])
Expand All @@ -266,3 +267,36 @@ def longitude(value, context, extras=None, label=""):
errors.append(f"{label} '{value}' must be within -180 and +180 ")

return errors


def check_qc_flags(value, context, extras=None, label=""):
    """
    A function to check flag_values and flag_meanings
    value - flag_values (expected: array or tuple of byte values)
    context - flag_meanings (expected: space-separated string of names)
    extras - unused, kept for rule-function signature compatibility
    label - prefix for error messages
    Returns a list of error strings (empty when all checks pass).
    """
    errors = []

    # flag_meanings is a blank-separated string; split() with no argument
    # also tolerates repeated whitespace between meanings
    meanings = context.split()

    # check flag_values are correctly formatted (should be array of bytes)
    if not isinstance(value, (np.ndarray, tuple)):
        errors.append(f"{label} QC flag_values must be an array or tuple of byte values, not {type(value)}.")

    # check there are at least two values and they start with 0 and 1
    # (original used `not len(value) > 2`, which wrongly rejected exactly
    # two values, contradicting both the comment and the error message)
    if len(value) < 2:
        errors.append(f"{label} There must be at least two QC flag values.")
    elif list(value[:2]) != [0, 1]:
        # list(...) gives one comparison that works for both ndarray and tuple
        errors.append(f"{label} First two QC flag_values must be '[0, 1]'.")

    # check there are at least two meanings and the first two are correct
    if len(meanings) < 2:
        errors.append(f"{label} There must be at least two QC flag meanings (space separated).")
    elif meanings[:2] != ["not_used", "good_data"]:
        errors.append(f"{label} First two QC flag_meanings must be 'not_used' and 'good_data'.")

    # check number of values is same as number of meanings
    if len(value) != len(meanings):
        errors.append(f"{label} Number of flag_values must equal number of flag_meanings.")

    return errors
Loading

0 comments on commit e618d83

Please sign in to comment.