From 1e888238f8e9aab98657f184af9ab60538c89ad5 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Tue, 30 Jan 2024 17:06:58 -0500 Subject: [PATCH 01/11] Fix rule and check. The response[0] in core was setting what was supposed to be a list to a single string from the list instead. The check for whether a ontology source was in the list was nested beneath another check instead of being its own check. --- isatools/isatab/validate/rules/core.py | 2 +- isatools/isatab/validate/rules/rules_30xx.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/isatools/isatab/validate/rules/core.py b/isatools/isatab/validate/rules/core.py index 40774499..c4f603ab 100644 --- a/isatools/isatab/validate/rules/core.py +++ b/isatools/isatab/validate/rules/core.py @@ -54,7 +54,7 @@ def execute(self, validator_params: dict) -> None: try: response = self.rule(*params) if self.identifier == '3008': - validator_params['term_source_refs'] = response[0] + validator_params['term_source_refs'] = response if self.identifier == '4001': validator_params['configs'] = response self.executed = True diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py index 79819624..ed255854 100644 --- a/isatools/isatab/validate/rules/rules_30xx.py +++ b/isatools/isatab/validate/rules/rules_30xx.py @@ -152,6 +152,7 @@ def check_single_field(cell_value, source, acc, cfield, filename): :param filename: Filename of the table :return: True if OK, False if not OK """ + return_value = True if ((cell_has_value(cell_value) and not cell_has_value(source) and cell_has_value(acc)) or not cell_has_value(cell_value)): msg = "Missing Term Source REF in annotation or missing Term Source Name" @@ -159,13 +160,14 @@ def check_single_field(cell_value, source, acc, cfield, filename): "label/accession/source are provided.").format(cfield.header, filename) validator.add_warning(message=msg, supplemental=spl, code=3008) log.warning("(W) {}".format(spl)) - if source not in tsrs: - spl = ("Term Source REF, for the field '{}' in the file '{}' does not refer to a declared " - "Ontology Source.").format(cfield.header, filename) - validator.add_warning(message="Term Source REF reference broken", supplemental=spl, code=3011) - log.warning("(W) {}".format(spl)) - return False - return True + return_value = False + if cell_has_value(source) and source not in tsrs: + spl = ("Term Source REF, for the field '{}' in the file '{}' does not refer to a declared " + "Ontology Source.").format(cfield.header, filename) + validator.add_warning(message="Term Source REF reference broken", supplemental=spl, code=3011) + log.warning("(W) {}".format(spl)) + return_value = False + return return_value result = True nfields = len(table.columns) From 691c35fcdcbd94a7954b5c736183e2cf518c81e5 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Wed, 6 Mar 2024 10:12:41 -0500 Subject: [PATCH 02/11] Testing changes to run on Windows. --- tests/convert/test_isatab2w4m.py | 17 ++++++++++---- tests/isatab/test_isatab.py | 26 ++++++++++++--------- tests/validators/test_validate_test_data.py | 19 +++++++-------- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/tests/convert/test_isatab2w4m.py b/tests/convert/test_isatab2w4m.py index 30e05bcb..5096a98a 100644 --- a/tests/convert/test_isatab2w4m.py +++ b/tests/convert/test_isatab2w4m.py @@ -1,6 +1,5 @@ # Test conversion to W4M format -import filecmp import os import shutil import tempfile @@ -9,6 +8,16 @@ from isatools.tests import utils +def universal_filecmp(f1, f2): + with open(f1, 'r') as fp1, open(f2, 'r') as fp2: + while True: + b1 = fp1.readline() + b2 = fp2.readline() + if b1 != b2: + return False + if not b1: + return True + # Test presence of data folder def setUpModule(): if not os.path.exists(utils.DATA_DIR): @@ -46,7 +55,7 @@ def plain_test(self, study, test_dir): output_file = os.path.join(self._tmp_dir, '.'.join( ['-'.join([study, 'w4m', x]), 'tsv'])) self.assertTrue(os.path.exists(output_file)) - self.assertTrue(filecmp.cmp(output_file, ref_file, shallow=False), + self.assertTrue(universal_filecmp(output_file, ref_file), 'Output file "{0}" differs from reference file "{1}".'.format(output_file, ref_file)) # Test MTBLS30 @@ -89,7 +98,7 @@ def na_filtering_test(self, study, test_dir, samp_na_filtering=None, 'sample-metadata', 'variable-metadata', 'sample-variable-matrix']: self.assertTrue(os.path.exists(output_files[x])) self.assertTrue( - filecmp.cmp(output_files[x], ref_files[x]), + universal_filecmp(output_files[x], ref_files[x]), 'Output file "{0}" differs from reference file "{1}".'.format( output_files[x], ref_files[x])) @@ -140,5 +149,5 @@ def test_assay_selection(self): ['-'.join([study, 'w4m', x, assay]), 'tsv'])) self.assertTrue(os.path.exists(output_file)) self.assertTrue( - filecmp.cmp(output_file, ref_file), + universal_filecmp(output_file, ref_file), 'Output file "{0}" differs from reference file "{1}".'.format(output_file, ref_file)) diff --git a/tests/isatab/test_isatab.py b/tests/isatab/test_isatab.py index 1e249e50..25eee0d4 100644 --- a/tests/isatab/test_isatab.py +++ b/tests/isatab/test_isatab.py @@ -28,6 +28,9 @@ def setUpModule(): "git clone -b tests --single-branch git@github.com:ISA-tools/ISAdatasets {0}" .format(utils.DATA_DIR)) +def replace_windows_newlines(input_string): + return input_string.replace('\r\r\n', '\n').replace('\r\n', '\n').replace('\r', '\n') + class TestIsaMerge(unittest.TestCase): @@ -1069,7 +1072,7 @@ def test_source_protocol_ref_sample(self): i.studies = [s] expected = """Source Name\tProtocol REF\tSample Name source1\tsample collection\tsample1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_x2(self): i = Investigation() @@ -1167,7 +1170,7 @@ def test_source_protocol_ref_sample_with_characteristics(self): i.studies = [s] expected = """Source Name\tCharacteristics[reference descriptor]\tProtocol REF\tSample Name\tCharacteristics[organism part] source1\tnot applicable\tsample collection\tsample1\tliver""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_with_parameter_values(self): i = Investigation() @@ -1188,7 +1191,7 @@ def test_source_protocol_ref_sample_with_parameter_values(self): i.studies = [s] expected = """Source Name\tProtocol REF\tParameter Value[temperature]\tSample Name source1\tsample collection\t10\tsample1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_with_factor_values(self): i = Investigation() @@ -1216,11 +1219,11 @@ def test_source_protocol_ref_sample_with_factor_values(self): s.assays = [a] expected_study_table = """Source Name\tProtocol REF\tSample Name\tFactor Value[study group] source1\tsample collection\tsample1\tStudy group 1""" - self.assertIn(expected_study_table, isatab.dumps(i)) + self.assertIn(expected_study_table, replace_windows_newlines(isatab.dumps(i))) expected_assay_table = """Sample Name\tFactor Value[study group]\tProtocol REF sample1\tStudy group 1\textraction""" self.assertIn(expected_assay_table, - isatab.dumps(i, write_fvs_in_assay_table=True)) + replace_windows_newlines(isatab.dumps(i, write_fvs_in_assay_table=True))) def test_source_protocol_ref_protocol_ref_sample(self): i = Investigation() @@ -1239,7 +1242,7 @@ def test_source_protocol_ref_protocol_ref_sample(self): i.studies = [s] expected = """Source Name\tProtocol REF\tProtocol REF\tSample Name source1\tsample collection\taliquoting\taliquot1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_protocol_ref_sample(self): i = Investigation() @@ -1261,7 +1264,7 @@ def test_source_protocol_ref_sample_protocol_ref_sample(self): i.studies = [s] expected = """Source Name\tProtocol REF\tSample Name\tProtocol REF\tSample Name source1\tsample collection\tsample1\taliquoting\taliquot1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data2(self): i = Investigation() @@ -1295,7 +1298,7 @@ def test_sample_protocol_ref_material_protocol_ref_data2(self): i.studies = [s] expected = (f"""Sample Name\tProtocol REF\tExtract Name\tProtocol REF\tAssay Name\tRaw Data File\tComment[checksum type]\tComment[checksum]\n""" + f"""sample1\textraction\textract1\tnucleic acid sequencing\tassay-1\tdatafile.raw\t{cs_comment1.value}\t{cs_comment2.value}""") - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data3(self): i = Investigation() @@ -1334,7 +1337,7 @@ def test_sample_protocol_ref_material_protocol_ref_data3(self): # self.assertIn(expected_line1, dump_out) # self.assertIn(expected_line2, dump_out) - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data4(self): i = Investigation() @@ -1373,7 +1376,7 @@ def test_sample_protocol_ref_material_protocol_ref_data4(self): # self.assertIn(expected_line1, dump_out) # self.assertIn(expected_line2, dump_out) - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data_x2(self): i = Investigation() @@ -1710,7 +1713,7 @@ def test_isatab_preprocess_issue235(self): test_isatab_str = b""""Sample Name" "Protocol REF" "Parameter Value[medium]" "Term Source REF" "Term Accession Number" "Parameter Value[serum]" "Term Source REF" "Term Accession Number" "Parameter Value[serum concentration]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[medium volume]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[migration modulator]" "Term Source REF" "Term Accession Number" "Parameter Value[modulator concentration]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[modulator distribution]" "Term Source REF" "Term Accession Number" "Protocol REF" "Parameter Value[imaging technique]" "Term Source REF" "Term Accession Number" "Parameter Value[imaging technique temporal feature]" "Term Source REF" "Term Accession Number" "Parameter Value[acquisition duration]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[time interval]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[objective type]" "Term Source REF" "Term Accession Number" "Parameter Value[objective magnification]" "Term Source REF" "Term Accession Number" "Parameter Value[objective numerical aperture]" "Term Source REF" "Term Accession Number" "Parameter Value[acquisition channel count]" "Term Source REF" "Term Accession Number" "Parameter Value[reporter]" "Term Source REF" "Term Accession Number" "Parameter Value[voxel size]" "Unit" "Term Source REF" "Term Accession Number" "Assay Name" "Raw Data File" "Protocol REF" "Parameter Value[software]" "Term Source REF" "Term Accession Number" "Data Transformation Name" "Derived Data File" "culture1" "migration assay" "RPMI-1640" "" "" "Heat Inactivated Fetal Bovine Serum " "" "" "10" "%" "UO" "http://purl.obolibrary.org/obo/UO_0000165" "300" "microliter" "UO" "http://purl.obolibrary.org/obo/UO_0000101" "" "" "" "" "" "" "" "gradient" "" "" "imaging" "phase-contrast microscopy" "" "" "dynamic" "" "" "6" "hour" "UO" "http://purl.obolibrary.org/obo/UO_0000032" "15" "minute" "UO" "http://purl.obolibrary.org/obo/UO_0000031" "" "" "" "20" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "culture1" "" "data transformation" "CELLMIA" "" "" "" "" """ - with tempfile.NamedTemporaryFile() as tmp: + with tempfile.NamedTemporaryFile(delete=False) as tmp: tmp.write(test_isatab_str) tmp.seek(0) study_assay_parser = isatab_parser.StudyAssayParser('mock.txt') @@ -1719,6 +1722,7 @@ def test_isatab_preprocess_issue235(self): if """Protocol REF\tData Transformation Name""" in header: self.fail('Incorrectly inserted Protocol REF before ' 'Data Transformation Name') + os.remove(tmp.name) def test_isatab_factor_value_parsing_issue270(self): with open(os.path.join(self._tab_data_dir, 'issue270', 'i_matteo.txt'), diff --git a/tests/validators/test_validate_test_data.py b/tests/validators/test_validate_test_data.py index 33fb9840..4d8c1a4e 100644 --- a/tests/validators/test_validate_test_data.py +++ b/tests/validators/test_validate_test_data.py @@ -2,6 +2,7 @@ import logging import os import unittest +import pathlib from jsonschema import Draft4Validator from jsonschema import RefResolver @@ -304,8 +305,8 @@ class TestIsaJsonCreateTestData(unittest.TestCase): def setUp(self): self._reporting_level = logging.ERROR - self.v2_create_schemas_path = os.path.join( - os.path.dirname(__file__), '../..', 'isatools', 'resources', 'schemas', + self.v2_create_schemas_path = pathlib.PurePosixPath( + pathlib.Path(__file__).parents[0], '..', '..', 'isatools', 'resources', 'schemas', 'isa_model_version_2_0_schemas', 'create') def test_validate_testdata_sampleassayplan_json(self): @@ -314,10 +315,9 @@ def test_validate_testdata_sampleassayplan_json(self): with open(os.path.join(self.v2_create_schemas_path, 'sample_assay_plan_schema.json')) as fp: sample_assay_plan_schema = json.load(fp) - resolver = RefResolver('file://{}'.format( - os.path.join(self.v2_create_schemas_path, - 'sample_assay_plan_schema.json')), - sample_assay_plan_schema) + res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path, + 'sample_assay_plan_schema.json').as_uri() + resolver = RefResolver(res_path, sample_assay_plan_schema) validator = Draft4Validator(sample_assay_plan_schema, resolver=resolver) validator.validate(json.load(test_case_fp)) @@ -342,10 +342,9 @@ def test_validate_testdata_treatment_sequence_json(self): with open(os.path.join(self.v2_create_schemas_path, 'treatment_sequence_schema.json')) as fp: treatment_sequence_schema = json.load(fp) - resolver = RefResolver('file://{}'.format( - os.path.join(self.v2_create_schemas_path, - 'treatment_sequence_schema.json')), - treatment_sequence_schema) + res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path, + 'treatment_sequence_schema.json').as_uri() + resolver = RefResolver(res_path, treatment_sequence_schema) validator = Draft4Validator(treatment_sequence_schema, resolver=resolver) validator.validate(json.load(test_case_fp)) From 00509085ba6baadaa9ca60e10dfcb6fdd58d7110 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Thu, 22 Feb 2024 02:18:55 -0500 Subject: [PATCH 03/11] Update defaults.py The previous regex didn't match any DOI I tried. I found this one in a blog post from Crossref where they say it matched 74.4M out of 74.9M DOIs that they have seen. The post is here: https://www.crossref.org/blog/dois-and-matching-regular-expressions/ --- isatools/isatab/defaults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isatools/isatab/defaults.py b/isatools/isatab/defaults.py index c38566e1..c2487123 100644 --- a/isatools/isatab/defaults.py +++ b/isatools/isatab/defaults.py @@ -34,7 +34,7 @@ def pbar(x): _RX_I_FILE_NAME = compile(r'i_(.*?)\.txt') _RX_DATA = compile(r'data\[(.*?)\]') _RX_COMMENT = compile(r'Comment\[(.*?)\]') -_RX_DOI = compile(r'(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\\S)+)') +_RX_DOI = compile(r'10.\d{4,9}/[-._;()/:a-z0-9A-Z]+') _RX_PMID = compile(r'[0-9]{8}') _RX_PMCID = compile(r'PMC[0-9]{8}') _RX_CHARACTERISTICS = compile(r'Characteristics\[(.*?)\]') From 2275ee912949fc20f1d653c282842ccfb2d272a0 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Wed, 6 Mar 2024 14:20:34 -0500 Subject: [PATCH 04/11] Update validate/test__core.py Since the DOI regex works now there are 2 fewer warnings. Previously it was warning about valid DOIs. --- tests/isatab/validate/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/isatab/validate/test_core.py b/tests/isatab/validate/test_core.py index e2b2c3cd..6c9aeda9 100644 --- a/tests/isatab/validate/test_core.py +++ b/tests/isatab/validate/test_core.py @@ -17,7 +17,7 @@ def test_b_ii_s_3(self): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3') with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file: r = validate(fp=data_file, config_dir=self.default_conf, origin="") - self.assertEqual(len(r['warnings']), 12) + self.assertEqual(len(r['warnings']), 10) def test_mtbls267(self): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'MTBLS267-partial') @@ -82,7 +82,7 @@ def is_investigation(investigation_df): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3') with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file: r = validate(data_file, rules=rules) - self.assertEqual(len(r['warnings']), 12) + self.assertEqual(len(r['warnings']), 10) rule = '12000' expected_error = { From 0f9214f9779d7d8cf316acff5012a9a45f6e11e1 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Thu, 7 Mar 2024 12:57:37 -0500 Subject: [PATCH 05/11] More changes The last changes were initially tested and created on Windows, but had errors on GitHub. I fixed those errors, but didn't retest on Windows. This has been tested on both and is problem free. --- tests/isatab/test_isatab.py | 2 +- tests/validators/test_validate_test_data.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/isatab/test_isatab.py b/tests/isatab/test_isatab.py index 25eee0d4..9936febe 100644 --- a/tests/isatab/test_isatab.py +++ b/tests/isatab/test_isatab.py @@ -440,7 +440,7 @@ def test_isatab_dump_source_sample_char_quant(self): s.process_sequence = [sample_collection_process] s.samples.append(sample1) i.studies = [s] - actual = isatab.dumps(i) + actual = replace_windows_newlines(isatab.dumps(i)) expected = """Source Name\tMaterial Type\tCharacteristics[organism]\tTerm Source REF\tTerm Accession Number\tCharacteristics[body weight]\tUnit\tTerm Source REF\tTerm Accession Number\tProtocol REF\tParameter Value[vessel]\tTerm Source REF\tTerm Accession Number\tParameter Value[storage temperature]\tUnit\tTerm Source REF\tTerm Accession Number\tSample Name\tCharacteristics[organism part]\tTerm Source REF\tTerm Accession Number\tCharacteristics[specimen mass]\tUnit\tTerm Source REF\tTerm Accession Number source1\tspecimen\tHuman\tNCBITAXON\thttp://purl.bioontology.org/ontology/STY/T016\t72\tkilogram\tUO\thttp://purl.obolibrary.org/obo/UO_0000009\tsample collection\teppendorf tube\tOBI\tpurl.org\t-20\tdegree Celsius\tUO\thttp://purl.obolibrary.org/obo/UO_0000027\tsample1\tliver\tUBERON\thttp://purl.obolibrary.org/obo/UBERON_0002107\t450.5\tmilligram\tUO\thttp://purl.obolibrary.org/obo/UO_0000022""" self.assertIn(expected, actual) diff --git a/tests/validators/test_validate_test_data.py b/tests/validators/test_validate_test_data.py index 4d8c1a4e..35fda568 100644 --- a/tests/validators/test_validate_test_data.py +++ b/tests/validators/test_validate_test_data.py @@ -305,7 +305,7 @@ class TestIsaJsonCreateTestData(unittest.TestCase): def setUp(self): self._reporting_level = logging.ERROR - self.v2_create_schemas_path = pathlib.PurePosixPath( + self.v2_create_schemas_path = pathlib.Path( pathlib.Path(__file__).parents[0], '..', '..', 'isatools', 'resources', 'schemas', 'isa_model_version_2_0_schemas', 'create') @@ -315,7 +315,7 @@ def test_validate_testdata_sampleassayplan_json(self): with open(os.path.join(self.v2_create_schemas_path, 'sample_assay_plan_schema.json')) as fp: sample_assay_plan_schema = json.load(fp) - res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path, + res_path = pathlib.Path("file://", self.v2_create_schemas_path, 'sample_assay_plan_schema.json').as_uri() resolver = RefResolver(res_path, sample_assay_plan_schema) validator = Draft4Validator(sample_assay_plan_schema, @@ -342,7 +342,7 @@ def test_validate_testdata_treatment_sequence_json(self): with open(os.path.join(self.v2_create_schemas_path, 'treatment_sequence_schema.json')) as fp: treatment_sequence_schema = json.load(fp) - res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path, + res_path = pathlib.Path("file://", self.v2_create_schemas_path, 'treatment_sequence_schema.json').as_uri() resolver = RefResolver(res_path, treatment_sequence_schema) validator = Draft4Validator(treatment_sequence_schema, From b874af46d6f89de64eadd012d8bba6581b93edf0 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Thu, 1 Feb 2024 16:24:37 -0500 Subject: [PATCH 06/11] Rename i_df and investigation_df. i_df and investigation_df are not a DataFrame as indicated in parameter typing or as the name would suggest. It's actually a dictionary of DataFrames and lists of DataFrames. I have renamed them to reflect this. --- isatools/isatab/validate/core.py | 8 +-- isatools/isatab/validate/rules/core.py | 10 ++-- isatools/isatab/validate/rules/defaults.py | 36 ++++++------ isatools/isatab/validate/rules/rules_00xx.py | 8 +-- isatools/isatab/validate/rules/rules_10xx.py | 62 ++++++++++---------- isatools/isatab/validate/rules/rules_30xx.py | 40 ++++++------- isatools/isatab/validate/rules/rules_40xx.py | 36 ++++++------ 7 files changed, 100 insertions(+), 100 deletions(-) diff --git a/isatools/isatab/validate/core.py b/isatools/isatab/validate/core.py index 2fa6a08c..da13d88b 100644 --- a/isatools/isatab/validate/core.py +++ b/isatools/isatab/validate/core.py @@ -186,21 +186,21 @@ def validate(fp: TextIO, built_rules = build_rules(rules) try: - i_df = load_investigation(fp=fp) + i_df_dict = load_investigation(fp=fp) params = { - "investigation_df": i_df, + "investigation_df_dict": i_df_dict, "dir_context": path.dirname(fp.name), "configs": config_dir, } investigation_validator = ISAInvestigationValidator(**params, **built_rules['investigation']) - for i, study_df in enumerate(i_df['studies']): + for i, study_df in enumerate(i_df_dict['studies']): study_filename = study_df.iloc[0]['Study File Name'] study_validator = ISAStudyValidator(validator=investigation_validator, study_index=i, study_filename=study_filename, study_df=study_df, **built_rules['studies']) assay_tables = list() - assay_df = study_validator.params['investigation_df']['s_assays'][i] + assay_df = study_validator.params['investigation_df_dict']['s_assays'][i] for x, assay_filename in enumerate(assay_df['Study Assay File Name'].tolist()): ISAAssayValidator(assay_tables=assay_tables, validator=study_validator, assay_index=x, assay_df=assay_df, assay_filename=assay_filename, **built_rules['assays']) diff --git a/isatools/isatab/validate/rules/core.py b/isatools/isatab/validate/rules/core.py index 40774499..d0fe08fd 100644 --- a/isatools/isatab/validate/rules/core.py +++ b/isatools/isatab/validate/rules/core.py @@ -108,14 +108,14 @@ def validate_rules(self, validator): class ISAInvestigationValidator: def __init__(self, - investigation_df: DataFrame, + investigation_df_dict: dict, dir_context: str, configs: str, available_rules: list = INVESTIGATION_RULES_MAPPING, rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES): """ The ISA investigation validator class - :param investigation_df: the investigation dataframe + :param investigation_df_dict: a dictionary of DataFrames and lists of DataFrames representing the investigation file :param dir_context: the directory of the investigation :param configs: directory of the XML config files :param available_rules: a customizable list of all available rules for investigation objects @@ -124,7 +124,7 @@ def __init__(self, self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules) self.has_validated = False self.params = { - 'investigation_df': investigation_df, + 'investigation_df_dict': investigation_df_dict, 'dir_context': dir_context, 'configs': configs, 'term_source_refs': None @@ -162,8 +162,8 @@ def __init__(self, self.params['study_sample_table'] = load_table(s_fp) self.params['study_sample_table'].filename = study_filename - protocol_names = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Name'].tolist() - protocol_types = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Type'].tolist() + protocol_names = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Name'].tolist() + protocol_types = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Type'].tolist() self.params['protocol_names_and_types'] = dict(zip(protocol_names, protocol_types)) self.params['study_group_size_in_comment'] = None diff --git a/isatools/isatab/validate/rules/defaults.py b/isatools/isatab/validate/rules/defaults.py index a655cd62..eaafb849 100644 --- a/isatools/isatab/validate/rules/defaults.py +++ b/isatools/isatab/validate/rules/defaults.py @@ -30,30 +30,30 @@ INVESTIGATION_RULES_MAPPING = [ - {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0006'}, + {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0006'}, - {'rule': sample_not_declared, 'params': ['investigation_df', 'dir_context'], 'identifier': '1003'}, - {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1007'}, - {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1008'}, - {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1009'}, - {'rule': check_protocol_names, 'params': ['investigation_df'], 'identifier': '1010'}, - {'rule': check_protocol_parameter_names, 'params': ['investigation_df'], 'identifier': '1011'}, - {'rule': check_study_factor_names, 'params': ['investigation_df'], 'identifier': '1012'}, + {'rule': sample_not_declared, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1003'}, + {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1007'}, + {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1008'}, + {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1009'}, + {'rule': check_protocol_names, 'params': ['investigation_df_dict'], 'identifier': '1010'}, + {'rule': check_protocol_parameter_names, 'params': ['investigation_df_dict'], 'identifier': '1011'}, + {'rule': check_study_factor_names, 'params': ['investigation_df_dict'], 'identifier': '1012'}, - {'rule': check_date_formats, 'params': ['investigation_df'], 'identifier': '3001'}, - {'rule': check_dois, 'params': ['investigation_df'], 'identifier': '3002'}, - {'rule': check_pubmed_ids_format, 'params': ['investigation_df'], 'identifier': '3003'}, - {'rule': check_ontology_sources, 'params': ['investigation_df'], 'identifier': '3008'}, + {'rule': check_date_formats, 'params': ['investigation_df_dict'], 'identifier': '3001'}, + {'rule': check_dois, 'params': ['investigation_df_dict'], 'identifier': '3002'}, + {'rule': check_pubmed_ids_format, 'params': ['investigation_df_dict'], 'identifier': '3003'}, + {'rule': check_ontology_sources, 'params': ['investigation_df_dict'], 'identifier': '3008'}, {'rule': load_config, 'params': ['configs'], 'identifier': '4001'}, - {'rule': check_measurement_technology_types, 'params': ['investigation_df', 'configs'], 'identifier': '4002'}, - {'rule': check_investigation_against_config, 'params': ['investigation_df', 'configs'], 'identifier': '4003'}, + {'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4002'}, + {'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4003'}, # copies - {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0008'}, - {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1019'}, - {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1020'}, - {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1021'}, + {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0008'}, + {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1019'}, + {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1020'}, + {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1021'}, ] STUDY_RULES_MAPPING = [ diff --git a/isatools/isatab/validate/rules/rules_00xx.py b/isatools/isatab/validate/rules/rules_00xx.py index 248d1447..1a52aa56 100644 --- a/isatools/isatab/validate/rules/rules_00xx.py +++ b/isatools/isatab/validate/rules/rules_00xx.py @@ -5,14 +5,14 @@ from isatools.isatab.defaults import log -def check_table_files_read(i_df, dir_context): +def check_table_files_read(i_df_dict, dir_context): """Used for rules 0006 and 0008 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param dir_context: Path to where the investigation file is found :return: None """ - for i, study_df in enumerate(i_df['studies']): + for i, study_df in enumerate(i_df_dict['studies']): study_filename = study_df.iloc[0]['Study File Name'] if study_filename != '': try: @@ -22,7 +22,7 @@ def check_table_files_read(i_df, dir_context): spl = "Study File {} does not appear to exist".format(study_filename) validator.add_error(message="Missing study tab file(s)", supplemental=spl, code=6) log.error("(E) Study File {} does not appear to exist".format(study_filename)) - for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()): + for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: with utf8_text_file_open(path.join(dir_context, assay_filename)): diff --git a/isatools/isatab/validate/rules/rules_10xx.py b/isatools/isatab/validate/rules/rules_10xx.py index 190cd273..6bfc1b0c 100644 --- a/isatools/isatab/validate/rules/rules_10xx.py +++ b/isatools/isatab/validate/rules/rules_10xx.py @@ -9,14 +9,14 @@ from isatools.isatab.utils import cell_has_value -def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context): +def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context): """Checks if samples found in assay tables are found in the study-sample table - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param dir_context: Path to where the investigation file is found :return: None """ - for i, study_df in enumerate(i_df['studies']): + for i, study_df in enumerate(i_df_dict['studies']): study_filename = study_df.iloc[0]['Study File Name'] if study_filename != '': try: @@ -25,7 +25,7 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context): study_samples = set(study_df['Sample Name']) except FileNotFoundError: pass - for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()): + for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp: @@ -40,15 +40,15 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context): pass -def check_study_factor_usage(i_df, dir_context): +def check_study_factor_usage(i_df_dict, dir_context): """Used for rules 1008 and 1021 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param dir_context: Path to where the investigation file is found :return: None """ - for i, study_df in enumerate(i_df['studies']): - study_factors_declared = set(i_df['s_factors'][i]['Study Factor Name'].tolist()) + for i, study_df in enumerate(i_df_dict['studies']): + study_factors_declared = set(i_df_dict['s_factors'][i]['Study Factor Name'].tolist()) study_filename = study_df.iloc[0]['Study File Name'] error_spl = "Some factors used in an study file {} are not declared in the investigation file: {}" error_msg = "Some factors are not declared in the investigation" @@ -66,7 +66,7 @@ def check_study_factor_usage(i_df, dir_context): validator.add_error(message=error_msg, supplemental=spl, code=1008) except FileNotFoundError: pass - for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()): + for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: study_factors_used = set() @@ -92,7 +92,7 @@ def check_study_factor_usage(i_df, dir_context): study_factors_used = study_factors_used.union(set(fv)) except FileNotFoundError: pass - for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()): + for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp: @@ -109,15 +109,15 @@ def check_study_factor_usage(i_df, dir_context): .format(list(study_factors_declared - study_factors_used))) -def check_protocol_usage(i_df, dir_context): +def check_protocol_usage(i_df_dict, dir_context): """Used for rules 1007 and 1019 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param dir_context: Path to where the investigation file is found :return: None """ - for i, study_df in enumerate(i_df['studies']): - protocols_declared = set(i_df['s_protocols'][i]['Study Protocol Name'].tolist()) + for i, study_df in enumerate(i_df_dict['studies']): + protocols_declared = set(i_df_dict['s_protocols'][i]['Study Protocol Name'].tolist()) protocols_declared.add('') study_filename = study_df.iloc[0]['Study File Name'] if study_filename != '': @@ -136,7 +136,7 @@ def check_protocol_usage(i_df, dir_context): log.error("(E) {}".format(spl)) except FileNotFoundError: pass - for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()): + for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: protocol_refs_used = set() @@ -165,7 +165,7 @@ def check_protocol_usage(i_df, dir_context): except FileNotFoundError: pass for j, assay_filename in enumerate( - i_df['s_assays'][i]['Study Assay File Name'].tolist()): + i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp: @@ -183,16 +183,16 @@ def check_protocol_usage(i_df, dir_context): log.warning(warning) -def check_protocol_parameter_usage(i_df, dir_context): +def check_protocol_parameter_usage(i_df_dict, dir_context): """Used for rules 1009 and 1020 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param dir_context: Path to where the investigation file is found :return: None """ - for i, study_df in enumerate(i_df['studies']): + for i, study_df in enumerate(i_df_dict['studies']): protocol_parameters_declared = set() - protocol_parameters_per_protocol = set(i_df['s_protocols'][i]['Study Protocol Parameters Name'].tolist()) + protocol_parameters_per_protocol = set(i_df_dict['s_protocols'][i]['Study Protocol Parameters Name'].tolist()) for protocol_parameters in protocol_parameters_per_protocol: parameters_list = protocol_parameters.split(';') protocol_parameters_declared = protocol_parameters_declared.union(set(parameters_list)) @@ -216,7 +216,7 @@ def check_protocol_parameter_usage(i_df, dir_context): log.error(error) except FileNotFoundError: pass - for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()): + for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: protocol_parameters_used = set() @@ -246,7 +246,7 @@ def check_protocol_parameter_usage(i_df, dir_context): protocol_parameters_used = protocol_parameters_used.union(set(pv)) except FileNotFoundError: pass - for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()): + for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()): if assay_filename != '': try: with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp: @@ -263,13 +263,13 @@ def check_protocol_parameter_usage(i_df, dir_context): log.warning(warning) -def check_protocol_names(i_df): +def check_protocol_names(i_df_dict): """Used for rule 1010 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ - for study_protocols_df in i_df['s_protocols']: + for study_protocols_df in i_df_dict['s_protocols']: for i, protocol_name in enumerate(study_protocols_df['Study Protocol Name'].tolist()): # DataFrames labels empty cells as 'Unnamed: n' if protocol_name == '' or 'Unnamed: ' in protocol_name: @@ -279,13 +279,13 @@ def check_protocol_names(i_df): log.warning(warning) -def check_protocol_parameter_names(i_df): +def check_protocol_parameter_names(i_df_dict): """Used for rule 1011 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ - for study_protocols_df in i_df['s_protocols']: + for study_protocols_df in i_df_dict['s_protocols']: for i, protocol_parameters_names in enumerate(study_protocols_df['Study Protocol Parameters Name'].tolist()): # There's an empty cell if no protocols if len(protocol_parameters_names.split(sep=';')) > 1: @@ -298,13 +298,13 @@ def check_protocol_parameter_names(i_df): log.warning(warning) -def check_study_factor_names(i_df): +def check_study_factor_names(i_df_dict): """Used for rule 1012 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ - for study_factors_df in i_df['s_factors']: + for study_factors_df in i_df_dict['s_factors']: for i, factor_name in enumerate(study_factors_df['Study Factor Name'].tolist()): # DataFrames labels empty cells as 'Unnamed: n' if factor_name == '' or 'Unnamed: ' in factor_name: diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py index 22e2e74a..0321348c 100644 --- a/isatools/isatab/validate/rules/rules_30xx.py +++ b/isatools/isatab/validate/rules/rules_30xx.py @@ -7,27 +7,27 @@ from isatools.isatab.utils import cell_has_value -def check_filenames_present(i_df: DataFrame) -> None: +def check_filenames_present(i_df_dict: DataFrame) -> None: """ Used for rule 3005 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ - for s_pos, study_df in enumerate(i_df['studies']): + for s_pos, study_df in enumerate(i_df_dict['studies']): if study_df.iloc[0]['Study File Name'] == '': validator.add_warning(message="Missing Study File Name", supplemental="STUDY.{}".format(s_pos), code=3005) log.warning("(W) A study filename is missing for STUDY.{}".format(s_pos)) - for a_pos, filename in enumerate(i_df['s_assays'][s_pos]['Study Assay File Name'].tolist()): + for a_pos, filename in enumerate(i_df_dict['s_assays'][s_pos]['Study Assay File Name'].tolist()): if filename == '': spl = "STUDY.{}, STUDY ASSAY.{}".format(s_pos, a_pos) validator.add_warning.append(message="Missing assay file name", supplemental=spl, code=3005) log.warning("(W) An assay filename is missing for STUDY ASSAY.{}".format(a_pos)) -def check_date_formats(i_df): +def check_date_formats(i_df_dict): """ Used for rule 3001 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ @@ -45,13 +45,13 @@ def check_iso8601_date(date_str): validator.add_warning(message="Date is not ISO8601 formatted", supplemental=spl, code=3001) log.warning("(W) Date {} does not conform to ISO8601 format".format(date_str)) - release_date_vals = i_df['investigation']['Investigation Public Release Date'].tolist() + release_date_vals = i_df_dict['investigation']['Investigation Public Release Date'].tolist() if len(release_date_vals) > 0: check_iso8601_date(release_date_vals[0]) - sub_date_values = i_df['investigation']['Investigation Submission Date'].tolist() + sub_date_values = i_df_dict['investigation']['Investigation Submission Date'].tolist() if len(sub_date_values) > 0: check_iso8601_date(sub_date_values[0]) - for i, study_df in enumerate(i_df['studies']): + for i, study_df in enumerate(i_df_dict['studies']): release_date_vals = study_df['Study Public Release Date'].tolist() if len(release_date_vals) > 0: check_iso8601_date(release_date_vals[0]) @@ -60,10 +60,10 @@ def check_iso8601_date(date_str): check_iso8601_date(sub_date_values[0]) -def check_dois(i_df): +def check_dois(i_df_dict): """ Used for rule 3002 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ @@ -79,17 +79,17 @@ def check_doi(doi_str): validator.add_warning(message="DOI is not valid format", supplemental=spl, code=3002) log.warning("(W) DOI {} does not conform to DOI format".format(doi_str)) - for doi in i_df['i_publications']['Investigation Publication DOI'].tolist(): + for doi in i_df_dict['i_publications']['Investigation Publication DOI'].tolist(): check_doi(doi) - for i, study_df in enumerate(i_df['s_publications']): + for i, study_df in enumerate(i_df_dict['s_publications']): for doi in study_df['Study Publication DOI'].tolist(): check_doi(doi) -def check_pubmed_ids_format(i_df): +def check_pubmed_ids_format(i_df_dict): """ Used for rule 3003 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ @@ -105,21 +105,21 @@ def check_pubmed_id(pubmed_id_str): validator.add_warning(message="PubMed ID is not valid format", supplemental=spl, code=3003) log.warning("(W) PubMed ID {} is not valid format".format(pubmed_id_str)) - for doi in i_df['i_publications']['Investigation PubMed ID'].tolist(): + for doi in i_df_dict['i_publications']['Investigation PubMed ID'].tolist(): check_pubmed_id(str(doi)) - for study_pubs_df in i_df['s_publications']: + for study_pubs_df in i_df_dict['s_publications']: for doi in study_pubs_df['Study PubMed ID'].tolist(): check_pubmed_id(str(doi)) -def check_ontology_sources(i_df): +def check_ontology_sources(i_df_dict): """ Used for rule 3008 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :return: None """ term_source_refs = [] - for i, ontology_source_name in enumerate(i_df['ontology_sources']['Term Source Name'].tolist()): + for i, ontology_source_name in enumerate(i_df_dict['ontology_sources']['Term Source Name'].tolist()): if ontology_source_name == '' or 'Unnamed: ' in ontology_source_name: spl = "pos={}".format(i) warn = "(W) An Ontology Source Reference at position {} is missing Term Source Name, so can't be referenced" diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py index 7f39c0df..84b87ace 100644 --- a/isatools/isatab/validate/rules/rules_40xx.py +++ b/isatools/isatab/validate/rules/rules_40xx.py @@ -13,10 +13,10 @@ ) -def check_investigation_against_config(i_df, configs): +def check_investigation_against_config(i_df_dict, configs): """Checks investigation file against the loaded configurations - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param configs: A dictionary of ISA Configuration objects :return: None """ @@ -52,18 +52,18 @@ def check_section_against_required_fields_one_value(section, required, i=0): config_fields = configs[('[investigation]', '')].get_isatab_configuration()[0].get_field() required_fields = [i.header for i in config_fields if i.is_required] - check_section_against_required_fields_one_value(i_df['investigation'], required_fields) - check_section_against_required_fields_one_value(i_df['i_publications'], required_fields) - check_section_against_required_fields_one_value(i_df['i_contacts'], required_fields) + check_section_against_required_fields_one_value(i_df_dict['investigation'], required_fields) + check_section_against_required_fields_one_value(i_df_dict['i_publications'], required_fields) + check_section_against_required_fields_one_value(i_df_dict['i_contacts'], required_fields) - for x, study_df in enumerate(i_df['studies']): - check_section_against_required_fields_one_value(i_df['studies'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df['s_design_descriptors'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df['s_publications'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df['s_factors'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df['s_assays'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df['s_protocols'][x], required_fields, x) - check_section_against_required_fields_one_value(i_df['s_contacts'][x], required_fields, x) + for x, study_df in enumerate(i_df_dict['studies']): + check_section_against_required_fields_one_value(i_df_dict['studies'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_design_descriptors'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_publications'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_factors'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_assays'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_protocols'][x], required_fields, x) + check_section_against_required_fields_one_value(i_df_dict['s_contacts'][x], required_fields, x) def load_config(config_dir): @@ -92,16 +92,16 @@ def load_config(config_dir): return configs -def check_measurement_technology_types(i_df, configs): +def check_measurement_technology_types(i_df_dict, configs): """Rule 4002 - :param i_df: An investigation DataFrame + :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file :param configs: A dictionary of ISA Configuration objects :return: None """ - for i, assay_df in enumerate(i_df['s_assays']): - measurement_types = assay_df['Study Assay Measurement Type'].tolist() - technology_types = assay_df['Study Assay Technology Type'].tolist() + for i, study_assays_df in enumerate(i_df_dict['s_assays']): + measurement_types = study_assays_df['Study Assay Measurement Type'].tolist() + technology_types = study_assays_df['Study Assay Technology Type'].tolist() if len(measurement_types) == len(technology_types): for x, measurement_type in enumerate(measurement_types): lowered_mt = measurement_types[x].lower() From 299a93dc59bf9d98cab8dbda001a2b0c63975059 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Thu, 1 Feb 2024 17:33:53 -0500 Subject: [PATCH 07/11] Missed a parameter type change --- isatools/isatab/validate/rules/rules_30xx.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py index 0321348c..1716b4df 100644 --- a/isatools/isatab/validate/rules/rules_30xx.py +++ b/isatools/isatab/validate/rules/rules_30xx.py @@ -1,13 +1,11 @@ import iso8601 -from pandas import DataFrame - from isatools.isatab.validate.store import validator from isatools.isatab.defaults import log, _RX_DOI, _RX_PMID, _RX_PMCID from isatools.isatab.utils import cell_has_value -def check_filenames_present(i_df_dict: DataFrame) -> None: +def check_filenames_present(i_df_dict: dict) -> None: """ Used for rule 3005 :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file From 33dae30ee40d80676e6b24d4be8b86b39f10fd4b Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Thu, 7 Mar 2024 17:24:22 -0500 Subject: [PATCH 08/11] Updated tests There were 2 tests that needed to be changed to match the new naming. --- tests/isatab/validate/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/isatab/validate/test_core.py b/tests/isatab/validate/test_core.py index e2b2c3cd..021e0705 100644 --- a/tests/isatab/validate/test_core.py +++ b/tests/isatab/validate/test_core.py @@ -47,7 +47,7 @@ def test_bii_s_7(self): def test_print_rule(self): raw_rule = INVESTIGATION_RULES_MAPPING[0] rule = Rule(**raw_rule) - expected_string = "rule=check_table_files_read, params=['investigation_df', 'dir_context'], identifier=0006" + expected_string = "rule=check_table_files_read, params=['investigation_df_dict', 'dir_context'], identifier=0006" self.assertEqual(str(rule), expected_string) def test_rules_error(self): @@ -69,7 +69,7 @@ def is_investigation(investigation_df): *INVESTIGATION_RULES_MAPPING, { 'rule': is_investigation, - 'params': ['investigation_df'], + 'params': ['investigation_df_dict'], 'identifier': '6000' } ], From e95a8de513a8f7979d951017b64f7ac3617de477 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Mon, 5 Feb 2024 16:28:35 -0500 Subject: [PATCH 09/11] Fairly significant changes to check_protocol_fields I started editing this function because of the "Only one protocol reference should be used in a Protocol REF column." message(s), but I found some other issues to address as well. --- isatools/isatab/validate/rules/rules_40xx.py | 61 ++++++++------------ 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py index 84b87ace..9679923d 100644 --- a/isatools/isatab/validate/rules/rules_40xx.py +++ b/isatools/isatab/validate/rules/rules_40xx.py @@ -254,30 +254,22 @@ def pairwise(iterable): a, b = tee(iterable) next(b, None) return zip(a, b) - - proto_ref_index = [i for i in table.columns if 'protocol ref' in i.lower()] - result = True - for each in proto_ref_index: - prots_found = set() - for cell in table[each]: - prots_found.add(cell) - if len(prots_found) > 1: - log.warning("(W) Multiple protocol references {} are found in {}".format(prots_found, each)) - log.warning("(W) Only one protocol reference should be used in a Protocol REF column.") - result = False - if result: - field_headers = [i for i in table.columns - if i.lower().endswith(' name') - or i.lower().endswith(' data file') - or i.lower().endswith(' data matrix file')] - protos = [i for i in table.columns if i.lower() == 'protocol ref'] - if len(protos) > 0: - last_proto_index = table.columns.get_loc(protos[len(protos) - 1]) - else: - last_proto_index = -1 - last_mat_or_dat_index = table.columns.get_loc(field_headers[len(field_headers) - 1]) - if last_proto_index > last_mat_or_dat_index: - log.warning("(W) Protocol REF column without output in file '" + table.filename + "'") + + field_headers = [i for i in table.columns + if i.lower().endswith(' name') + or i.lower().endswith(' data file') + or i.lower().endswith(' data matrix file')] + protos = [i for i in table.columns if i.lower() == 'protocol ref'] + if len(protos) > 0: + last_proto_index = table.columns.get_loc(protos[len(protos) - 1]) + else: + last_proto_index = -1 + last_mat_or_dat_index = table.columns.get_loc(field_headers[len(field_headers) - 1]) + if last_proto_index > last_mat_or_dat_index: + spl = "Protocol REF column without output in file '" + table.filename + "'" + validator.add_warning(message="Missing Protocol Value", supplemental=spl, code=1007) + log.warning("(W) Protocol REF column is not followed by a material or data node in file '" + table.filename + "'") + if cfg.get_isatab_configuration(): for left, right in pairwise(field_headers): cleft = None cright = None @@ -294,16 +286,15 @@ def pairwise(iterable): fprotos_headers = [i for i in raw_headers if 'protocol ref' in i.lower()] fprotos = list() for header in fprotos_headers: - proto_name = table.iloc[0][header] - try: - proto_type = proto_map[proto_name] - fprotos.append(proto_type) - except KeyError: - spl = ("Could not find protocol type for protocol name '{}', trying to validate_rules against name " - "only").format(proto_name) - validator.add_warning(message="Missing Protocol declaration", supplemental=spl, code=1007) - log.warning("(W) {}".format(spl)) - fprotos.append(proto_name) + proto_names = list(table.loc[:, header].unique()) + for proto_name in proto_names: + proto_type = proto_map.get(proto_name) + if not proto_type and proto_name: + spl = ("Could not find protocol type for protocol name '{}' in file '{}'" ).format(proto_name, table.filename) + validator.add_warning(message="Missing Protocol Declaration", supplemental=spl, code=1007) + log.warning("(W) {}".format(spl)) + else: + fprotos.append(proto_type) invalid_protos = set(cprotos) - set(fprotos) if len(invalid_protos) > 0: spl = ("Protocol(s) of type {} defined in the ISA-configuration expected as a between '{}' and " @@ -311,8 +302,6 @@ def pairwise(iterable): spl = spl.format(str(list(invalid_protos)), cleft.header, cright.header, table.filename) validator.add_warning(message="Missing Protocol declaration", supplemental=spl, code=1007) log.warning("(W) {}".format(spl)) - result = False - return result def load_table_checks(df, filename): From 733b74ff4384b95fd973365c754e19654fb2e4be Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Mon, 5 Feb 2024 16:44:35 -0500 Subject: [PATCH 10/11] Missed a small change. --- isatools/isatab/validate/rules/rules_40xx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py index 9679923d..bd223ce5 100644 --- a/isatools/isatab/validate/rules/rules_40xx.py +++ b/isatools/isatab/validate/rules/rules_40xx.py @@ -266,9 +266,9 @@ def pairwise(iterable): last_proto_index = -1 last_mat_or_dat_index = table.columns.get_loc(field_headers[len(field_headers) - 1]) if last_proto_index > last_mat_or_dat_index: - spl = "Protocol REF column without output in file '" + table.filename + "'" + spl = "(W) Protocol REF column is not followed by a material or data node in file '" + table.filename + "'" validator.add_warning(message="Missing Protocol Value", supplemental=spl, code=1007) - log.warning("(W) Protocol REF column is not followed by a material or data node in file '" + table.filename + "'") + log.warning(spl) if cfg.get_isatab_configuration(): for left, right in pairwise(field_headers): cleft = None From 9040989de1829c9ba630c09a5bc984106b661f01 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Sun, 4 Feb 2024 14:06:12 -0500 Subject: [PATCH 11/11] Changed messaging on study assays errors The messaging on the 2 checks dealing with study assays did not make it clear exactly which assay and which study had the issue. I changed the messaging so it clearly indicates the affected study and assay. --- isatools/isatab/validate/rules/rules_30xx.py | 2 +- isatools/isatab/validate/rules/rules_40xx.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py index a659786f..be876dc5 100644 --- a/isatools/isatab/validate/rules/rules_30xx.py +++ b/isatools/isatab/validate/rules/rules_30xx.py @@ -19,7 +19,7 @@ def check_filenames_present(i_df_dict: dict) -> None: if filename == '': spl = "STUDY.{}, STUDY ASSAY.{}".format(s_pos, a_pos) validator.add_warning.append(message="Missing assay file name", supplemental=spl, code=3005) - log.warning("(W) An assay filename is missing for STUDY ASSAY.{}".format(a_pos)) + log.warning("(W) An assay filename is missing for STUDY.{}, STUDY ASSAY.{}".format(s_pos, a_pos)) def check_date_formats(i_df_dict): diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py index 84b87ace..f7c78adb 100644 --- a/isatools/isatab/validate/rules/rules_40xx.py +++ b/isatools/isatab/validate/rules/rules_40xx.py @@ -107,10 +107,10 @@ def check_measurement_technology_types(i_df_dict, configs): lowered_mt = measurement_types[x].lower() lowered_tt = technology_types[x].lower() if (lowered_mt, lowered_tt) not in configs.keys(): - spl = "Measurement {}/technology {}, STUDY ASSAY.{}" - spl = spl.format(measurement_types[x], technology_types[x], i) + spl = "Measurement {}/technology {}, STUDY.{}, STUDY ASSAY.{}" + spl = spl.format(measurement_types[x], technology_types[x], i, x) error = ("(E) Could not load configuration for measurement type '{}' and technology type '{}' " - "for STUDY ASSAY.{}'").format(measurement_types[x], technology_types[x], i) + "for STUDY.{}, STUDY ASSAY.{}'").format(measurement_types[x], technology_types[x], i, x) validator.add_error(message="Measurement/technology type invalid", supplemental=spl, code=4002) log.error(error)