From 1e888238f8e9aab98657f184af9ab60538c89ad5 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Tue, 30 Jan 2024 17:06:58 -0500
Subject: [PATCH 01/11] Fix rule and check.

The response[0] in core was setting what was supposed to be a list to a single string from the list instead. The check for whether a ontology source was in the list was nested beneath another check instead of being its own check.
---
 isatools/isatab/validate/rules/core.py       |  2 +-
 isatools/isatab/validate/rules/rules_30xx.py | 16 +++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/isatools/isatab/validate/rules/core.py b/isatools/isatab/validate/rules/core.py
index 40774499..c4f603ab 100644
--- a/isatools/isatab/validate/rules/core.py
+++ b/isatools/isatab/validate/rules/core.py
@@ -54,7 +54,7 @@ def execute(self, validator_params: dict) -> None:
         try:
             response = self.rule(*params)
             if self.identifier == '3008':
-                validator_params['term_source_refs'] = response[0]
+                validator_params['term_source_refs'] = response
             if self.identifier == '4001':
                 validator_params['configs'] = response
             self.executed = True
diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py
index 79819624..ed255854 100644
--- a/isatools/isatab/validate/rules/rules_30xx.py
+++ b/isatools/isatab/validate/rules/rules_30xx.py
@@ -152,6 +152,7 @@ def check_single_field(cell_value, source, acc, cfield, filename):
         :param filename: Filename of the table
         :return: True if OK, False if not OK
         """
+        return_value = True
         if ((cell_has_value(cell_value) and not cell_has_value(source) and cell_has_value(acc))
                 or not cell_has_value(cell_value)):
             msg = "Missing Term Source REF in annotation or missing Term Source Name"
@@ -159,13 +160,14 @@ def check_single_field(cell_value, source, acc, cfield, filename):
                    "label/accession/source are provided.").format(cfield.header, filename)
             validator.add_warning(message=msg, supplemental=spl, code=3008)
             log.warning("(W) {}".format(spl))
-            if source not in tsrs:
-                spl = ("Term Source REF, for the field '{}' in the file '{}' does not refer to a declared "
-                       "Ontology Source.").format(cfield.header, filename)
-                validator.add_warning(message="Term Source REF reference broken", supplemental=spl, code=3011)
-                log.warning("(W) {}".format(spl))
-            return False
-        return True
+            return_value = False
+        if cell_has_value(source) and source not in tsrs:
+            spl = ("Term Source REF, for the field '{}' in the file '{}' does not refer to a declared "
+                   "Ontology Source.").format(cfield.header, filename)
+            validator.add_warning(message="Term Source REF reference broken", supplemental=spl, code=3011)
+            log.warning("(W) {}".format(spl))
+            return_value = False
+        return return_value
 
     result = True
     nfields = len(table.columns)

From 691c35fcdcbd94a7954b5c736183e2cf518c81e5 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Wed, 6 Mar 2024 10:12:41 -0500
Subject: [PATCH 02/11] Testing changes to run on Windows.

---
 tests/convert/test_isatab2w4m.py            | 17 ++++++++++----
 tests/isatab/test_isatab.py                 | 26 ++++++++++++---------
 tests/validators/test_validate_test_data.py | 19 +++++++--------
 3 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/tests/convert/test_isatab2w4m.py b/tests/convert/test_isatab2w4m.py
index 30e05bcb..5096a98a 100644
--- a/tests/convert/test_isatab2w4m.py
+++ b/tests/convert/test_isatab2w4m.py
@@ -1,6 +1,5 @@
 # Test conversion to W4M format
 
-import filecmp
 import os
 import shutil
 import tempfile
@@ -9,6 +8,16 @@
 from isatools.tests import utils
 
 
+def universal_filecmp(f1, f2):
+    with open(f1, 'r') as fp1, open(f2, 'r') as fp2:
+        while True:
+            b1 = fp1.readline()
+            b2 = fp2.readline()
+            if b1 != b2:
+                return False
+            if not b1:
+                return True
+
 # Test presence of data folder
 def setUpModule():
     if not os.path.exists(utils.DATA_DIR):
@@ -46,7 +55,7 @@ def plain_test(self, study, test_dir):
             output_file = os.path.join(self._tmp_dir, '.'.join(
                 ['-'.join([study, 'w4m', x]), 'tsv']))
             self.assertTrue(os.path.exists(output_file))
-            self.assertTrue(filecmp.cmp(output_file, ref_file, shallow=False),
+            self.assertTrue(universal_filecmp(output_file, ref_file),
                             'Output file "{0}" differs from reference file "{1}".'.format(output_file, ref_file))
 
     # Test MTBLS30
@@ -89,7 +98,7 @@ def na_filtering_test(self, study, test_dir, samp_na_filtering=None,
                 'sample-metadata', 'variable-metadata', 'sample-variable-matrix']:
             self.assertTrue(os.path.exists(output_files[x]))
         self.assertTrue(
-            filecmp.cmp(output_files[x], ref_files[x]),
+            universal_filecmp(output_files[x], ref_files[x]),
             'Output file "{0}" differs from reference file "{1}".'.format(
                 output_files[x], ref_files[x]))
 
@@ -140,5 +149,5 @@ def test_assay_selection(self):
                     ['-'.join([study, 'w4m', x, assay]), 'tsv']))
                 self.assertTrue(os.path.exists(output_file))
                 self.assertTrue(
-                    filecmp.cmp(output_file, ref_file),
+                    universal_filecmp(output_file, ref_file),
                     'Output file "{0}" differs from reference file "{1}".'.format(output_file, ref_file))
diff --git a/tests/isatab/test_isatab.py b/tests/isatab/test_isatab.py
index 1e249e50..25eee0d4 100644
--- a/tests/isatab/test_isatab.py
+++ b/tests/isatab/test_isatab.py
@@ -28,6 +28,9 @@ def setUpModule():
                                 "git clone -b tests --single-branch git@github.com:ISA-tools/ISAdatasets {0}"
                                 .format(utils.DATA_DIR))
 
+def replace_windows_newlines(input_string):
+    return input_string.replace('\r\r\n', '\n').replace('\r\n', '\n').replace('\r', '\n')
+
 
 class TestIsaMerge(unittest.TestCase):
 
@@ -1069,7 +1072,7 @@ def test_source_protocol_ref_sample(self):
         i.studies = [s]
         expected = """Source Name\tProtocol REF\tSample Name
 source1\tsample collection\tsample1"""
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_source_protocol_ref_sample_x2(self):
         i = Investigation()
@@ -1167,7 +1170,7 @@ def test_source_protocol_ref_sample_with_characteristics(self):
         i.studies = [s]
         expected = """Source Name\tCharacteristics[reference descriptor]\tProtocol REF\tSample Name\tCharacteristics[organism part]
 source1\tnot applicable\tsample collection\tsample1\tliver"""
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_source_protocol_ref_sample_with_parameter_values(self):
         i = Investigation()
@@ -1188,7 +1191,7 @@ def test_source_protocol_ref_sample_with_parameter_values(self):
         i.studies = [s]
         expected = """Source Name\tProtocol REF\tParameter Value[temperature]\tSample Name
 source1\tsample collection\t10\tsample1"""
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_source_protocol_ref_sample_with_factor_values(self):
         i = Investigation()
@@ -1216,11 +1219,11 @@ def test_source_protocol_ref_sample_with_factor_values(self):
         s.assays = [a]
         expected_study_table = """Source Name\tProtocol REF\tSample Name\tFactor Value[study group]
 source1\tsample collection\tsample1\tStudy group 1"""
-        self.assertIn(expected_study_table, isatab.dumps(i))
+        self.assertIn(expected_study_table, replace_windows_newlines(isatab.dumps(i)))
         expected_assay_table = """Sample Name\tFactor Value[study group]\tProtocol REF
 sample1\tStudy group 1\textraction"""
         self.assertIn(expected_assay_table,
-                      isatab.dumps(i, write_fvs_in_assay_table=True))
+                      replace_windows_newlines(isatab.dumps(i, write_fvs_in_assay_table=True)))
 
     def test_source_protocol_ref_protocol_ref_sample(self):
         i = Investigation()
@@ -1239,7 +1242,7 @@ def test_source_protocol_ref_protocol_ref_sample(self):
         i.studies = [s]
         expected = """Source Name\tProtocol REF\tProtocol REF\tSample Name
 source1\tsample collection\taliquoting\taliquot1"""
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_source_protocol_ref_sample_protocol_ref_sample(self):
         i = Investigation()
@@ -1261,7 +1264,7 @@ def test_source_protocol_ref_sample_protocol_ref_sample(self):
         i.studies = [s]
         expected = """Source Name\tProtocol REF\tSample Name\tProtocol REF\tSample Name
 source1\tsample collection\tsample1\taliquoting\taliquot1"""
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_sample_protocol_ref_material_protocol_ref_data2(self):
         i = Investigation()
@@ -1295,7 +1298,7 @@ def test_sample_protocol_ref_material_protocol_ref_data2(self):
         i.studies = [s]
         expected = (f"""Sample Name\tProtocol REF\tExtract Name\tProtocol REF\tAssay Name\tRaw Data File\tComment[checksum type]\tComment[checksum]\n""" +
                     f"""sample1\textraction\textract1\tnucleic acid sequencing\tassay-1\tdatafile.raw\t{cs_comment1.value}\t{cs_comment2.value}""")
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_sample_protocol_ref_material_protocol_ref_data3(self):
         i = Investigation()
@@ -1334,7 +1337,7 @@ def test_sample_protocol_ref_material_protocol_ref_data3(self):
 
         # self.assertIn(expected_line1, dump_out)
         # self.assertIn(expected_line2, dump_out)
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_sample_protocol_ref_material_protocol_ref_data4(self):
         i = Investigation()
@@ -1373,7 +1376,7 @@ def test_sample_protocol_ref_material_protocol_ref_data4(self):
 
         # self.assertIn(expected_line1, dump_out)
         # self.assertIn(expected_line2, dump_out)
-        self.assertIn(expected, isatab.dumps(i))
+        self.assertIn(expected, replace_windows_newlines(isatab.dumps(i)))
 
     def test_sample_protocol_ref_material_protocol_ref_data_x2(self):
         i = Investigation()
@@ -1710,7 +1713,7 @@ def test_isatab_preprocess_issue235(self):
         test_isatab_str = b""""Sample Name"	"Protocol REF"	"Parameter Value[medium]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[serum]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[serum concentration]"	"Unit"	"Term Source REF"	"Term Accession Number"	"Parameter Value[medium volume]"	"Unit"	"Term Source REF"	"Term Accession Number"	"Parameter Value[migration modulator]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[modulator concentration]"	"Unit"	"Term Source REF"	"Term Accession Number"	"Parameter Value[modulator distribution]"	"Term Source REF"	"Term Accession Number"	"Protocol REF"	"Parameter Value[imaging technique]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[imaging technique temporal feature]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[acquisition duration]"	"Unit"	"Term Source REF"	"Term Accession Number"	"Parameter Value[time interval]"	"Unit"	"Term Source REF"	"Term Accession Number"	"Parameter Value[objective type]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[objective magnification]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[objective numerical aperture]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[acquisition channel count]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[reporter]"	"Term Source REF"	"Term Accession Number"	"Parameter Value[voxel size]"	"Unit"	"Term Source REF"	"Term Accession Number"	"Assay Name"	"Raw Data File"	"Protocol REF"	"Parameter Value[software]"	"Term Source REF"	"Term Accession Number"	"Data Transformation Name"	"Derived Data File"
 "culture1"	"migration assay"	"RPMI-1640"	""	""	"Heat Inactivated Fetal Bovine Serum "	""	""	"10"	"%"	"UO"	"http://purl.obolibrary.org/obo/UO_0000165"	"300"	"microliter"	"UO"	"http://purl.obolibrary.org/obo/UO_0000101"	""	""	""	""	""	""	""	"gradient"	""	""	"imaging"	"phase-contrast microscopy"	""	""	"dynamic"	""	""	"6"	"hour"	"UO"	"http://purl.obolibrary.org/obo/UO_0000032"	"15"	"minute"	"UO"	"http://purl.obolibrary.org/obo/UO_0000031"	""	""	""	"20"	""	""	""	""	""	""	""	""	""	""	""	""	""	""	""	"culture1"	""	"data transformation"	"CELLMIA"	""	""	""	""
 """
-        with tempfile.NamedTemporaryFile() as tmp:
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
             tmp.write(test_isatab_str)
             tmp.seek(0)
             study_assay_parser = isatab_parser.StudyAssayParser('mock.txt')
@@ -1719,6 +1722,7 @@ def test_isatab_preprocess_issue235(self):
                 if """Protocol REF\tData Transformation Name""" in header:
                     self.fail('Incorrectly inserted Protocol REF before '
                               'Data Transformation Name')
+        os.remove(tmp.name)
 
     def test_isatab_factor_value_parsing_issue270(self):
         with open(os.path.join(self._tab_data_dir, 'issue270', 'i_matteo.txt'),
diff --git a/tests/validators/test_validate_test_data.py b/tests/validators/test_validate_test_data.py
index 33fb9840..4d8c1a4e 100644
--- a/tests/validators/test_validate_test_data.py
+++ b/tests/validators/test_validate_test_data.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import unittest
+import pathlib
 
 from jsonschema import Draft4Validator
 from jsonschema import RefResolver
@@ -304,8 +305,8 @@ class TestIsaJsonCreateTestData(unittest.TestCase):
 
     def setUp(self):
         self._reporting_level = logging.ERROR
-        self.v2_create_schemas_path = os.path.join(
-            os.path.dirname(__file__), '../..', 'isatools', 'resources', 'schemas',
+        self.v2_create_schemas_path = pathlib.PurePosixPath(
+            pathlib.Path(__file__).parents[0], '..', '..', 'isatools', 'resources', 'schemas',
             'isa_model_version_2_0_schemas', 'create')
 
     def test_validate_testdata_sampleassayplan_json(self):
@@ -314,10 +315,9 @@ def test_validate_testdata_sampleassayplan_json(self):
             with open(os.path.join(self.v2_create_schemas_path,
                                    'sample_assay_plan_schema.json')) as fp:
                 sample_assay_plan_schema = json.load(fp)
-                resolver = RefResolver('file://{}'.format(
-                    os.path.join(self.v2_create_schemas_path,
-                                 'sample_assay_plan_schema.json')),
-                    sample_assay_plan_schema)
+                res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path,
+                             'sample_assay_plan_schema.json').as_uri()
+                resolver = RefResolver(res_path, sample_assay_plan_schema)
             validator = Draft4Validator(sample_assay_plan_schema,
                                         resolver=resolver)
             validator.validate(json.load(test_case_fp))
@@ -342,10 +342,9 @@ def test_validate_testdata_treatment_sequence_json(self):
             with open(os.path.join(self.v2_create_schemas_path,
                                    'treatment_sequence_schema.json')) as fp:
                 treatment_sequence_schema = json.load(fp)
-            resolver = RefResolver('file://{}'.format(
-                os.path.join(self.v2_create_schemas_path,
-                             'treatment_sequence_schema.json')),
-                                   treatment_sequence_schema)
+            res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path,
+                         'treatment_sequence_schema.json').as_uri()
+            resolver = RefResolver(res_path, treatment_sequence_schema)
             validator = Draft4Validator(treatment_sequence_schema,
                                         resolver=resolver)
             validator.validate(json.load(test_case_fp))

From 00509085ba6baadaa9ca60e10dfcb6fdd58d7110 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Thu, 22 Feb 2024 02:18:55 -0500
Subject: [PATCH 03/11] Update defaults.py

The previous regex didn't match any DOI I tried. I found this one in a blog post from Crossref where they say it matched 74.4M out of 74.9M DOIs that they have seen. The post is here: https://www.crossref.org/blog/dois-and-matching-regular-expressions/
---
 isatools/isatab/defaults.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/isatools/isatab/defaults.py b/isatools/isatab/defaults.py
index c38566e1..c2487123 100644
--- a/isatools/isatab/defaults.py
+++ b/isatools/isatab/defaults.py
@@ -34,7 +34,7 @@ def pbar(x):
 _RX_I_FILE_NAME = compile(r'i_(.*?)\.txt')
 _RX_DATA = compile(r'data\[(.*?)\]')
 _RX_COMMENT = compile(r'Comment\[(.*?)\]')
-_RX_DOI = compile(r'(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\\S)+)')
+_RX_DOI = compile(r'10.\d{4,9}/[-._;()/:a-z0-9A-Z]+')
 _RX_PMID = compile(r'[0-9]{8}')
 _RX_PMCID = compile(r'PMC[0-9]{8}')
 _RX_CHARACTERISTICS = compile(r'Characteristics\[(.*?)\]')

From 2275ee912949fc20f1d653c282842ccfb2d272a0 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Wed, 6 Mar 2024 14:20:34 -0500
Subject: [PATCH 04/11] Update validate/test__core.py

Since the DOI regex works now there are 2 fewer warnings. Previously it was warning about valid DOIs.
---
 tests/isatab/validate/test_core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/isatab/validate/test_core.py b/tests/isatab/validate/test_core.py
index e2b2c3cd..6c9aeda9 100644
--- a/tests/isatab/validate/test_core.py
+++ b/tests/isatab/validate/test_core.py
@@ -17,7 +17,7 @@ def test_b_ii_s_3(self):
         data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3')
         with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file:
             r = validate(fp=data_file, config_dir=self.default_conf, origin="")
-        self.assertEqual(len(r['warnings']), 12)
+        self.assertEqual(len(r['warnings']), 10)
 
     def test_mtbls267(self):
         data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'MTBLS267-partial')
@@ -82,7 +82,7 @@ def is_investigation(investigation_df):
         data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3')
         with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file:
             r = validate(data_file, rules=rules)
-        self.assertEqual(len(r['warnings']), 12)
+        self.assertEqual(len(r['warnings']), 10)
 
         rule = '12000'
         expected_error = {

From 0f9214f9779d7d8cf316acff5012a9a45f6e11e1 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Thu, 7 Mar 2024 12:57:37 -0500
Subject: [PATCH 05/11] More changes

The last changes were initially tested and created on Windows, but had errors on GitHub. I fixed those errors, but didn't retest on Windows. This has been tested on both and is problem free.
---
 tests/isatab/test_isatab.py                 | 2 +-
 tests/validators/test_validate_test_data.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/isatab/test_isatab.py b/tests/isatab/test_isatab.py
index 25eee0d4..9936febe 100644
--- a/tests/isatab/test_isatab.py
+++ b/tests/isatab/test_isatab.py
@@ -440,7 +440,7 @@ def test_isatab_dump_source_sample_char_quant(self):
         s.process_sequence = [sample_collection_process]
         s.samples.append(sample1)
         i.studies = [s]
-        actual = isatab.dumps(i)
+        actual = replace_windows_newlines(isatab.dumps(i))
         expected = """Source Name\tMaterial Type\tCharacteristics[organism]\tTerm Source REF\tTerm Accession Number\tCharacteristics[body weight]\tUnit\tTerm Source REF\tTerm Accession Number\tProtocol REF\tParameter Value[vessel]\tTerm Source REF\tTerm Accession Number\tParameter Value[storage temperature]\tUnit\tTerm Source REF\tTerm Accession Number\tSample Name\tCharacteristics[organism part]\tTerm Source REF\tTerm Accession Number\tCharacteristics[specimen mass]\tUnit\tTerm Source REF\tTerm Accession Number
 source1\tspecimen\tHuman\tNCBITAXON\thttp://purl.bioontology.org/ontology/STY/T016\t72\tkilogram\tUO\thttp://purl.obolibrary.org/obo/UO_0000009\tsample collection\teppendorf tube\tOBI\tpurl.org\t-20\tdegree Celsius\tUO\thttp://purl.obolibrary.org/obo/UO_0000027\tsample1\tliver\tUBERON\thttp://purl.obolibrary.org/obo/UBERON_0002107\t450.5\tmilligram\tUO\thttp://purl.obolibrary.org/obo/UO_0000022"""
         self.assertIn(expected, actual)
diff --git a/tests/validators/test_validate_test_data.py b/tests/validators/test_validate_test_data.py
index 4d8c1a4e..35fda568 100644
--- a/tests/validators/test_validate_test_data.py
+++ b/tests/validators/test_validate_test_data.py
@@ -305,7 +305,7 @@ class TestIsaJsonCreateTestData(unittest.TestCase):
 
     def setUp(self):
         self._reporting_level = logging.ERROR
-        self.v2_create_schemas_path = pathlib.PurePosixPath(
+        self.v2_create_schemas_path = pathlib.Path(
             pathlib.Path(__file__).parents[0], '..', '..', 'isatools', 'resources', 'schemas',
             'isa_model_version_2_0_schemas', 'create')
 
@@ -315,7 +315,7 @@ def test_validate_testdata_sampleassayplan_json(self):
             with open(os.path.join(self.v2_create_schemas_path,
                                    'sample_assay_plan_schema.json')) as fp:
                 sample_assay_plan_schema = json.load(fp)
-                res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path,
+                res_path = pathlib.Path("file://", self.v2_create_schemas_path,
                              'sample_assay_plan_schema.json').as_uri()
                 resolver = RefResolver(res_path, sample_assay_plan_schema)
             validator = Draft4Validator(sample_assay_plan_schema,
@@ -342,7 +342,7 @@ def test_validate_testdata_treatment_sequence_json(self):
             with open(os.path.join(self.v2_create_schemas_path,
                                    'treatment_sequence_schema.json')) as fp:
                 treatment_sequence_schema = json.load(fp)
-            res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path,
+            res_path = pathlib.Path("file://", self.v2_create_schemas_path,
                          'treatment_sequence_schema.json').as_uri()
             resolver = RefResolver(res_path, treatment_sequence_schema)
             validator = Draft4Validator(treatment_sequence_schema,

From b874af46d6f89de64eadd012d8bba6581b93edf0 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Thu, 1 Feb 2024 16:24:37 -0500
Subject: [PATCH 06/11] Rename i_df and investigation_df.

i_df and investigation_df are not a DataFrame as indicated in parameter typing or as the name would suggest. It's actually a dictionary of DataFrames and lists of DataFrames. I have renamed them to reflect this.
---
 isatools/isatab/validate/core.py             |  8 +--
 isatools/isatab/validate/rules/core.py       | 10 ++--
 isatools/isatab/validate/rules/defaults.py   | 36 ++++++------
 isatools/isatab/validate/rules/rules_00xx.py |  8 +--
 isatools/isatab/validate/rules/rules_10xx.py | 62 ++++++++++----------
 isatools/isatab/validate/rules/rules_30xx.py | 40 ++++++-------
 isatools/isatab/validate/rules/rules_40xx.py | 36 ++++++------
 7 files changed, 100 insertions(+), 100 deletions(-)

diff --git a/isatools/isatab/validate/core.py b/isatools/isatab/validate/core.py
index 2fa6a08c..da13d88b 100644
--- a/isatools/isatab/validate/core.py
+++ b/isatools/isatab/validate/core.py
@@ -186,21 +186,21 @@ def validate(fp: TextIO,
 
     built_rules = build_rules(rules)
     try:
-        i_df = load_investigation(fp=fp)
+        i_df_dict = load_investigation(fp=fp)
         params = {
-            "investigation_df": i_df,
+            "investigation_df_dict": i_df_dict,
             "dir_context": path.dirname(fp.name),
             "configs": config_dir,
         }
         investigation_validator = ISAInvestigationValidator(**params, **built_rules['investigation'])
 
-        for i, study_df in enumerate(i_df['studies']):
+        for i, study_df in enumerate(i_df_dict['studies']):
             study_filename = study_df.iloc[0]['Study File Name']
             study_validator = ISAStudyValidator(validator=investigation_validator, study_index=i,
                                                 study_filename=study_filename, study_df=study_df,
                                                 **built_rules['studies'])
             assay_tables = list()
-            assay_df = study_validator.params['investigation_df']['s_assays'][i]
+            assay_df = study_validator.params['investigation_df_dict']['s_assays'][i]
             for x, assay_filename in enumerate(assay_df['Study Assay File Name'].tolist()):
                 ISAAssayValidator(assay_tables=assay_tables, validator=study_validator, assay_index=x,
                                   assay_df=assay_df, assay_filename=assay_filename, **built_rules['assays'])
diff --git a/isatools/isatab/validate/rules/core.py b/isatools/isatab/validate/rules/core.py
index 40774499..d0fe08fd 100644
--- a/isatools/isatab/validate/rules/core.py
+++ b/isatools/isatab/validate/rules/core.py
@@ -108,14 +108,14 @@ def validate_rules(self, validator):
 
 class ISAInvestigationValidator:
     def __init__(self,
-                 investigation_df: DataFrame,
+                 investigation_df_dict: dict,
                  dir_context: str,
                  configs: str,
                  available_rules: list = INVESTIGATION_RULES_MAPPING,
                  rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES):
         """ The ISA investigation validator class
 
-        :param investigation_df: the investigation dataframe
+        :param investigation_df_dict: a dictionary of DataFrames and lists of DataFrames representing the investigation file
         :param dir_context: the directory of the investigation
         :param configs: directory of the XML config files
         :param available_rules: a customizable list of all available rules for investigation objects
@@ -124,7 +124,7 @@ def __init__(self,
         self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules)
         self.has_validated = False
         self.params = {
-            'investigation_df': investigation_df,
+            'investigation_df_dict': investigation_df_dict,
             'dir_context': dir_context,
             'configs': configs,
             'term_source_refs': None
@@ -162,8 +162,8 @@ def __init__(self,
             self.params['study_sample_table'] = load_table(s_fp)
             self.params['study_sample_table'].filename = study_filename
 
-        protocol_names = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Name'].tolist()
-        protocol_types = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Type'].tolist()
+        protocol_names = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Name'].tolist()
+        protocol_types = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Type'].tolist()
         self.params['protocol_names_and_types'] = dict(zip(protocol_names, protocol_types))
 
         self.params['study_group_size_in_comment'] = None
diff --git a/isatools/isatab/validate/rules/defaults.py b/isatools/isatab/validate/rules/defaults.py
index a655cd62..eaafb849 100644
--- a/isatools/isatab/validate/rules/defaults.py
+++ b/isatools/isatab/validate/rules/defaults.py
@@ -30,30 +30,30 @@
 
 
 INVESTIGATION_RULES_MAPPING = [
-    {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0006'},
+    {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0006'},
 
-    {'rule': sample_not_declared, 'params': ['investigation_df', 'dir_context'], 'identifier': '1003'},
-    {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1007'},
-    {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1008'},
-    {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1009'},
-    {'rule': check_protocol_names, 'params': ['investigation_df'], 'identifier': '1010'},
-    {'rule': check_protocol_parameter_names, 'params': ['investigation_df'], 'identifier': '1011'},
-    {'rule': check_study_factor_names, 'params': ['investigation_df'], 'identifier': '1012'},
+    {'rule': sample_not_declared, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1003'},
+    {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1007'},
+    {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1008'},
+    {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1009'},
+    {'rule': check_protocol_names, 'params': ['investigation_df_dict'], 'identifier': '1010'},
+    {'rule': check_protocol_parameter_names, 'params': ['investigation_df_dict'], 'identifier': '1011'},
+    {'rule': check_study_factor_names, 'params': ['investigation_df_dict'], 'identifier': '1012'},
 
-    {'rule': check_date_formats, 'params': ['investigation_df'], 'identifier': '3001'},
-    {'rule': check_dois, 'params': ['investigation_df'], 'identifier': '3002'},
-    {'rule': check_pubmed_ids_format, 'params': ['investigation_df'], 'identifier': '3003'},
-    {'rule': check_ontology_sources, 'params': ['investigation_df'], 'identifier': '3008'},
+    {'rule': check_date_formats, 'params': ['investigation_df_dict'], 'identifier': '3001'},
+    {'rule': check_dois, 'params': ['investigation_df_dict'], 'identifier': '3002'},
+    {'rule': check_pubmed_ids_format, 'params': ['investigation_df_dict'], 'identifier': '3003'},
+    {'rule': check_ontology_sources, 'params': ['investigation_df_dict'], 'identifier': '3008'},
 
     {'rule': load_config, 'params': ['configs'], 'identifier': '4001'},
-    {'rule': check_measurement_technology_types, 'params': ['investigation_df', 'configs'], 'identifier': '4002'},
-    {'rule': check_investigation_against_config, 'params': ['investigation_df', 'configs'], 'identifier': '4003'},
+    {'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4002'},
+    {'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4003'},
 
     # copies
-    {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0008'},
-    {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1019'},
-    {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1020'},
-    {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1021'},
+    {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0008'},
+    {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1019'},
+    {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1020'},
+    {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1021'},
 ]
 
 STUDY_RULES_MAPPING = [
diff --git a/isatools/isatab/validate/rules/rules_00xx.py b/isatools/isatab/validate/rules/rules_00xx.py
index 248d1447..1a52aa56 100644
--- a/isatools/isatab/validate/rules/rules_00xx.py
+++ b/isatools/isatab/validate/rules/rules_00xx.py
@@ -5,14 +5,14 @@
 from isatools.isatab.defaults import log
 
 
-def check_table_files_read(i_df, dir_context):
+def check_table_files_read(i_df_dict, dir_context):
     """Used for rules 0006 and 0008
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :param dir_context: Path to where the investigation file is found
     :return: None
     """
-    for i, study_df in enumerate(i_df['studies']):
+    for i, study_df in enumerate(i_df_dict['studies']):
         study_filename = study_df.iloc[0]['Study File Name']
         if study_filename != '':
             try:
@@ -22,7 +22,7 @@ def check_table_files_read(i_df, dir_context):
                 spl = "Study File {} does not appear to exist".format(study_filename)
                 validator.add_error(message="Missing study tab file(s)", supplemental=spl, code=6)
                 log.error("(E) Study File {} does not appear to exist".format(study_filename))
-        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+        for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     with utf8_text_file_open(path.join(dir_context, assay_filename)):
diff --git a/isatools/isatab/validate/rules/rules_10xx.py b/isatools/isatab/validate/rules/rules_10xx.py
index 190cd273..6bfc1b0c 100644
--- a/isatools/isatab/validate/rules/rules_10xx.py
+++ b/isatools/isatab/validate/rules/rules_10xx.py
@@ -9,14 +9,14 @@
 from isatools.isatab.utils import cell_has_value
 
 
-def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
+def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context):
     """Checks if samples found in assay tables are found in the study-sample table
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :param dir_context: Path to where the investigation file is found
     :return: None
     """
-    for i, study_df in enumerate(i_df['studies']):
+    for i, study_df in enumerate(i_df_dict['studies']):
         study_filename = study_df.iloc[0]['Study File Name']
         if study_filename != '':
             try:
@@ -25,7 +25,7 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
                     study_samples = set(study_df['Sample Name'])
             except FileNotFoundError:
                 pass
-        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+        for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -40,15 +40,15 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
                     pass
 
 
-def check_study_factor_usage(i_df, dir_context):
+def check_study_factor_usage(i_df_dict, dir_context):
     """Used for rules 1008 and 1021
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :param dir_context: Path to where the investigation file is found
     :return: None
     """
-    for i, study_df in enumerate(i_df['studies']):
-        study_factors_declared = set(i_df['s_factors'][i]['Study Factor Name'].tolist())
+    for i, study_df in enumerate(i_df_dict['studies']):
+        study_factors_declared = set(i_df_dict['s_factors'][i]['Study Factor Name'].tolist())
         study_filename = study_df.iloc[0]['Study File Name']
         error_spl = "Some factors used in an study file {} are not declared in the investigation file: {}"
         error_msg = "Some factors are not declared in the investigation"
@@ -66,7 +66,7 @@ def check_study_factor_usage(i_df, dir_context):
                         validator.add_error(message=error_msg, supplemental=spl, code=1008)
             except FileNotFoundError:
                 pass
-        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+        for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     study_factors_used = set()
@@ -92,7 +92,7 @@ def check_study_factor_usage(i_df, dir_context):
                         study_factors_used = study_factors_used.union(set(fv))
             except FileNotFoundError:
                 pass
-        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+        for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -109,15 +109,15 @@ def check_study_factor_usage(i_df, dir_context):
                         .format(list(study_factors_declared - study_factors_used)))
 
 
-def check_protocol_usage(i_df, dir_context):
+def check_protocol_usage(i_df_dict, dir_context):
     """Used for rules 1007 and 1019
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :param dir_context: Path to where the investigation file is found
     :return: None
     """
-    for i, study_df in enumerate(i_df['studies']):
-        protocols_declared = set(i_df['s_protocols'][i]['Study Protocol Name'].tolist())
+    for i, study_df in enumerate(i_df_dict['studies']):
+        protocols_declared = set(i_df_dict['s_protocols'][i]['Study Protocol Name'].tolist())
         protocols_declared.add('')
         study_filename = study_df.iloc[0]['Study File Name']
         if study_filename != '':
@@ -136,7 +136,7 @@ def check_protocol_usage(i_df, dir_context):
                         log.error("(E) {}".format(spl))
             except FileNotFoundError:
                 pass
-        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+        for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     protocol_refs_used = set()
@@ -165,7 +165,7 @@ def check_protocol_usage(i_df, dir_context):
             except FileNotFoundError:
                 pass
         for j, assay_filename in enumerate(
-                i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+                i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -183,16 +183,16 @@ def check_protocol_usage(i_df, dir_context):
             log.warning(warning)
 
 
-def check_protocol_parameter_usage(i_df, dir_context):
+def check_protocol_parameter_usage(i_df_dict, dir_context):
     """Used for rules 1009 and 1020
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :param dir_context: Path to where the investigation file is found
     :return: None
     """
-    for i, study_df in enumerate(i_df['studies']):
+    for i, study_df in enumerate(i_df_dict['studies']):
         protocol_parameters_declared = set()
-        protocol_parameters_per_protocol = set(i_df['s_protocols'][i]['Study Protocol Parameters Name'].tolist())
+        protocol_parameters_per_protocol = set(i_df_dict['s_protocols'][i]['Study Protocol Parameters Name'].tolist())
         for protocol_parameters in protocol_parameters_per_protocol:
             parameters_list = protocol_parameters.split(';')
             protocol_parameters_declared = protocol_parameters_declared.union(set(parameters_list))
@@ -216,7 +216,7 @@ def check_protocol_parameter_usage(i_df, dir_context):
                         log.error(error)
             except FileNotFoundError:
                 pass
-        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+        for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     protocol_parameters_used = set()
@@ -246,7 +246,7 @@ def check_protocol_parameter_usage(i_df, dir_context):
                         protocol_parameters_used = protocol_parameters_used.union(set(pv))
             except FileNotFoundError:
                 pass
-        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+        for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename != '':
                 try:
                     with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -263,13 +263,13 @@ def check_protocol_parameter_usage(i_df, dir_context):
             log.warning(warning)
 
 
-def check_protocol_names(i_df):
+def check_protocol_names(i_df_dict):
     """Used for rule 1010
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
-    for study_protocols_df in i_df['s_protocols']:
+    for study_protocols_df in i_df_dict['s_protocols']:
         for i, protocol_name in enumerate(study_protocols_df['Study Protocol Name'].tolist()):
             # DataFrames labels empty cells as 'Unnamed: n'
             if protocol_name == '' or 'Unnamed: ' in protocol_name:
@@ -279,13 +279,13 @@ def check_protocol_names(i_df):
                 log.warning(warning)
 
 
-def check_protocol_parameter_names(i_df):
+def check_protocol_parameter_names(i_df_dict):
     """Used for rule 1011
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
-    for study_protocols_df in i_df['s_protocols']:
+    for study_protocols_df in i_df_dict['s_protocols']:
         for i, protocol_parameters_names in enumerate(study_protocols_df['Study Protocol Parameters Name'].tolist()):
             # There's an empty cell if no protocols
             if len(protocol_parameters_names.split(sep=';')) > 1:
@@ -298,13 +298,13 @@ def check_protocol_parameter_names(i_df):
                         log.warning(warning)
 
 
-def check_study_factor_names(i_df):
+def check_study_factor_names(i_df_dict):
     """Used for rule 1012
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
-    for study_factors_df in i_df['s_factors']:
+    for study_factors_df in i_df_dict['s_factors']:
         for i, factor_name in enumerate(study_factors_df['Study Factor Name'].tolist()):
             # DataFrames labels empty cells as 'Unnamed: n'
             if factor_name == '' or 'Unnamed: ' in factor_name:
diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py
index 22e2e74a..0321348c 100644
--- a/isatools/isatab/validate/rules/rules_30xx.py
+++ b/isatools/isatab/validate/rules/rules_30xx.py
@@ -7,27 +7,27 @@
 from isatools.isatab.utils import cell_has_value
 
 
-def check_filenames_present(i_df: DataFrame) -> None:
+def check_filenames_present(i_df_dict: DataFrame) -> None:
     """ Used for rule 3005
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
-    for s_pos, study_df in enumerate(i_df['studies']):
+    for s_pos, study_df in enumerate(i_df_dict['studies']):
         if study_df.iloc[0]['Study File Name'] == '':
             validator.add_warning(message="Missing Study File Name", supplemental="STUDY.{}".format(s_pos), code=3005)
             log.warning("(W) A study filename is missing for STUDY.{}".format(s_pos))
-        for a_pos, filename in enumerate(i_df['s_assays'][s_pos]['Study Assay File Name'].tolist()):
+        for a_pos, filename in enumerate(i_df_dict['s_assays'][s_pos]['Study Assay File Name'].tolist()):
             if filename == '':
                 spl = "STUDY.{}, STUDY ASSAY.{}".format(s_pos, a_pos)
                 validator.add_warning.append(message="Missing assay file name", supplemental=spl, code=3005)
                 log.warning("(W) An assay filename is missing for STUDY ASSAY.{}".format(a_pos))
 
 
-def check_date_formats(i_df):
+def check_date_formats(i_df_dict):
     """ Used for rule 3001
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
 
@@ -45,13 +45,13 @@ def check_iso8601_date(date_str):
                 validator.add_warning(message="Date is not ISO8601 formatted", supplemental=spl, code=3001)
                 log.warning("(W) Date {} does not conform to ISO8601 format".format(date_str))
 
-    release_date_vals = i_df['investigation']['Investigation Public Release Date'].tolist()
+    release_date_vals = i_df_dict['investigation']['Investigation Public Release Date'].tolist()
     if len(release_date_vals) > 0:
         check_iso8601_date(release_date_vals[0])
-    sub_date_values = i_df['investigation']['Investigation Submission Date'].tolist()
+    sub_date_values = i_df_dict['investigation']['Investigation Submission Date'].tolist()
     if len(sub_date_values) > 0:
         check_iso8601_date(sub_date_values[0])
-    for i, study_df in enumerate(i_df['studies']):
+    for i, study_df in enumerate(i_df_dict['studies']):
         release_date_vals = study_df['Study Public Release Date'].tolist()
         if len(release_date_vals) > 0:
             check_iso8601_date(release_date_vals[0])
@@ -60,10 +60,10 @@ def check_iso8601_date(date_str):
             check_iso8601_date(sub_date_values[0])
 
 
-def check_dois(i_df):
+def check_dois(i_df_dict):
     """ Used for rule 3002
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
 
@@ -79,17 +79,17 @@ def check_doi(doi_str):
                 validator.add_warning(message="DOI is not valid format", supplemental=spl, code=3002)
                 log.warning("(W) DOI {} does not conform to DOI format".format(doi_str))
 
-    for doi in i_df['i_publications']['Investigation Publication DOI'].tolist():
+    for doi in i_df_dict['i_publications']['Investigation Publication DOI'].tolist():
         check_doi(doi)
-    for i, study_df in enumerate(i_df['s_publications']):
+    for i, study_df in enumerate(i_df_dict['s_publications']):
         for doi in study_df['Study Publication DOI'].tolist():
             check_doi(doi)
 
 
-def check_pubmed_ids_format(i_df):
+def check_pubmed_ids_format(i_df_dict):
     """ Used for rule 3003
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
 
@@ -105,21 +105,21 @@ def check_pubmed_id(pubmed_id_str):
                 validator.add_warning(message="PubMed ID is not valid format", supplemental=spl, code=3003)
                 log.warning("(W) PubMed ID {} is not valid format".format(pubmed_id_str))
 
-    for doi in i_df['i_publications']['Investigation PubMed ID'].tolist():
+    for doi in i_df_dict['i_publications']['Investigation PubMed ID'].tolist():
         check_pubmed_id(str(doi))
-    for study_pubs_df in i_df['s_publications']:
+    for study_pubs_df in i_df_dict['s_publications']:
         for doi in study_pubs_df['Study PubMed ID'].tolist():
             check_pubmed_id(str(doi))
 
 
-def check_ontology_sources(i_df):
+def check_ontology_sources(i_df_dict):
     """ Used for rule 3008
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :return: None
     """
     term_source_refs = []
-    for i, ontology_source_name in enumerate(i_df['ontology_sources']['Term Source Name'].tolist()):
+    for i, ontology_source_name in enumerate(i_df_dict['ontology_sources']['Term Source Name'].tolist()):
         if ontology_source_name == '' or 'Unnamed: ' in ontology_source_name:
             spl = "pos={}".format(i)
             warn = "(W) An Ontology Source Reference at position {} is missing Term Source Name, so can't be referenced"
diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py
index 7f39c0df..84b87ace 100644
--- a/isatools/isatab/validate/rules/rules_40xx.py
+++ b/isatools/isatab/validate/rules/rules_40xx.py
@@ -13,10 +13,10 @@
 )
 
 
-def check_investigation_against_config(i_df, configs):
+def check_investigation_against_config(i_df_dict, configs):
     """Checks investigation file against the loaded configurations
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :param configs: A dictionary of ISA Configuration objects
     :return: None
     """
@@ -52,18 +52,18 @@ def check_section_against_required_fields_one_value(section, required, i=0):
 
     config_fields = configs[('[investigation]', '')].get_isatab_configuration()[0].get_field()
     required_fields = [i.header for i in config_fields if i.is_required]
-    check_section_against_required_fields_one_value(i_df['investigation'], required_fields)
-    check_section_against_required_fields_one_value(i_df['i_publications'], required_fields)
-    check_section_against_required_fields_one_value(i_df['i_contacts'], required_fields)
+    check_section_against_required_fields_one_value(i_df_dict['investigation'], required_fields)
+    check_section_against_required_fields_one_value(i_df_dict['i_publications'], required_fields)
+    check_section_against_required_fields_one_value(i_df_dict['i_contacts'], required_fields)
 
-    for x, study_df in enumerate(i_df['studies']):
-        check_section_against_required_fields_one_value(i_df['studies'][x], required_fields, x)
-        check_section_against_required_fields_one_value(i_df['s_design_descriptors'][x], required_fields, x)
-        check_section_against_required_fields_one_value(i_df['s_publications'][x], required_fields, x)
-        check_section_against_required_fields_one_value(i_df['s_factors'][x], required_fields, x)
-        check_section_against_required_fields_one_value(i_df['s_assays'][x], required_fields, x)
-        check_section_against_required_fields_one_value(i_df['s_protocols'][x], required_fields, x)
-        check_section_against_required_fields_one_value(i_df['s_contacts'][x], required_fields, x)
+    for x, study_df in enumerate(i_df_dict['studies']):
+        check_section_against_required_fields_one_value(i_df_dict['studies'][x], required_fields, x)
+        check_section_against_required_fields_one_value(i_df_dict['s_design_descriptors'][x], required_fields, x)
+        check_section_against_required_fields_one_value(i_df_dict['s_publications'][x], required_fields, x)
+        check_section_against_required_fields_one_value(i_df_dict['s_factors'][x], required_fields, x)
+        check_section_against_required_fields_one_value(i_df_dict['s_assays'][x], required_fields, x)
+        check_section_against_required_fields_one_value(i_df_dict['s_protocols'][x], required_fields, x)
+        check_section_against_required_fields_one_value(i_df_dict['s_contacts'][x], required_fields, x)
 
 
 def load_config(config_dir):
@@ -92,16 +92,16 @@ def load_config(config_dir):
     return configs
 
 
-def check_measurement_technology_types(i_df, configs):
+def check_measurement_technology_types(i_df_dict, configs):
     """Rule 4002
 
-    :param i_df: An investigation DataFrame
+    :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
     :param configs: A dictionary of ISA Configuration objects
     :return: None
     """
-    for i, assay_df in enumerate(i_df['s_assays']):
-        measurement_types = assay_df['Study Assay Measurement Type'].tolist()
-        technology_types = assay_df['Study Assay Technology Type'].tolist()
+    for i, study_assays_df in enumerate(i_df_dict['s_assays']):
+        measurement_types = study_assays_df['Study Assay Measurement Type'].tolist()
+        technology_types = study_assays_df['Study Assay Technology Type'].tolist()
         if len(measurement_types) == len(technology_types):
             for x, measurement_type in enumerate(measurement_types):
                 lowered_mt = measurement_types[x].lower()

From 299a93dc59bf9d98cab8dbda001a2b0c63975059 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Thu, 1 Feb 2024 17:33:53 -0500
Subject: [PATCH 07/11] Missed a parameter type change

---
 isatools/isatab/validate/rules/rules_30xx.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py
index 0321348c..1716b4df 100644
--- a/isatools/isatab/validate/rules/rules_30xx.py
+++ b/isatools/isatab/validate/rules/rules_30xx.py
@@ -1,13 +1,11 @@
 import iso8601
 
-from pandas import DataFrame
-
 from isatools.isatab.validate.store import validator
 from isatools.isatab.defaults import log, _RX_DOI, _RX_PMID, _RX_PMCID
 from isatools.isatab.utils import cell_has_value
 
 
-def check_filenames_present(i_df_dict: DataFrame) -> None:
+def check_filenames_present(i_df_dict: dict) -> None:
     """ Used for rule 3005
 
     :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file

From 33dae30ee40d80676e6b24d4be8b86b39f10fd4b Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Thu, 7 Mar 2024 17:24:22 -0500
Subject: [PATCH 08/11] Updated tests

There were 2 tests that needed to be changed to match the new naming.
---
 tests/isatab/validate/test_core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/isatab/validate/test_core.py b/tests/isatab/validate/test_core.py
index e2b2c3cd..021e0705 100644
--- a/tests/isatab/validate/test_core.py
+++ b/tests/isatab/validate/test_core.py
@@ -47,7 +47,7 @@ def test_bii_s_7(self):
     def test_print_rule(self):
         raw_rule = INVESTIGATION_RULES_MAPPING[0]
         rule = Rule(**raw_rule)
-        expected_string = "rule=check_table_files_read, params=['investigation_df', 'dir_context'], identifier=0006"
+        expected_string = "rule=check_table_files_read, params=['investigation_df_dict', 'dir_context'], identifier=0006"
         self.assertEqual(str(rule), expected_string)
 
     def test_rules_error(self):
@@ -69,7 +69,7 @@ def is_investigation(investigation_df):
                     *INVESTIGATION_RULES_MAPPING,
                     {
                         'rule': is_investigation,
-                        'params': ['investigation_df'],
+                        'params': ['investigation_df_dict'],
                         'identifier': '6000'
                     }
                 ],

From e95a8de513a8f7979d951017b64f7ac3617de477 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Mon, 5 Feb 2024 16:28:35 -0500
Subject: [PATCH 09/11] Fairly significant changes to check_protocol_fields

I started editing this function because of the "Only one protocol reference should be used in a Protocol REF column." message(s), but I found some other issues to address as well.
---
 isatools/isatab/validate/rules/rules_40xx.py | 61 ++++++++------------
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py
index 84b87ace..9679923d 100644
--- a/isatools/isatab/validate/rules/rules_40xx.py
+++ b/isatools/isatab/validate/rules/rules_40xx.py
@@ -254,30 +254,22 @@ def pairwise(iterable):
         a, b = tee(iterable)
         next(b, None)
         return zip(a, b)
-
-    proto_ref_index = [i for i in table.columns if 'protocol ref' in i.lower()]
-    result = True
-    for each in proto_ref_index:
-        prots_found = set()
-        for cell in table[each]:
-            prots_found.add(cell)
-        if len(prots_found) > 1:
-            log.warning("(W) Multiple protocol references {} are found in {}".format(prots_found, each))
-            log.warning("(W) Only one protocol reference should be used in a Protocol REF column.")
-            result = False
-    if result:
-        field_headers = [i for i in table.columns
-                         if i.lower().endswith(' name')
-                         or i.lower().endswith(' data file')
-                         or i.lower().endswith(' data matrix file')]
-        protos = [i for i in table.columns if i.lower() == 'protocol ref']
-        if len(protos) > 0:
-            last_proto_index = table.columns.get_loc(protos[len(protos) - 1])
-        else:
-            last_proto_index = -1
-        last_mat_or_dat_index = table.columns.get_loc(field_headers[len(field_headers) - 1])
-        if last_proto_index > last_mat_or_dat_index:
-            log.warning("(W) Protocol REF column without output in file '" + table.filename + "'")
+    
+    field_headers = [i for i in table.columns
+                     if i.lower().endswith(' name')
+                     or i.lower().endswith(' data file')
+                     or i.lower().endswith(' data matrix file')]
+    protos = [i for i in table.columns if i.lower() == 'protocol ref']
+    if len(protos) > 0:
+        last_proto_index = table.columns.get_loc(protos[len(protos) - 1])
+    else:
+        last_proto_index = -1
+    last_mat_or_dat_index = table.columns.get_loc(field_headers[len(field_headers) - 1])
+    if last_proto_index > last_mat_or_dat_index:
+        spl = "Protocol REF column without output in file '" + table.filename + "'"
+        validator.add_warning(message="Missing Protocol Value", supplemental=spl, code=1007)
+        log.warning("(W) Protocol REF column is not followed by a material or data node in file '" + table.filename + "'")
+    if cfg.get_isatab_configuration():
         for left, right in pairwise(field_headers):
             cleft = None
             cright = None
@@ -294,16 +286,15 @@ def pairwise(iterable):
                 fprotos_headers = [i for i in raw_headers if 'protocol ref' in i.lower()]
                 fprotos = list()
                 for header in fprotos_headers:
-                    proto_name = table.iloc[0][header]
-                    try:
-                        proto_type = proto_map[proto_name]
-                        fprotos.append(proto_type)
-                    except KeyError:
-                        spl = ("Could not find protocol type for protocol name '{}', trying to validate_rules against name "
-                               "only").format(proto_name)
-                        validator.add_warning(message="Missing Protocol declaration", supplemental=spl, code=1007)
-                        log.warning("(W) {}".format(spl))
-                        fprotos.append(proto_name)
+                    proto_names = list(table.loc[:, header].unique())
+                    for proto_name in proto_names:
+                        proto_type = proto_map.get(proto_name)
+                        if not proto_type and proto_name:
+                            spl = ("Could not find protocol type for protocol name '{}' in file '{}'" ).format(proto_name, table.filename)
+                            validator.add_warning(message="Missing Protocol Declaration", supplemental=spl, code=1007)
+                            log.warning("(W) {}".format(spl))
+                        else:
+                            fprotos.append(proto_type)
                 invalid_protos = set(cprotos) - set(fprotos)
                 if len(invalid_protos) > 0:
                     spl = ("Protocol(s) of type {} defined in the ISA-configuration expected as a between '{}' and "
@@ -311,8 +302,6 @@ def pairwise(iterable):
                     spl = spl.format(str(list(invalid_protos)), cleft.header, cright.header, table.filename)
                     validator.add_warning(message="Missing Protocol declaration", supplemental=spl, code=1007)
                     log.warning("(W) {}".format(spl))
-                    result = False
-    return result
 
 
 def load_table_checks(df, filename):

From 733b74ff4384b95fd973365c754e19654fb2e4be Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Mon, 5 Feb 2024 16:44:35 -0500
Subject: [PATCH 10/11] Missed a small change.

---
 isatools/isatab/validate/rules/rules_40xx.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py
index 9679923d..bd223ce5 100644
--- a/isatools/isatab/validate/rules/rules_40xx.py
+++ b/isatools/isatab/validate/rules/rules_40xx.py
@@ -266,9 +266,9 @@ def pairwise(iterable):
         last_proto_index = -1
     last_mat_or_dat_index = table.columns.get_loc(field_headers[len(field_headers) - 1])
     if last_proto_index > last_mat_or_dat_index:
-        spl = "Protocol REF column without output in file '" + table.filename + "'"
+        spl = "(W) Protocol REF column is not followed by a material or data node in file '" + table.filename + "'"
         validator.add_warning(message="Missing Protocol Value", supplemental=spl, code=1007)
-        log.warning("(W) Protocol REF column is not followed by a material or data node in file '" + table.filename + "'")
+        log.warning(spl)
     if cfg.get_isatab_configuration():
         for left, right in pairwise(field_headers):
             cleft = None

From 9040989de1829c9ba630c09a5bc984106b661f01 Mon Sep 17 00:00:00 2001
From: Travis Thompson <ptth222@uky.edu>
Date: Sun, 4 Feb 2024 14:06:12 -0500
Subject: [PATCH 11/11] Changed messaging on study assays errors

The messaging on the 2 checks dealing with study assays did not make it clear exactly which assay and which study had the issue. I changed the messaging so it clearly indicates the affected study and assay.
---
 isatools/isatab/validate/rules/rules_30xx.py | 2 +-
 isatools/isatab/validate/rules/rules_40xx.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/isatools/isatab/validate/rules/rules_30xx.py b/isatools/isatab/validate/rules/rules_30xx.py
index a659786f..be876dc5 100644
--- a/isatools/isatab/validate/rules/rules_30xx.py
+++ b/isatools/isatab/validate/rules/rules_30xx.py
@@ -19,7 +19,7 @@ def check_filenames_present(i_df_dict: dict) -> None:
             if filename == '':
                 spl = "STUDY.{}, STUDY ASSAY.{}".format(s_pos, a_pos)
                 validator.add_warning.append(message="Missing assay file name", supplemental=spl, code=3005)
-                log.warning("(W) An assay filename is missing for STUDY ASSAY.{}".format(a_pos))
+                log.warning("(W) An assay filename is missing for STUDY.{}, STUDY ASSAY.{}".format(s_pos, a_pos))
 
 
 def check_date_formats(i_df_dict):
diff --git a/isatools/isatab/validate/rules/rules_40xx.py b/isatools/isatab/validate/rules/rules_40xx.py
index 84b87ace..f7c78adb 100644
--- a/isatools/isatab/validate/rules/rules_40xx.py
+++ b/isatools/isatab/validate/rules/rules_40xx.py
@@ -107,10 +107,10 @@ def check_measurement_technology_types(i_df_dict, configs):
                 lowered_mt = measurement_types[x].lower()
                 lowered_tt = technology_types[x].lower()
                 if (lowered_mt, lowered_tt) not in configs.keys():
-                    spl = "Measurement {}/technology {}, STUDY ASSAY.{}"
-                    spl = spl.format(measurement_types[x], technology_types[x], i)
+                    spl = "Measurement {}/technology {}, STUDY.{}, STUDY ASSAY.{}"
+                    spl = spl.format(measurement_types[x], technology_types[x], i, x)
                     error = ("(E) Could not load configuration for measurement type '{}' and technology type '{}' "
-                             "for STUDY ASSAY.{}'").format(measurement_types[x], technology_types[x], i)
+                             "for STUDY.{}, STUDY ASSAY.{}'").format(measurement_types[x], technology_types[x], i, x)
                     validator.add_error(message="Measurement/technology type invalid", supplemental=spl, code=4002)
                     log.error(error)