
Merge pull request #551 from ISA-tools/extended-511
Extended 511
proccaserra authored Mar 18, 2024
2 parents 7d5f19f + e1eda82 commit d7aa027
Showing 34 changed files with 1,947 additions and 375 deletions.
1,647 changes: 1,549 additions & 98 deletions isa-cookbook/content/notebooks/isa-api-programmatic-BH2023-multiomics-isa.ipynb

Large diffs are not rendered by default.

35 changes: 19 additions & 16 deletions isatools/constants.py
@@ -19,6 +19,9 @@
DATA_FILE_LABELS = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -27,16 +30,16 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]

_LABELS_DATA_NODES = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -45,9 +48,6 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]
@@ -65,16 +65,6 @@
'Data Transformation Name'
]

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]

QUALIFIER_LABELS = [
'Protocol REF',
'Material Type',
@@ -83,6 +73,19 @@
'Unit'
]

ALLOWED_NODES = NODE_LABELS.append('Protocol REF')

ALL_LABELS = NODE_LABELS + ASSAY_LABELS + QUALIFIER_LABELS

ALL_LABELS.append('Protocol REF')
ALL_LABELS.append('Label')

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]
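
A behavioral note on the new `ALLOWED_NODES` assignment above: `list.append` mutates its list in place and returns `None`, so `ALLOWED_NODES` ends up bound to `None` rather than to an extended list (the later `ALL_LABELS.append(...)` calls are fine because their return values are discarded). A minimal sketch with illustrative values:

```python
labels = ['Source Name', 'Sample Name']      # illustrative values only

allowed = labels.append('Protocol REF')      # list.append returns None
print(allowed)                               # None
print(labels)                                # ['Source Name', 'Sample Name', 'Protocol REF']

labels = ['Source Name', 'Sample Name']
allowed = labels + ['Protocol REF']          # concatenation binds the new list
print(allowed)                               # ['Source Name', 'Sample Name', 'Protocol REF']
```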
2 changes: 1 addition & 1 deletion isatools/create/model.py
@@ -3283,4 +3283,4 @@ def compute_single_arm_design_multi_element_cell(treatments, sample_assay_plan,
elements=[follow_up_map[0]]), follow_up_map[1]])
arm = StudyArm('ARM_00', group_size=group_size, arm_map=OrderedDict(arm_map))
design.add_study_arm(arm)
return design
return design
10 changes: 6 additions & 4 deletions isatools/isajson/validate.py
@@ -810,14 +810,16 @@ def check_study_groups(study_or_assay)
def validate(
fp,
config_dir=default_config_dir,
log_level=None,
log_level=logging.INFO,
base_schemas_dir="isa_model_version_1_0_schemas"
):
if config_dir is None:
config_dir = default_config_dir
if log_level in (
logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
logging.ERROR, logging.CRITICAL):
if log_level is None: #(
# logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
# logging.ERROR, logging.CRITICAL):
log.disabled = True
else:
log.setLevel(log_level)
log.info("ISA JSON Validator from ISA tools API v0.12.")
stream = StringIO()
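
With this change, the JSON validator's `log_level` defaults to `logging.INFO`, and passing `None` now disables the validator's logger outright instead of being matched against the standard level constants. A usage sketch, assuming `validate` is importable from the module shown above; `isa.json` is a placeholder path:

```python
import logging
from isatools.isajson.validate import validate  # import path assumed from the file shown

with open('isa.json') as fp:                    # placeholder file name
    report = validate(fp)                       # default: INFO-level messages

with open('isa.json') as fp:
    report = validate(fp, log_level=None)       # validator logger disabled

with open('isa.json') as fp:
    report = validate(fp, log_level=logging.DEBUG)  # any standard level still works
```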
25 changes: 13 additions & 12 deletions isatools/isatab/dump/core.py
@@ -41,7 +41,7 @@ def dump(isa_obj, output_path,
raise NameError('Investigation file must match pattern i_*.txt, got {}'.format(i_file_name))

if path.exists(output_path):
fp = open(path.join(output_path, i_file_name), 'w', encoding='utf-8')
fp = open(path.join(output_path, i_file_name), 'wb')
else:
log.debug('output_path=', i_file_name)
raise FileNotFoundError("Can't find " + output_path)
@@ -55,7 +55,7 @@

# Write ONTOLOGY SOURCE REFERENCE section
ontology_source_references_df = _build_ontology_reference_section(investigation.ontology_source_references)
fp.write('ONTOLOGY SOURCE REFERENCE\n')
fp.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))
# Need to set index_label as top left cell
ontology_source_references_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Term Source Name')
@@ -80,7 +80,7 @@
inv_df_rows.append(comment.value)
investigation_df.loc[0] = inv_df_rows
investigation_df = investigation_df.set_index('Investigation Identifier').T
fp.write('INVESTIGATION\n')
fp.write(bytearray('INVESTIGATION\n', 'utf-8'))
investigation_df.to_csv(
path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Identifier')
@@ -90,14 +90,15 @@
prefix='Investigation',
publications=investigation.publications
)
fp.write('INVESTIGATION PUBLICATIONS\n')
fp.write(bytearray('INVESTIGATION PUBLICATIONS\n', 'utf-8'))
investigation_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation PubMed ID')

# Write INVESTIGATION CONTACTS section
investigation_contacts_df = _build_contacts_section_df(
contacts=investigation.contacts)
fp.write('INVESTIGATION CONTACTS\n')
fp.write(bytearray('INVESTIGATION CONTACTS\n', 'utf-8'))

investigation_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Person Last Name')

@@ -127,40 +128,40 @@ def dump(isa_obj, output_path,
study_df_row.append(comment.value)
study_df.loc[0] = study_df_row
study_df = study_df.set_index('Study Identifier').T
fp.write('STUDY\n')
fp.write(bytearray('STUDY\n', 'utf-8'))
study_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8', index_label='Study Identifier')
study_design_descriptors_df = _build_design_descriptors_section(design_descriptors=study.design_descriptors)
fp.write('STUDY DESIGN DESCRIPTORS\n')
fp.write(bytearray('STUDY DESIGN DESCRIPTORS\n', 'utf-8'))
study_design_descriptors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Design Type')

# Write STUDY PUBLICATIONS section
study_publications_df = _build_publications_section_df(prefix='Study', publications=study.publications)
fp.write('STUDY PUBLICATIONS\n')
fp.write(bytearray('STUDY PUBLICATIONS\n', 'utf-8'))
study_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study PubMed ID')

# Write STUDY FACTORS section
study_factors_df = _build_factors_section_df(factors=study.factors)
fp.write('STUDY FACTORS\n')
fp.write(bytearray('STUDY FACTORS\n', 'utf-8'))
study_factors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Factor Name')

study_assays_df = _build_assays_section_df(assays=study.assays)
fp.write('STUDY ASSAYS\n')
fp.write(bytearray('STUDY ASSAYS\n', 'utf-8'))
study_assays_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Assay File Name')

# Write STUDY PROTOCOLS section
study_protocols_df = _build_protocols_section_df(protocols=study.protocols)
fp.write('STUDY PROTOCOLS\n')
fp.write(bytearray('STUDY PROTOCOLS\n', 'utf-8'))
study_protocols_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Protocol Name')

# Write STUDY CONTACTS section
study_contacts_df = _build_contacts_section_df(
prefix='Study', contacts=study.contacts)
fp.write('STUDY CONTACTS\n')
fp.write(bytearray('STUDY CONTACTS\n', 'utf-8'))
study_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Person Last Name')

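
The dump now opens the investigation file in binary mode and writes each section header as explicit UTF-8 bytes, so the hand-written headers and the `to_csv(..., encoding='utf-8')` appends both put bytes on the same handle. A minimal sketch of the pattern with a placeholder frame (pandas >= 1.2 is assumed, which is when `to_csv` accepted binary handles):

```python
import pandas as pd

# Placeholder section frame; the real dump builds these from the ISA model
df = pd.DataFrame({'Term Source Name': ['OBI'],
                   'Term Source Description': ['Ontology for Biomedical Investigations']})

with open('i_investigation.txt', 'wb') as fp:
    # Section headers are hand-written as UTF-8 bytes ...
    fp.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))
    # ... and pandas encodes its own output when handed a binary handle
    df.set_index('Term Source Name').T.to_csv(
        path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
        index_label='Term Source Name')
```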
5 changes: 2 additions & 3 deletions isatools/isatab/dump/write.py
@@ -59,7 +59,6 @@ def flatten(current_list):
paths = _all_end_to_end_paths(
s_graph,
[x for x in s_graph.nodes() if isinstance(s_graph.indexes[x], Source)])
log.warning(s_graph.nodes())

sample_in_path_count = 0
protocol_in_path_count = 0
@@ -220,7 +219,7 @@ def flatten(current_list):
DF = DF.replace('', nan)
DF = DF.dropna(axis=1, how='all')

with open(path.join(output_dir, study_obj.filename), 'w') as out_fp:
with open(path.join(output_dir, study_obj.filename), 'wb') as out_fp:
DF.to_csv(
path_or_buf=out_fp, index=False, sep='\t', encoding='utf-8')

@@ -480,7 +479,7 @@ def pbar(x):
DF = DF.dropna(axis=1, how='all')

with open(path.join(
output_dir, assay_obj.filename), 'w') as out_fp:
output_dir, assay_obj.filename), 'wb') as out_fp:
DF.to_csv(path_or_buf=out_fp, index=False, sep='\t',
encoding='utf-8')

3 changes: 1 addition & 2 deletions isatools/isatab/graph.py
@@ -17,7 +17,7 @@ def _all_end_to_end_paths(G, start_nodes):
num_start_nodes = len(start_nodes)
message = 'Calculating for paths for {} start nodes: '.format(
num_start_nodes)
log.info(start_nodes)
# log.info(start_nodes)
start_node = G.indexes[start_nodes[0]]
if isinstance(start_node, Source):
message = 'Calculating for paths for {} sources: '.format(
@@ -61,7 +61,6 @@ def _longest_path_and_attrs(paths, indexes):
:return: The longest path and attributes
"""
longest = (0, None)
log.info(paths)
for path in paths:
length = len(path)
for node in path:
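
For context: `_all_end_to_end_paths` enumerates every path from the study's `Source` nodes to the graph's terminal nodes, and `_longest_path_and_attrs` picks the longest; the dropped `log.info` calls dumped the full node and path lists, which is costly on large studies. A rough illustration of the traversal using networkx (illustrative only; the real code walks the ISA graph's own index structure):

```python
import networkx as nx

G = nx.DiGraph()
G.add_edges_from([
    ('source1', 'sample1'),
    ('sample1', 'extract1'),
    ('sample1', 'extract2'),
    ('extract2', 'rawfile1'),
])

starts = [n for n in G if G.in_degree(n) == 0]   # Source-like nodes
sinks = [n for n in G if G.out_degree(n) == 0]   # terminal nodes

paths = [p for s in starts for t in sinks
         for p in nx.all_simple_paths(G, s, t)]
print(max(paths, key=len))                       # the longest end-to-end path
```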
6 changes: 6 additions & 0 deletions isatools/isatab/load/ProcessSequenceFactory.py
@@ -376,6 +376,12 @@ def get_node_by_label_and_key(labl, this_key):
if comment_key not in [x.name for x in process.comments]:
process.comments.append(Comment(name=comment_key, value=str(object_series[comment_column])))

for performer in [c for c in column_group if c == 'Performer']:
process.performer = str(object_series[performer])

for date in [c for c in column_group if c == 'Date']:
process.date = str(object_series[date])

for _, object_series in DF.iterrows(): # don't drop duplicates
process_key_sequence = list()
source_node_context = None
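
The two new loops copy `Performer` and `Date` columns from a process's column group onto the Process object. A self-contained sketch of the effect on a made-up row (`Process` here is a stand-in for the isatools model class):

```python
import pandas as pd

class Process:                       # stand-in for the isatools model class
    performer = ''
    date = ''

column_group = ['Protocol REF', 'Performer', 'Date']
object_series = pd.Series({'Protocol REF': 'extraction',
                           'Performer': 'A. Smith',
                           'Date': '2024-03-18'})

process = Process()
for performer in [c for c in column_group if c == 'Performer']:
    process.performer = str(object_series[performer])
for date in [c for c in column_group if c == 'Date']:
    process.date = str(object_series[date])

print(process.performer, process.date)   # A. Smith 2024-03-18
```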
2 changes: 1 addition & 1 deletion isatools/isatab/load/core.py
@@ -390,7 +390,7 @@ def merge_study_with_assay_tables(study_file_path, assay_file_path, target_file_
log.info("Merging DataFrames...")
merged_DF = merge(study_DF, assay_DF, on='Sample Name')
log.info("Writing merged DataFrame to file %s", target_file_path)
with open(target_file_path, 'w', encoding='utf-8') as fp:
with open(target_file_path, 'wb') as fp:
merged_DF.to_csv(fp, sep='\t', index=False, header=study_DF.isatab_header + assay_DF.isatab_header[1:])


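
`merge_study_with_assay_tables` joins a parsed study table to an assay table on `Sample Name` and, as with the dump code above, now writes through a binary handle. A sketch with toy frames (the real call also passes a rebuilt ISA-Tab header to `to_csv`):

```python
import pandas as pd

# Toy stand-ins for a parsed s_*.txt / a_*.txt pair
study_DF = pd.DataFrame({'Source Name': ['src1'], 'Sample Name': ['sam1']})
assay_DF = pd.DataFrame({'Sample Name': ['sam1'], 'Raw Data File': ['run1.raw']})

merged_DF = pd.merge(study_DF, assay_DF, on='Sample Name')

with open('merged.txt', 'wb') as fp:   # binary handle, matching the change above
    merged_DF.to_csv(fp, sep='\t', index=False)
```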
7 changes: 5 additions & 2 deletions isatools/isatab/validate/core.py
@@ -3,6 +3,7 @@

from os import path
from glob import glob
import logging

from pandas.errors import ParserError

@@ -45,7 +46,7 @@ def check_labels(section, labels_expected, df):
if _RX_COMMENT.match(label) is None:
msg = "Invalid label found in investigation file"
spl = "In {} section, label {} is not allowed".format(section, label)
message_handler.add_error(message=msg, supplemental=spl, code= 5)
message_handler.add_error(message=msg, supplemental=spl, code=5)
elif len(_RX_COMMENT.findall(label)) == 0:
spl = "In {} section, label {} is missing a name".format(section, label)
msg = "Missing name in Comment[] label"
@@ -179,8 +180,10 @@ def validate(fp: TextIO,
:param log_level: optional log level (default: INFO)
:return: a dictionary of the validation results (errors, warnings and info)
"""
if not log_level:
if log_level is None:
log.disabled = True
else:
log.setLevel(log_level)
message_handler.reset_store()
validated = False

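
In `check_labels`, a label outside the expected set must look like `Comment[name]`: `_RX_COMMENT.match` rejects anything else (reported with `code=5`), and a second test flags a `Comment[]` with no name. The pattern itself is defined elsewhere; a self-contained sketch assuming it has roughly this shape:

```python
import re

# Assumed shape of _RX_COMMENT; the module's actual pattern is not shown in this hunk
_RX_COMMENT = re.compile(r'Comment\[(.*?)\]')

def classify(label):
    match = _RX_COMMENT.match(label)
    if match is None:
        return 'invalid label'                      # -> add_error(..., code=5)
    if match.group(1) == '':
        return 'missing name in Comment[] label'
    return 'comment named {!r}'.format(match.group(1))

for label in ['Comment[batch]', 'Comment[]', 'Bogus Label']:
    print(label, '->', classify(label))
```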
12 changes: 6 additions & 6 deletions isatools/isatab/validate/rules/rules_10xx.py
@@ -12,7 +12,7 @@
def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context):
"""Checks if samples found in assay tables are found in the study-sample table
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -43,7 +43,7 @@ def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context):
def check_study_factor_usage(i_df_dict, dir_context):
"""Used for rules 1008 and 1021
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -112,7 +112,7 @@ def check_study_factor_usage(i_df_dict, dir_context):
def check_protocol_usage(i_df_dict, dir_context):
"""Used for rules 1007 and 1019
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -186,7 +186,7 @@ def check_protocol_usage(i_df_dict, dir_context):
def check_protocol_parameter_usage(i_df_dict, dir_context):
"""Used for rules 1009 and 1020
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -282,7 +282,7 @@ def check_protocol_names(i_df_dict):
def check_protocol_parameter_names(i_df_dict):
"""Used for rule 1011
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
for study_protocols_df in i_df_dict['s_protocols']:
@@ -301,7 +301,7 @@ def check_protocol_parameter_names(i_df_dict):
def check_study_factor_names(i_df_dict):
"""Used for rule 1012
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
for study_factors_df in i_df_dict['s_factors']:
10 changes: 5 additions & 5 deletions isatools/isatab/validate/rules/rules_30xx.py
@@ -8,7 +8,7 @@
def check_filenames_present(i_df_dict: dict) -> None:
""" Used for rule 3005
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
for s_pos, study_df in enumerate(i_df_dict['studies']):
@@ -25,7 +25,7 @@ def check_filenames_present(i_df_dict: dict) -> None:
def check_date_formats(i_df_dict):
""" Used for rule 3001
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""

@@ -61,7 +61,7 @@ def check_iso8601_date(date_str):
def check_dois(i_df_dict):
""" Used for rule 3002
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""

@@ -87,7 +87,7 @@ def check_doi(doi_str):
def check_pubmed_ids_format(i_df_dict):
""" Used for rule 3003
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""

@@ -113,7 +113,7 @@ def check_pubmed_id(pubmed_id_str):
def check_ontology_sources(i_df_dict):
""" Used for rule 3008
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
term_source_refs = []
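
Rule 3001 (`check_date_formats`, via the `check_iso8601_date` helper visible in the hunk context) verifies that date fields parse as ISO 8601. A minimal stand-in for such a check, assuming plain `YYYY-MM-DD` calendar dates (the real rule may accept more ISO 8601 variants):

```python
from datetime import datetime

def iso8601_date_ok(date_str):
    """True if date_str is a valid YYYY-MM-DD calendar date."""
    try:
        datetime.strptime(date_str, '%Y-%m-%d')
        return True
    except ValueError:
        return False

print(iso8601_date_ok('2024-03-18'))   # True
print(iso8601_date_ok('18/03/2024'))   # False
```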
