
Merge pull request #551 from ISA-tools/extended-511
Extended 511
proccaserra authored Mar 18, 2024
2 parents 7d5f19f + e1eda82 commit d7aa027
Showing 34 changed files with 1,947 additions and 375 deletions.
1,647 changes: 1,549 additions & 98 deletions isa-cookbook/content/notebooks/isa-api-programmatic-BH2023-multiomics-isa.ipynb

Large diffs are not rendered by default.

35 changes: 19 additions & 16 deletions isatools/constants.py
@@ -19,6 +19,9 @@
DATA_FILE_LABELS = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -27,16 +30,16 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]

_LABELS_DATA_NODES = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -45,9 +48,6 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]
@@ -65,16 +65,6 @@
'Data Transformation Name'
]

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]

QUALIFIER_LABELS = [
'Protocol REF',
'Material Type',
@@ -83,6 +73,19 @@
'Unit'
]

ALLOWED_NODES = NODE_LABELS.append('Protocol REF')

ALL_LABELS = NODE_LABELS + ASSAY_LABELS + QUALIFIER_LABELS

ALL_LABELS.append('Protocol REF')
ALL_LABELS.append('Label')

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]
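
A behavioral note on the new `ALLOWED_NODES` assignment above: `list.append` mutates its list in place and returns `None`, so `ALLOWED_NODES` ends up bound to `None` rather than to an extended list (the later `ALL_LABELS.append(...)` calls are fine because their return values are discarded). A minimal sketch with illustrative values:

```python
labels = ['Source Name', 'Sample Name']      # illustrative values only

allowed = labels.append('Protocol REF')      # list.append returns None
print(allowed)                               # None
print(labels)                                # ['Source Name', 'Sample Name', 'Protocol REF']

labels = ['Source Name', 'Sample Name']
allowed = labels + ['Protocol REF']          # concatenation binds the new list
print(allowed)                               # ['Source Name', 'Sample Name', 'Protocol REF']
```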
2 changes: 1 addition & 1 deletion isatools/create/model.py
@@ -3283,4 +3283,4 @@ def compute_single_arm_design_multi_element_cell(treatments, sample_assay_plan,
elements=[follow_up_map[0]]), follow_up_map[1]])
arm = StudyArm('ARM_00', group_size=group_size, arm_map=OrderedDict(arm_map))
design.add_study_arm(arm)
return design
return design
10 changes: 6 additions & 4 deletions isatools/isajson/validate.py
@@ -810,14 +810,16 @@ def check_study_groups(study_or_assay)
def validate(
fp,
config_dir=default_config_dir,
log_level=None,
log_level=logging.INFO,
base_schemas_dir="isa_model_version_1_0_schemas"
):
if config_dir is None:
config_dir = default_config_dir
if log_level in (
logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
logging.ERROR, logging.CRITICAL):
if log_level is None: #(
# logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
# logging.ERROR, logging.CRITICAL):
log.disabled = True
else:
log.setLevel(log_level)
log.info("ISA JSON Validator from ISA tools API v0.12.")
stream = StringIO()
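
With this change, the JSON validator's `log_level` defaults to `logging.INFO`, and passing `None` now disables the validator's logger outright instead of being matched against the standard level constants. A usage sketch, assuming `validate` is importable from the module shown above; `isa.json` is a placeholder path:

```python
import logging
from isatools.isajson.validate import validate  # import path assumed from the file shown

with open('isa.json') as fp:                    # placeholder file name
    report = validate(fp)                       # default: INFO-level messages

with open('isa.json') as fp:
    report = validate(fp, log_level=None)       # validator logger disabled

with open('isa.json') as fp:
    report = validate(fp, log_level=logging.DEBUG)  # any standard level still works
```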
25 changes: 13 additions & 12 deletions isatools/isatab/dump/core.py
@@ -41,7 +41,7 @@ def dump(isa_obj, output_path,
raise NameError('Investigation file must match pattern i_*.txt, got {}'.format(i_file_name))

if path.exists(output_path):
fp = open(path.join(output_path, i_file_name), 'w', encoding='utf-8')
fp = open(path.join(output_path, i_file_name), 'wb')
else:
log.debug('output_path=', i_file_name)
raise FileNotFoundError("Can't find " + output_path)
@@ -55,7 +55,7 @@

# Write ONTOLOGY SOURCE REFERENCE section
ontology_source_references_df = _build_ontology_reference_section(investigation.ontology_source_references)
fp.write('ONTOLOGY SOURCE REFERENCE\n')
fp.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))
# Need to set index_label as top left cell
ontology_source_references_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Term Source Name')
@@ -80,7 +80,7 @@
inv_df_rows.append(comment.value)
investigation_df.loc[0] = inv_df_rows
investigation_df = investigation_df.set_index('Investigation Identifier').T
fp.write('INVESTIGATION\n')
fp.write(bytearray('INVESTIGATION\n', 'utf-8'))
investigation_df.to_csv(
path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Identifier')
@@ -90,14 +90,15 @@
prefix='Investigation',
publications=investigation.publications
)
fp.write('INVESTIGATION PUBLICATIONS\n')
fp.write(bytearray('INVESTIGATION PUBLICATIONS\n', 'utf-8'))
investigation_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation PubMed ID')

# Write INVESTIGATION CONTACTS section
investigation_contacts_df = _build_contacts_section_df(
contacts=investigation.contacts)
fp.write('INVESTIGATION CONTACTS\n')
fp.write(bytearray('INVESTIGATION CONTACTS\n', 'utf-8'))

investigation_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Person Last Name')

@@ -127,40 +128,40 @@ def dump(isa_obj, output_path,
study_df_row.append(comment.value)
study_df.loc[0] = study_df_row
study_df = study_df.set_index('Study Identifier').T
fp.write('STUDY\n')
fp.write(bytearray('STUDY\n', 'utf-8'))
study_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8', index_label='Study Identifier')
study_design_descriptors_df = _build_design_descriptors_section(design_descriptors=study.design_descriptors)
fp.write('STUDY DESIGN DESCRIPTORS\n')
fp.write(bytearray('STUDY DESIGN DESCRIPTORS\n', 'utf-8'))
study_design_descriptors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Design Type')

# Write STUDY PUBLICATIONS section
study_publications_df = _build_publications_section_df(prefix='Study', publications=study.publications)
fp.write('STUDY PUBLICATIONS\n')
fp.write(bytearray('STUDY PUBLICATIONS\n', 'utf-8'))
study_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study PubMed ID')

# Write STUDY FACTORS section
study_factors_df = _build_factors_section_df(factors=study.factors)
fp.write('STUDY FACTORS\n')
fp.write(bytearray('STUDY FACTORS\n', 'utf-8'))
study_factors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Factor Name')

study_assays_df = _build_assays_section_df(assays=study.assays)
fp.write('STUDY ASSAYS\n')
fp.write(bytearray('STUDY ASSAYS\n', 'utf-8'))
study_assays_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Assay File Name')

# Write STUDY PROTOCOLS section
study_protocols_df = _build_protocols_section_df(protocols=study.protocols)
fp.write('STUDY PROTOCOLS\n')
fp.write(bytearray('STUDY PROTOCOLS\n', 'utf-8'))
study_protocols_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Protocol Name')

# Write STUDY CONTACTS section
study_contacts_df = _build_contacts_section_df(
prefix='Study', contacts=study.contacts)
fp.write('STUDY CONTACTS\n')
fp.write(bytearray('STUDY CONTACTS\n', 'utf-8'))
study_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Person Last Name')

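
The dump now opens the investigation file in binary mode and writes each section header as explicit UTF-8 bytes, so the hand-written headers and the `to_csv(..., encoding='utf-8')` appends both put bytes on the same handle. A minimal sketch of the pattern with a placeholder frame (pandas >= 1.2 is assumed, which is when `to_csv` accepted binary handles):

```python
import pandas as pd

# Placeholder section frame; the real dump builds these from the ISA model
df = pd.DataFrame({'Term Source Name': ['OBI'],
                   'Term Source Description': ['Ontology for Biomedical Investigations']})

with open('i_investigation.txt', 'wb') as fp:
    # Section headers are hand-written as UTF-8 bytes ...
    fp.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))
    # ... and pandas encodes its own output when handed a binary handle
    df.set_index('Term Source Name').T.to_csv(
        path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
        index_label='Term Source Name')
```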
5 changes: 2 additions & 3 deletions isatools/isatab/dump/write.py
@@ -59,7 +59,6 @@ def flatten(current_list):
paths = _all_end_to_end_paths(
s_graph,
[x for x in s_graph.nodes() if isinstance(s_graph.indexes[x], Source)])
log.warning(s_graph.nodes())

sample_in_path_count = 0
protocol_in_path_count = 0
@@ -220,7 +219,7 @@ def flatten(current_list):
DF = DF.replace('', nan)
DF = DF.dropna(axis=1, how='all')

with open(path.join(output_dir, study_obj.filename), 'w') as out_fp:
with open(path.join(output_dir, study_obj.filename), 'wb') as out_fp:
DF.to_csv(
path_or_buf=out_fp, index=False, sep='\t', encoding='utf-8')

@@ -480,7 +479,7 @@ def pbar(x):
DF = DF.dropna(axis=1, how='all')

with open(path.join(
output_dir, assay_obj.filename), 'w') as out_fp:
output_dir, assay_obj.filename), 'wb') as out_fp:
DF.to_csv(path_or_buf=out_fp, index=False, sep='\t',
encoding='utf-8')

3 changes: 1 addition & 2 deletions isatools/isatab/graph.py
@@ -17,7 +17,7 @@ def _all_end_to_end_paths(G, start_nodes):
num_start_nodes = len(start_nodes)
message = 'Calculating for paths for {} start nodes: '.format(
num_start_nodes)
log.info(start_nodes)
# log.info(start_nodes)
start_node = G.indexes[start_nodes[0]]
if isinstance(start_node, Source):
message = 'Calculating for paths for {} sources: '.format(
@@ -61,7 +61,6 @@ def _longest_path_and_attrs(paths, indexes):
:return: The longest path and attributes
"""
longest = (0, None)
log.info(paths)
for path in paths:
length = len(path)
for node in path:
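
For context: `_all_end_to_end_paths` enumerates every path from the study's `Source` nodes to the graph's terminal nodes, and `_longest_path_and_attrs` picks the longest; the dropped `log.info` calls dumped the full node and path lists, which is costly on large studies. A rough illustration of the traversal using networkx (illustrative only; the real code walks the ISA graph's own index structure):

```python
import networkx as nx

G = nx.DiGraph()
G.add_edges_from([
    ('source1', 'sample1'),
    ('sample1', 'extract1'),
    ('sample1', 'extract2'),
    ('extract2', 'rawfile1'),
])

starts = [n for n in G if G.in_degree(n) == 0]   # Source-like nodes
sinks = [n for n in G if G.out_degree(n) == 0]   # terminal nodes

paths = [p for s in starts for t in sinks
         for p in nx.all_simple_paths(G, s, t)]
print(max(paths, key=len))                       # the longest end-to-end path
```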
6 changes: 6 additions & 0 deletions isatools/isatab/load/ProcessSequenceFactory.py
@@ -376,6 +376,12 @@ def get_node_by_label_and_key(labl, this_key):
if comment_key not in [x.name for x in process.comments]:
process.comments.append(Comment(name=comment_key, value=str(object_series[comment_column])))

for performer in [c for c in column_group if c == 'Performer']:
process.performer = str(object_series[performer])

for date in [c for c in column_group if c == 'Date']:
process.date = str(object_series[date])

for _, object_series in DF.iterrows(): # don't drop duplicates
process_key_sequence = list()
source_node_context = None
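
The two new loops copy `Performer` and `Date` columns from a process's column group onto the Process object. A self-contained sketch of the effect on a made-up row (`Process` here is a stand-in for the isatools model class):

```python
import pandas as pd

class Process:                       # stand-in for the isatools model class
    performer = ''
    date = ''

column_group = ['Protocol REF', 'Performer', 'Date']
object_series = pd.Series({'Protocol REF': 'extraction',
                           'Performer': 'A. Smith',
                           'Date': '2024-03-18'})

process = Process()
for performer in [c for c in column_group if c == 'Performer']:
    process.performer = str(object_series[performer])
for date in [c for c in column_group if c == 'Date']:
    process.date = str(object_series[date])

print(process.performer, process.date)   # A. Smith 2024-03-18
```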
2 changes: 1 addition & 1 deletion isatools/isatab/load/core.py
@@ -390,7 +390,7 @@ def merge_study_with_assay_tables(study_file_path, assay_file_path, target_file_
log.info("Merging DataFrames...")
merged_DF = merge(study_DF, assay_DF, on='Sample Name')
log.info("Writing merged DataFrame to file %s", target_file_path)
with open(target_file_path, 'w', encoding='utf-8') as fp:
with open(target_file_path, 'wb') as fp:
merged_DF.to_csv(fp, sep='\t', index=False, header=study_DF.isatab_header + assay_DF.isatab_header[1:])


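
`merge_study_with_assay_tables` joins a parsed study table to an assay table on `Sample Name` and, as with the dump code above, now writes through a binary handle. A sketch with toy frames (the real call also passes a rebuilt ISA-Tab header to `to_csv`):

```python
import pandas as pd

# Toy stand-ins for a parsed s_*.txt / a_*.txt pair
study_DF = pd.DataFrame({'Source Name': ['src1'], 'Sample Name': ['sam1']})
assay_DF = pd.DataFrame({'Sample Name': ['sam1'], 'Raw Data File': ['run1.raw']})

merged_DF = pd.merge(study_DF, assay_DF, on='Sample Name')

with open('merged.txt', 'wb') as fp:   # binary handle, matching the change above
    merged_DF.to_csv(fp, sep='\t', index=False)
```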
7 changes: 5 additions & 2 deletions isatools/isatab/validate/core.py
@@ -3,6 +3,7 @@

from os import path
from glob import glob
import logging

from pandas.errors import ParserError

@@ -45,7 +46,7 @@ def check_labels(section, labels_expected, df):
if _RX_COMMENT.match(label) is None:
msg = "Invalid label found in investigation file"
spl = "In {} section, label {} is not allowed".format(section, label)
message_handler.add_error(message=msg, supplemental=spl, code= 5)
message_handler.add_error(message=msg, supplemental=spl, code=5)
elif len(_RX_COMMENT.findall(label)) == 0:
spl = "In {} section, label {} is missing a name".format(section, label)
msg = "Missing name in Comment[] label"
@@ -179,8 +180,10 @@ def validate(fp: TextIO,
:param log_level: optional log level (default: INFO)
:return: a dictionary of the validation results (errors, warnings and info)
"""
if not log_level:
if log_level is None:
log.disabled = True
else:
log.setLevel(log_level)
message_handler.reset_store()
validated = False

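
In `check_labels`, a label outside the expected set must look like `Comment[name]`: `_RX_COMMENT.match` rejects anything else (reported with `code=5`), and a second test flags a `Comment[]` with no name. The pattern itself is defined elsewhere; a self-contained sketch assuming it has roughly this shape:

```python
import re

# Assumed shape of _RX_COMMENT; the module's actual pattern is not shown in this hunk
_RX_COMMENT = re.compile(r'Comment\[(.*?)\]')

def classify(label):
    match = _RX_COMMENT.match(label)
    if match is None:
        return 'invalid label'                      # -> add_error(..., code=5)
    if match.group(1) == '':
        return 'missing name in Comment[] label'
    return 'comment named {!r}'.format(match.group(1))

for label in ['Comment[batch]', 'Comment[]', 'Bogus Label']:
    print(label, '->', classify(label))
```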
12 changes: 6 additions & 6 deletions isatools/isatab/validate/rules/rules_10xx.py
@@ -12,7 +12,7 @@
def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context):
"""Checks if samples found in assay tables are found in the study-sample table
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -43,7 +43,7 @@ def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context):
def check_study_factor_usage(i_df_dict, dir_context):
"""Used for rules 1008 and 1021
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -112,7 +112,7 @@ def check_study_factor_usage(i_df_dict, dir_context):
def check_protocol_usage(i_df_dict, dir_context):
"""Used for rules 1007 and 1019
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -186,7 +186,7 @@ def check_protocol_usage(i_df_dict, dir_context):
def check_protocol_parameter_usage(i_df_dict, dir_context):
"""Used for rules 1009 and 1020
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
@@ -282,7 +282,7 @@ def check_protocol_names(i_df_dict):
def check_protocol_parameter_names(i_df_dict):
"""Used for rule 1011
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
for study_protocols_df in i_df_dict['s_protocols']:
@@ -301,7 +301,7 @@ def check_protocol_parameter_names(i_df_dict):
def check_study_factor_names(i_df_dict):
"""Used for rule 1012
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
for study_factors_df in i_df_dict['s_factors']:
10 changes: 5 additions & 5 deletions isatools/isatab/validate/rules/rules_30xx.py
@@ -8,7 +8,7 @@
def check_filenames_present(i_df_dict: dict) -> None:
""" Used for rule 3005
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
for s_pos, study_df in enumerate(i_df_dict['studies']):
@@ -25,7 +25,7 @@ def check_filenames_present(i_df_dict: dict) -> None:
def check_date_formats(i_df_dict):
""" Used for rule 3001
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""

@@ -61,7 +61,7 @@ def check_iso8601_date(date_str):
def check_dois(i_df_dict):
""" Used for rule 3002
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""

@@ -87,7 +87,7 @@ def check_doi(doi_str):
def check_pubmed_ids_format(i_df_dict):
""" Used for rule 3003
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""

@@ -113,7 +113,7 @@ def check_pubmed_id(pubmed_id_str):
def check_ontology_sources(i_df_dict):
""" Used for rule 3008
:param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param i_df_dict: A dictionary of DataFrame and list of Dataframes representing the Investigation file
:return: None
"""
term_source_refs = []
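
Rule 3001 (`check_date_formats`, via the `check_iso8601_date` helper visible in the hunk context) verifies that date fields parse as ISO 8601. A minimal stand-in for such a check, assuming plain `YYYY-MM-DD` calendar dates (the real rule may accept more ISO 8601 variants):

```python
from datetime import datetime

def iso8601_date_ok(date_str):
    """True if date_str is a valid YYYY-MM-DD calendar date."""
    try:
        datetime.strptime(date_str, '%Y-%m-%d')
        return True
    except ValueError:
        return False

print(iso8601_date_ok('2024-03-18'))   # True
print(iso8601_date_ok('18/03/2024'))   # False
```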
