From 1106a149f25d628177b7a6afbaaf7ce119f0433f Mon Sep 17 00:00:00 2001 From: chris-s-friedman Date: Tue, 21 May 2024 16:45:51 -0400 Subject: [PATCH 1/6] =?UTF-8?q?=E2=9C=A8=20Modify=20biospecimen=5Fgroup=20?= =?UTF-8?q?to=20ingest=20into=20sample=20in=20kf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kf_lib_data_ingest/common/concept_schema.py | 27 +- .../kids_first_dataservice.py | 287 ++++++++---------- 2 files changed, 154 insertions(+), 160 deletions(-) diff --git a/kf_lib_data_ingest/common/concept_schema.py b/kf_lib_data_ingest/common/concept_schema.py index 47195347..4046de59 100644 --- a/kf_lib_data_ingest/common/concept_schema.py +++ b/kf_lib_data_ingest/common/concept_schema.py @@ -146,7 +146,24 @@ class EVENT_AGE(QuantityMixin): pass class BIOSPECIMEN_GROUP(PropertyMixin): - pass + TISSUE_TYPE = None + NCIT_TISSUE_TYPE_ID = None + ANATOMY_SITE = None + NCIT_ANATOMY_SITE_ID = None + UBERON_ANATOMY_SITE_ID = None + COMPOSITION = None + EVENT_ID = None + EVENT_AGE_DAYS = None + + class EVENT_AGE(QuantityMixin): + pass + + class VOLUME(QuantityMixin): + pass + + VOLUME_UL = None + SAMPLE_PROCUREMENT = None + PRESERVATION_METHOD = None class BIOSPECIMEN(PropertyMixin): TISSUE_TYPE = None @@ -247,18 +264,14 @@ def compile_schema(): property_path = [] property_paths = set() - _set_cls_attrs( - CONCEPT, None, property_path, property_paths, include_root=False - ) + _set_cls_attrs(CONCEPT, None, property_path, property_paths, include_root=False) return property_paths str_to_CONCEPT = {} -def _set_cls_attrs( - node, prev_node, property_path, property_paths, include_root=False -): +def _set_cls_attrs(node, prev_node, property_path, property_paths, include_root=False): """ Recursive method to traverse a class hierarchy and set class attributes equal to a string which represents a path in the hierarchy to reach the diff --git a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py 
b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py index edd093fe..6108be9d 100644 --- a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py +++ b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py @@ -84,12 +84,8 @@ def build_entity(cls, record, get_target_id_from_record): secondary_components = { "kf_id": get_target_id_from_record(cls, record), "visible": record.get(CONCEPT.INVESTIGATOR.VISIBLE), - "visibility_comment": record.get( - CONCEPT.INVESTIGATOR.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.INVESTIGATOR.VISIBILTIY_REASON - ), + "visibility_comment": record.get(CONCEPT.INVESTIGATOR.VISIBILITY_COMMENT), + "visibility_reason": record.get(CONCEPT.INVESTIGATOR.VISIBILTIY_REASON), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -109,9 +105,7 @@ class Study: @classmethod def get_key_components(cls, record, get_target_id_from_record): - kfid = record.get(cls.target_id_concept) or record.get( - CONCEPT.PROJECT.ID - ) + kfid = record.get(cls.target_id_concept) or record.get(CONCEPT.PROJECT.ID) au = record.get(CONCEPT.STUDY.AUTHORITY) id = record.get(CONCEPT.STUDY.ID) assert (au and id) or kfid @@ -123,9 +117,7 @@ def query_target_ids(cls, host, key_components): if kfid: return [kfid] else: - return list( - yield_kfids(host, cls.api_path, drop_none(key_components)) - ) + return list(yield_kfids(host, cls.api_path, drop_none(key_components))) @classmethod def build_entity(cls, record, get_target_id_from_record): @@ -220,17 +212,11 @@ def build_entity(cls, record, get_target_id_from_record): "ethnicity": record.get(CONCEPT.PARTICIPANT.ETHNICITY), "gender": record.get(CONCEPT.PARTICIPANT.GENDER), "race": record.get(CONCEPT.PARTICIPANT.RACE), - "affected_status": record.get( - CONCEPT.PARTICIPANT.IS_AFFECTED_UNDER_STUDY - ), + "affected_status": record.get(CONCEPT.PARTICIPANT.IS_AFFECTED_UNDER_STUDY), "species": record.get(CONCEPT.PARTICIPANT.SPECIES), "visible": 
record.get(CONCEPT.PARTICIPANT.VISIBLE), - "visibility_comment": record.get( - CONCEPT.PARTICIPANT.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.PARTICIPANT.VISIBILTIY_REASON - ), + "visibility_comment": record.get(CONCEPT.PARTICIPANT.VISIBILITY_COMMENT), + "visibility_reason": record.get(CONCEPT.PARTICIPANT.VISIBILTIY_REASON), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -251,9 +237,7 @@ class Diagnosis: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "participant_id": not_none( - get_target_id_from_record(Participant, record) - ), + "participant_id": not_none(get_target_id_from_record(Participant, record)), "source_text_diagnosis": not_none(record[CONCEPT.DIAGNOSIS.NAME]), "age_at_event_days": flexible_age( record, @@ -270,26 +254,18 @@ def query_target_ids(cls, host, key_components): def build_entity(cls, record, get_target_id_from_record): secondary_components = { "kf_id": get_target_id_from_record(cls, record), - "source_text_tumor_location": record.get( - CONCEPT.DIAGNOSIS.TUMOR_LOCATION - ), + "source_text_tumor_location": record.get(CONCEPT.DIAGNOSIS.TUMOR_LOCATION), "mondo_id_diagnosis": record.get(CONCEPT.DIAGNOSIS.MONDO_ID), "icd_id_diagnosis": record.get(CONCEPT.DIAGNOSIS.ICD_ID), "uberon_id_tumor_location": record.get( CONCEPT.DIAGNOSIS.UBERON_TUMOR_LOCATION_ID ), "ncit_id_diagnosis": record.get(CONCEPT.DIAGNOSIS.NCIT_ID), - "spatial_descriptor": record.get( - CONCEPT.DIAGNOSIS.SPATIAL_DESCRIPTOR - ), + "spatial_descriptor": record.get(CONCEPT.DIAGNOSIS.SPATIAL_DESCRIPTOR), "diagnosis_category": record.get(CONCEPT.DIAGNOSIS.CATEGORY), "visible": record.get(CONCEPT.DIAGNOSIS.VISIBLE), - "visibility_comment": record.get( - CONCEPT.DIAGNOSIS.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.DIAGNOSIS.VISIBILTIY_REASON - ), + "visibility_comment": record.get(CONCEPT.DIAGNOSIS.VISIBILITY_COMMENT), + "visibility_reason": 
record.get(CONCEPT.DIAGNOSIS.VISIBILTIY_REASON), "external_id": record.get(CONCEPT.DIAGNOSIS.ID), } return { @@ -311,9 +287,7 @@ class Phenotype: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "participant_id": not_none( - get_target_id_from_record(Participant, record) - ), + "participant_id": not_none(get_target_id_from_record(Participant, record)), "source_text_phenotype": not_none(record[CONCEPT.PHENOTYPE.NAME]), "observed": not_none(record[CONCEPT.PHENOTYPE.OBSERVED]), "age_at_event_days": flexible_age( @@ -334,12 +308,8 @@ def build_entity(cls, record, get_target_id_from_record): "hpo_id_phenotype": record.get(CONCEPT.PHENOTYPE.HPO_ID), "snomed_id_phenotype": record.get(CONCEPT.PHENOTYPE.SNOMED_ID), "visible": record.get(CONCEPT.PHENOTYPE.VISIBLE), - "visibility_comment": record.get( - CONCEPT.PHENOTYPE.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.PHENOTYPE.VISIBILTIY_REASON - ), + "visibility_comment": record.get(CONCEPT.PHENOTYPE.VISIBILITY_COMMENT), + "visibility_reason": record.get(CONCEPT.PHENOTYPE.VISIBILTIY_REASON), "external_id": record.get(CONCEPT.PHENOTYPE.ID), } return { @@ -382,9 +352,7 @@ def get_key_components(cls, record, get_target_id_from_record): # Skip anything without a status, but don't consider it a key field. 
not_none(record[CONCEPT.OUTCOME.VITAL_STATUS]) return { - "participant_id": not_none( - get_target_id_from_record(Participant, record) - ) + "participant_id": not_none(get_target_id_from_record(Participant, record)) } @classmethod @@ -407,9 +375,7 @@ def build_entity(cls, record, get_target_id_from_record): "kf_id": get_target_id_from_record(cls, record), "disease_related": record.get(CONCEPT.OUTCOME.DISEASE_RELATED), "visible": record.get(CONCEPT.OUTCOME.VISIBLE), - "visibility_comment": record.get( - CONCEPT.OUTCOME.VISIBILITY_COMMENT - ), + "visibility_comment": record.get(CONCEPT.OUTCOME.VISIBILITY_COMMENT), "visibility_reason": record.get(CONCEPT.OUTCOME.VISIBILTIY_REASON), "age_at_event_days": flexible_age( record, @@ -429,6 +395,58 @@ def submit(cls, host, body): return submit(host, cls, body) +class BiospecimenGroup: + class_name = "sample" + api_path = "samples" + target_id_concept = CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID + service_id_fields = {"kf_id", "participant_id"} + + @classmethod + def get_key_components(cls, record, get_target_id_from_record): + return { + "study_id": get_target_id_from_record(Study, record), + "external_id": not_none(record[CONCEPT.BIOSPECIMEN_GROUP.ID]), + } + + @classmethod + def query_target_ids(cls, host, key_components): + return list(yield_kfids(host, cls.api_path, drop_none(key_components))) + + @classmethod + def build_entity(cls, record, get_target_id_from_record): + secondary_components = { + "kf_id": get_target_id_from_record(cls, record), + "age_at_event_days": flexible_age( + record, + CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE_DAYS, + CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE, + ), + "anatomical_location": record.get(CONCEPT.BIOSPECIMEN_GROUP.ANATOMY_SITE), + "method_of_sample_procurement": record.get( + CONCEPT.BIOSPECIMEN_GROUP.SAMPLE_PROCUREMENT + ), + "participant_id": not_none(get_target_id_from_record(Participant, record)), + "preservation_method": record.get( + CONCEPT.BIOSPECIMEN_GROUP.PRESERVATION_METHOD + ), + 
"sample_event_key": record.get(CONCEPT.BIOSPECIMEN_GROUP.EVENT_ID), + "sample_type": record.get(CONCEPT.BIOSPECIMEN_GROUP.COMPOSITION), + "tissue_type": record.get(CONCEPT.BIOSPECIMEN_GROUP.TISSUE_TYPE), + "visible": record.get(CONCEPT.BIOSPECIMEN_GROUP.VISIBLE), + "visibility_comment": record.get( + CONCEPT.BIOSPECIMEN_GROUP.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.BIOSPECIMEN_GROUP.VISIBILTIY_REASON + ), + "volume_ul": record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL), + } + return { + **cls.get_key_components(record, get_target_id_from_record), + **secondary_components, + } + + class Biospecimen: class_name = "biospecimen" api_path = "biospecimens" @@ -453,56 +471,67 @@ def build_entity(cls, record, get_target_id_from_record): "sequencing_center_id": record.get( CONCEPT.SEQUENCING.CENTER.TARGET_SERVICE_ID ), - "participant_id": not_none( - get_target_id_from_record(Participant, record) - ), + "participant_id": not_none(get_target_id_from_record(Participant, record)), "external_sample_id": ( record.get(CONCEPT.BIOSPECIMEN_GROUP.ID) or not_none(record[CONCEPT.BIOSPECIMEN.ID]) ), - "source_text_tissue_type": record.get( - CONCEPT.BIOSPECIMEN.TISSUE_TYPE + "source_text_tissue_type": ( + record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.TISSUE_TYPE) ), - "composition": record.get(CONCEPT.BIOSPECIMEN.COMPOSITION), - "source_text_anatomical_site": record.get( - CONCEPT.BIOSPECIMEN.ANATOMY_SITE + "composition": ( + record.get(CONCEPT.BIOSPECIMEN.COMPOSITION) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.COMPOSITION) ), - "age_at_event_days": flexible_age( - record, - CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, - CONCEPT.BIOSPECIMEN.EVENT_AGE, + "source_text_anatomical_site": ( + record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.ANATOMY_SITE) + ), + "age_at_event_days": ( + flexible_age( + record, + CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, + CONCEPT.BIOSPECIMEN.EVENT_AGE, + ) + or flexible_age( + 
record, + CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE_DAYS, + CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE, + ) ), "source_text_tumor_descriptor": record.get( CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR ), - "ncit_id_tissue_type": record.get( - CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID - ), - "ncit_id_anatomical_site": record.get( - CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID + "ncit_id_tissue_type": ( + record.get(CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.NCIT_TISSUE_TYPE_ID) ), - "uberon_id_anatomical_site": record.get( - CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID + "ncit_id_anatomical_site": ( + record.get(CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.NCIT_ANATOMY_SITE_ID) ), - "spatial_descriptor": record.get( - CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR + "uberon_id_anatomical_site": ( + record.get(CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.UBERON_ANATOMY_SITE_ID) ), + "spatial_descriptor": record.get(CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR), "shipment_origin": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_ORIGIN), "shipment_date": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_DATE), "analyte_type": record.get(CONCEPT.BIOSPECIMEN.ANALYTE), "concentration_mg_per_ml": record.get( CONCEPT.BIOSPECIMEN.CONCENTRATION_MG_PER_ML ), - "volume_ul": record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL), - "visible": record.get(CONCEPT.BIOSPECIMEN.VISIBLE), - "visibility_comment": record.get( - CONCEPT.BIOSPECIMEN.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.BIOSPECIMEN.VISIBILTIY_REASON + "volume_ul": ( + record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.VOLUME_UL) ), - "method_of_sample_procurement": record.get( - CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT + "visible": record.get(CONCEPT.BIOSPECIMEN.VISIBLE), + "visibility_comment": record.get(CONCEPT.BIOSPECIMEN.VISIBILITY_COMMENT), + "visibility_reason": 
record.get(CONCEPT.BIOSPECIMEN.VISIBILTIY_REASON), + "method_of_sample_procurement": ( + record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.SAMPLE_PROCUREMENT) ), "dbgap_consent_code": record.get( CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE @@ -568,25 +597,17 @@ def hashes(record): "urls": str_to_obj(record.get(CONCEPT.GENOMIC_FILE.URL_LIST)), "acl": [], "authz": str_to_obj(record.get(CONCEPT.GENOMIC_FILE.ACL)), - "reference_genome": record.get( - CONCEPT.GENOMIC_FILE.REFERENCE_GENOME - ), + "reference_genome": record.get(CONCEPT.GENOMIC_FILE.REFERENCE_GENOME), "worflow_type": record.get(CONCEPT.GENOMIC_FILE.WORKFLOW_TYPE), "worflow_tool": record.get(CONCEPT.GENOMIC_FILE.WORKFLOW_TOOL), - "workflow_version": record.get( - CONCEPT.GENOMIC_FILE.WORKFLOW_VERSION - ), + "workflow_version": record.get(CONCEPT.GENOMIC_FILE.WORKFLOW_VERSION), "data_category": record.get(CONCEPT.GENOMIC_FILE.DATA_CATEGORY), "file_version_descriptor": record.get( CONCEPT.GENOMIC_FILE.FILE_VERSION_DESCRIPTOR ), "visible": record.get(CONCEPT.GENOMIC_FILE.VISIBLE), - "visibility_comment": record.get( - CONCEPT.GENOMIC_FILE.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.GENOMIC_FILE.VISIBILTIY_REASON - ), + "visibility_comment": record.get(CONCEPT.GENOMIC_FILE.VISIBILITY_COMMENT), + "visibility_reason": record.get(CONCEPT.GENOMIC_FILE.VISIBILTIY_REASON), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -623,12 +644,8 @@ def build_entity(cls, record, get_target_id_from_record): "kf_id": get_target_id_from_record(cls, record), "quality_scale": record.get(CONCEPT.READ_GROUP.QUALITY_SCALE), "visible": record.get(CONCEPT.READ_GROUP.VISIBLE), - "visibility_comment": record.get( - CONCEPT.READ_GROUP.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.READ_GROUP.VISIBILTIY_REASON - ), + "visibility_comment": record.get(CONCEPT.READ_GROUP.VISIBILITY_COMMENT), + "visibility_reason": 
record.get(CONCEPT.READ_GROUP.VISIBILTIY_REASON), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -669,9 +686,7 @@ def build_entity(cls, record, get_target_id_from_record): "experiment_strategy": record.get(CONCEPT.SEQUENCING.STRATEGY), "library_strand": record.get(CONCEPT.SEQUENCING.LIBRARY_STRAND), "library_prep": record.get(CONCEPT.SEQUENCING.LIBRARY_PREP), - "library_selection": record.get( - CONCEPT.SEQUENCING.LIBRARY_SELECTION - ), + "library_selection": record.get(CONCEPT.SEQUENCING.LIBRARY_SELECTION), "is_paired_end": record.get(CONCEPT.SEQUENCING.PAIRED_END), "platform": record.get(CONCEPT.SEQUENCING.PLATFORM), "instrument_model": record.get(CONCEPT.SEQUENCING.INSTRUMENT), @@ -680,23 +695,13 @@ def build_entity(cls, record, get_target_id_from_record): "mean_depth": record.get(CONCEPT.SEQUENCING.MEAN_DEPTH), "total_reads": record.get(CONCEPT.SEQUENCING.TOTAL_READS), "mean_read_length": record.get(CONCEPT.SEQUENCING.MEAN_READ_LENGTH), - "target_capture_kit": record.get( - CONCEPT.SEQUENCING.TARGET_CAPTURE_KIT - ), + "target_capture_kit": record.get(CONCEPT.SEQUENCING.TARGET_CAPTURE_KIT), "read_pair_number": record.get(CONCEPT.SEQUENCING.READ_PAIR_NUMBER), - "is_adapter_trimmed": record.get( - CONCEPT.SEQUENCING.IS_ADAPTER_TRIMMED - ), - "adapter_sequencing": record.get( - CONCEPT.SEQUENCING.ADAPTER_SEQUENCING - ), + "is_adapter_trimmed": record.get(CONCEPT.SEQUENCING.IS_ADAPTER_TRIMMED), + "adapter_sequencing": record.get(CONCEPT.SEQUENCING.ADAPTER_SEQUENCING), "visible": record.get(CONCEPT.SEQUENCING.VISIBLE), - "visibility_comment": record.get( - CONCEPT.SEQUENCING.VISIBILITY_COMMENT - ), - "visibility_reason": record.get( - CONCEPT.SEQUENCING.VISIBILTIY_REASON - ), + "visibility_comment": record.get(CONCEPT.SEQUENCING.VISIBILITY_COMMENT), + "visibility_reason": record.get(CONCEPT.SEQUENCING.VISIBILTIY_REASON), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -720,13 +725,9 @@ def 
transform_records_list(cls, records_list): original = DataFrame(records_list) # Convert participant, mother, father to generic family relationships - df = convert_relationships_to_p1p2( - original, infer_genders=True, bidirect=True - ) + df = convert_relationships_to_p1p2(original, infer_genders=True, bidirect=True) # Add back in the visibility info from original df - vis_df = original[ - [FR.PERSON1.ID, FR.VISIBILTIY_REASON, FR.VISIBILITY_COMMENT] - ] + vis_df = original[[FR.PERSON1.ID, FR.VISIBILTIY_REASON, FR.VISIBILITY_COMMENT]] df = merge(df, vis_df, how="left", on=FR.PERSON1.ID) return df.to_dict("records") @@ -796,12 +797,8 @@ class BiospecimenGenomicFile: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "biospecimen_id": not_none( - get_target_id_from_record(Biospecimen, record) - ), - "genomic_file_id": not_none( - get_target_id_from_record(GenomicFile, record) - ), + "biospecimen_id": not_none(get_target_id_from_record(Biospecimen, record)), + "genomic_file_id": not_none(get_target_id_from_record(GenomicFile, record)), } @classmethod @@ -842,12 +839,8 @@ class BiospecimenDiagnosis: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "biospecimen_id": not_none( - get_target_id_from_record(Biospecimen, record) - ), - "diagnosis_id": not_none( - get_target_id_from_record(Diagnosis, record) - ), + "biospecimen_id": not_none(get_target_id_from_record(Biospecimen, record)), + "diagnosis_id": not_none(get_target_id_from_record(Diagnosis, record)), } @classmethod @@ -888,12 +881,8 @@ class ReadGroupGenomicFile: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "read_group_id": not_none( - get_target_id_from_record(ReadGroup, record) - ), - "genomic_file_id": not_none( - get_target_id_from_record(GenomicFile, record) - ), + "read_group_id": not_none(get_target_id_from_record(ReadGroup, record)), + "genomic_file_id": 
not_none(get_target_id_from_record(GenomicFile, record)), } @classmethod @@ -937,9 +926,7 @@ def get_key_components(cls, record, get_target_id_from_record): "sequencing_experiment_id": not_none( get_target_id_from_record(SequencingExperiment, record) ), - "genomic_file_id": not_none( - get_target_id_from_record(GenomicFile, record) - ), + "genomic_file_id": not_none(get_target_id_from_record(GenomicFile, record)), } @classmethod @@ -1067,9 +1054,7 @@ def coerce_types(host, entity_class, body): seen_overmax_int[entity_class.class_name] = set() if k not in seen_overmax_int[entity_class.class_name]: try: - max_value = ( - 2 ** (int(properties[k]["format"][-2:]) - 1) - ) - 1 + max_value = (2 ** (int(properties[k]["format"][-2:]) - 1)) - 1 if v > max_value: logger.info( f"The server indicates that {entity_class.class_name}" @@ -1133,12 +1118,8 @@ def submit(host, entity_class, body): resp = _GET(host, api_path, body) result = resp.json()["results"][0] if extid != result["external_id"]: - resp = _PATCH( - host, api_path, result["kf_id"], {"external_id": extid} - ) + resp = _PATCH(host, api_path, result["kf_id"], {"external_id": extid}) result = resp.json()["results"] return result["kf_id"] else: - raise RequestException( - f"Sent to /{api_path}:\n{body}\nGot:\n{resp.text}" - ) + raise RequestException(f"Sent to /{api_path}:\n{body}\nGot:\n{resp.text}") From 0ad1e1ff3d2589ee1197632d2a655ae0b74a7749 Mon Sep 17 00:00:00 2001 From: chris-s-friedman Date: Tue, 21 May 2024 16:50:00 -0400 Subject: [PATCH 2/6] =?UTF-8?q?=E2=9C=A8=20Allow=20connecting=20biospecime?= =?UTF-8?q?n=20to=20sample?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py index 6108be9d..df84c4f9 100644 --- 
a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py +++ b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py @@ -537,6 +537,7 @@ def build_entity(cls, record, get_target_id_from_record): CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE ), "consent_type": record.get(CONCEPT.BIOSPECIMEN.CONSENT_SHORT_NAME), + "sample_id": get_target_id_from_record(BiospecimenGroup, record), } return { **cls.get_key_components(record, get_target_id_from_record), From d6126f60a42e052a2dc3437f489095277e3cab0a Mon Sep 17 00:00:00 2001 From: chris-s-friedman Date: Tue, 21 May 2024 16:55:53 -0400 Subject: [PATCH 3/6] =?UTF-8?q?=E2=9C=A8=20Actually=20ingest=20sample?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../target_api_plugins/kids_first_dataservice.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py index df84c4f9..63e1aefd 100644 --- a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py +++ b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py @@ -395,7 +395,7 @@ def submit(cls, host, body): return submit(host, cls, body) -class BiospecimenGroup: +class Sample: class_name = "sample" api_path = "samples" target_id_concept = CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID @@ -537,7 +537,7 @@ def build_entity(cls, record, get_target_id_from_record): CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE ), "consent_type": record.get(CONCEPT.BIOSPECIMEN.CONSENT_SHORT_NAME), - "sample_id": get_target_id_from_record(BiospecimenGroup, record), + "sample_id": get_target_id_from_record(Sample, record), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -991,6 +991,7 @@ def _GET(host, api_path, body): Diagnosis, Phenotype, Outcome, + Sample, Biospecimen, GenomicFile, ReadGroup, From d5bbddf3548c92629fff0b02607a08aed537c0d2 Mon Sep 17 
00:00:00 2001 From: chris-s-friedman Date: Thu, 23 May 2024 11:47:37 -0400 Subject: [PATCH 4/6] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Make=20sample=20its=20?= =?UTF-8?q?own=20concept=20and=20allow=20sample=20or=20biospecimen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🚨 Don't mess up formmatting 🚨 Don't mess up formmatting point to sample when building sample key concept 🐛 Add submit function to sample --- kf_lib_data_ingest/common/concept_schema.py | 35 +++-- .../kids_first_dataservice.py | 123 +++++++++++------- 2 files changed, 92 insertions(+), 66 deletions(-) diff --git a/kf_lib_data_ingest/common/concept_schema.py b/kf_lib_data_ingest/common/concept_schema.py index 4046de59..3370489e 100644 --- a/kf_lib_data_ingest/common/concept_schema.py +++ b/kf_lib_data_ingest/common/concept_schema.py @@ -146,12 +146,16 @@ class EVENT_AGE(QuantityMixin): pass class BIOSPECIMEN_GROUP(PropertyMixin): + pass + + class SAMPLE(PropertyMixin): TISSUE_TYPE = None NCIT_TISSUE_TYPE_ID = None ANATOMY_SITE = None NCIT_ANATOMY_SITE_ID = None UBERON_ANATOMY_SITE_ID = None COMPOSITION = None + TUMOR_DESCRIPTOR = None EVENT_ID = None EVENT_AGE_DAYS = None @@ -161,36 +165,22 @@ class EVENT_AGE(QuantityMixin): class VOLUME(QuantityMixin): pass + SPATIAL_DESCRIPTOR = None + SHIPMENT_ORIGIN = None + SHIPMENT_DATE = None VOLUME_UL = None SAMPLE_PROCUREMENT = None PRESERVATION_METHOD = None - class BIOSPECIMEN(PropertyMixin): - TISSUE_TYPE = None - NCIT_TISSUE_TYPE_ID = None - ANATOMY_SITE = None - NCIT_ANATOMY_SITE_ID = None - UBERON_ANATOMY_SITE_ID = None - TUMOR_DESCRIPTOR = None - COMPOSITION = None - EVENT_AGE_DAYS = None - - class EVENT_AGE(QuantityMixin): - pass - + class BIOSPECIMEN(SAMPLE): class QUANTITY(QuantityMixin): pass class CONCENTRATION(QuantityMixin): pass - SPATIAL_DESCRIPTOR = None - SHIPMENT_ORIGIN = None - SHIPMENT_DATE = None ANALYTE = None CONCENTRATION_MG_PER_ML = None - VOLUME_UL = None - SAMPLE_PROCUREMENT = None 
DBGAP_STYLE_CONSENT_CODE = None CONSENT_SHORT_NAME = None @@ -264,14 +254,18 @@ def compile_schema(): property_path = [] property_paths = set() - _set_cls_attrs(CONCEPT, None, property_path, property_paths, include_root=False) + _set_cls_attrs( + CONCEPT, None, property_path, property_paths, include_root=False + ) return property_paths str_to_CONCEPT = {} -def _set_cls_attrs(node, prev_node, property_path, property_paths, include_root=False): +def _set_cls_attrs( + node, prev_node, property_path, property_paths, include_root=False +): """ Recursive method to traverse a class hierarchy and set class attributes equal to a string which represents a path in the hierarchy to reach the @@ -355,6 +349,7 @@ class C: CONCEPT.FAMILY_RELATIONSHIP.PERSON2, CONCEPT.BIOSPECIMEN_GROUP, CONCEPT.BIOSPECIMEN, + CONCEPT.SAMPLE, CONCEPT.DIAGNOSIS, CONCEPT.PHENOTYPE, CONCEPT.DIAGNOSIS, diff --git a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py index 63e1aefd..56d1c2ff 100644 --- a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py +++ b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py @@ -398,14 +398,16 @@ def submit(cls, host, body): class Sample: class_name = "sample" api_path = "samples" - target_id_concept = CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID + target_id_concept = ( + CONCEPT.SAMPLE.TARGET_SERVICE_ID or CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID + ) service_id_fields = {"kf_id", "participant_id"} @classmethod def get_key_components(cls, record, get_target_id_from_record): return { "study_id": get_target_id_from_record(Study, record), - "external_id": not_none(record[CONCEPT.BIOSPECIMEN_GROUP.ID]), + "external_id": not_none(record[CONCEPT.SAMPLE.ID]), } @classmethod @@ -416,36 +418,54 @@ def query_target_ids(cls, host, key_components): def build_entity(cls, record, get_target_id_from_record): secondary_components = { "kf_id": get_target_id_from_record(cls, 
record), - "age_at_event_days": flexible_age( - record, - CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE_DAYS, - CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE, + "age_at_event_days": ( + flexible_age( + record, + CONCEPT.SAMPLE.EVENT_AGE_DAYS, + CONCEPT.SAMPLE.EVENT_AGE, + ) + or flexible_age( + record, + CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, + CONCEPT.BIOSPECIMEN.EVENT_AGE, + ) + ), + "anatomical_location": ( + record.get(CONCEPT.SAMPLE.ANATOMY_SITE) + or record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE) ), - "anatomical_location": record.get(CONCEPT.BIOSPECIMEN_GROUP.ANATOMY_SITE), - "method_of_sample_procurement": record.get( - CONCEPT.BIOSPECIMEN_GROUP.SAMPLE_PROCUREMENT + "method_of_sample_procurement": ( + record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT) + or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) ), "participant_id": not_none(get_target_id_from_record(Participant, record)), - "preservation_method": record.get( - CONCEPT.BIOSPECIMEN_GROUP.PRESERVATION_METHOD + "preservation_method": record.get(CONCEPT.SAMPLE.PRESERVATION_METHOD), + "sample_event_key": record.get(CONCEPT.SAMPLE.EVENT_ID), + "sample_type": ( + record.get(CONCEPT.SAMPLE.COMPOSITION) + or record.get(CONCEPT.BIOSPECIMEN.COMPOSITION) ), - "sample_event_key": record.get(CONCEPT.BIOSPECIMEN_GROUP.EVENT_ID), - "sample_type": record.get(CONCEPT.BIOSPECIMEN_GROUP.COMPOSITION), - "tissue_type": record.get(CONCEPT.BIOSPECIMEN_GROUP.TISSUE_TYPE), - "visible": record.get(CONCEPT.BIOSPECIMEN_GROUP.VISIBLE), - "visibility_comment": record.get( - CONCEPT.BIOSPECIMEN_GROUP.VISIBILITY_COMMENT + "tissue_type": ( + record.get(CONCEPT.SAMPLE.TISSUE_TYPE) + or record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE) ), - "visibility_reason": record.get( - CONCEPT.BIOSPECIMEN_GROUP.VISIBILTIY_REASON + "visible": record.get(CONCEPT.SAMPLE.VISIBLE), + "visibility_comment": record.get(CONCEPT.SAMPLE.VISIBILITY_COMMENT), + "visibility_reason": record.get(CONCEPT.SAMPLE.VISIBILTIY_REASON), + "volume_ul": ( + record.get(CONCEPT.SAMPLE.VOLUME_UL) + or 
record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) ), - "volume_ul": record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL), } return { **cls.get_key_components(record, get_target_id_from_record), **secondary_components, } + @classmethod + def submit(cls, host, body): + return submit(host, cls, body) + class Biospecimen: class_name = "biospecimen" @@ -473,65 +493,76 @@ def build_entity(cls, record, get_target_id_from_record): ), "participant_id": not_none(get_target_id_from_record(Participant, record)), "external_sample_id": ( - record.get(CONCEPT.BIOSPECIMEN_GROUP.ID) + record.get(CONCEPT.SAMPLE.ID) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.ID) or not_none(record[CONCEPT.BIOSPECIMEN.ID]) ), "source_text_tissue_type": ( - record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE) - or record.get(CONCEPT.BIOSPECIMEN_GROUP.TISSUE_TYPE) + record.get(CONCEPT.SAMPLE.TISSUE_TYPE) + or record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE) ), "composition": ( - record.get(CONCEPT.BIOSPECIMEN.COMPOSITION) - or record.get(CONCEPT.BIOSPECIMEN_GROUP.COMPOSITION) + record.get(CONCEPT.SAMPLE.COMPOSITION) + or record.get(CONCEPT.BIOSPECIMEN.COMPOSITION) ), "source_text_anatomical_site": ( - record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE) - or record.get(CONCEPT.BIOSPECIMEN_GROUP.ANATOMY_SITE) + record.get(CONCEPT.SAMPLE.ANATOMY_SITE) + or record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE) ), "age_at_event_days": ( flexible_age( record, - CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, - CONCEPT.BIOSPECIMEN.EVENT_AGE, + CONCEPT.SAMPLE.EVENT_AGE_DAYS, + CONCEPT.SAMPLE.EVENT_AGE, ) or flexible_age( record, - CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE_DAYS, - CONCEPT.BIOSPECIMEN_GROUP.EVENT_AGE, + CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, + CONCEPT.BIOSPECIMEN.EVENT_AGE, ) ), - "source_text_tumor_descriptor": record.get( - CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR + "source_text_tumor_descriptor": ( + record.get(CONCEPT.SAMPLE.TUMOR_DESCRIPTOR) + or record.get(CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR) ), "ncit_id_tissue_type": ( - 
record.get(CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID) - or record.get(CONCEPT.BIOSPECIMEN_GROUP.NCIT_TISSUE_TYPE_ID) + record.get(CONCEPT.SAMPLE.NCIT_TISSUE_TYPE_ID) + or record.get(CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID) ), "ncit_id_anatomical_site": ( - record.get(CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID) - or record.get(CONCEPT.BIOSPECIMEN_GROUP.NCIT_ANATOMY_SITE_ID) + record.get(CONCEPT.SAMPLE.NCIT_ANATOMY_SITE_ID) + or record.get(CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID) ), "uberon_id_anatomical_site": ( - record.get(CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID) - or record.get(CONCEPT.BIOSPECIMEN_GROUP.UBERON_ANATOMY_SITE_ID) + record.get(CONCEPT.SAMPLE.UBERON_ANATOMY_SITE_ID) + or record.get(CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID) + ), + "spatial_descriptor": ( + record.get(CONCEPT.SAMPLE.SPATIAL_DESCRIPTOR) + or record.get(CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR) + ), + "shipment_origin": ( + record.get(CONCEPT.SAMPLE.SHIPMENT_ORIGIN) + or record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_ORIGIN) + ), + "shipment_date": ( + record.get(CONCEPT.SAMPLE.SHIPMENT_DATE) + or record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_DATE) ), - "spatial_descriptor": record.get(CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR), - "shipment_origin": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_ORIGIN), - "shipment_date": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_DATE), "analyte_type": record.get(CONCEPT.BIOSPECIMEN.ANALYTE), "concentration_mg_per_ml": record.get( CONCEPT.BIOSPECIMEN.CONCENTRATION_MG_PER_ML ), "volume_ul": ( - record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) - or record.get(CONCEPT.BIOSPECIMEN_GROUP.VOLUME_UL) + record.get(CONCEPT.SAMPLE.VOLUME_UL) + or record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) ), "visible": record.get(CONCEPT.BIOSPECIMEN.VISIBLE), "visibility_comment": record.get(CONCEPT.BIOSPECIMEN.VISIBILITY_COMMENT), "visibility_reason": record.get(CONCEPT.BIOSPECIMEN.VISIBILTIY_REASON), "method_of_sample_procurement": ( - record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) - or 
record.get(CONCEPT.BIOSPECIMEN_GROUP.SAMPLE_PROCUREMENT) + record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT) + or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) ), "dbgap_consent_code": record.get( CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE From a956f021dbdb1469240abd3bd46411a54008a255 Mon Sep 17 00:00:00 2001 From: chris-s-friedman Date: Thu, 23 May 2024 15:26:47 -0400 Subject: [PATCH 5/6] =?UTF-8?q?=F0=9F=93=9D=20Document=20samples=20and=20s?= =?UTF-8?q?pecimens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🚧 Working on documentation for samples and biospecimens 📝 Document samples and specimens 🔥 Remove test text 🚨 Enable black linting 📝 re-word docs for better clarity Co-authored-by: Natasha Singh Update docs/source/design/concepts/samples_and_specimens.rst Co-authored-by: Natasha Singh Update docs/source/design/concepts/samples_and_specimens.rst Co-authored-by: Natasha Singh Update docs/source/design/concepts/samples_and_specimens.rst Co-authored-by: Natasha Singh Update docs/source/design/concepts/samples_and_specimens.rst Co-authored-by: Natasha Singh Update docs/source/design/concepts/samples_and_specimens.rst Co-authored-by: Natasha Singh Update docs/source/design/concepts/index.rst Co-authored-by: Natasha Singh 🚨 Keep docs under 80 character line length 🚨 Documentation line length --- docs/source/design/concepts/index.rst | 19 ++ .../design/concepts/samples_and_specimens.rst | 50 +++++ docs/source/design/overview.rst | 1 + .../kids_first_dataservice.py | 187 +++++++++++++----- 4 files changed, 210 insertions(+), 47 deletions(-) create mode 100644 docs/source/design/concepts/index.rst create mode 100644 docs/source/design/concepts/samples_and_specimens.rst diff --git a/docs/source/design/concepts/index.rst b/docs/source/design/concepts/index.rst new file mode 100644 index 00000000..1d23deff --- /dev/null +++ b/docs/source/design/concepts/index.rst @@ -0,0 +1,19 @@ +.. 
_about_concepts: + +============== +Concept Schema +============== + +A key part of the ingest library is the intermediate model that raw data is +mapped to. The intermediate model is called the "concept schema". The concept +schema is essentially just a list of column names that follow a standard that +denotes the type/concept and the attribute (e.g. `PARTICIPANT.GENDER`). Data +that has been mapped to the concept schema is later transformed into the final +schema that is used to create the tables in the target API. + + + +.. toctree:: + :maxdepth: 2 + + samples_and_specimens.rst diff --git a/docs/source/design/concepts/samples_and_specimens.rst b/docs/source/design/concepts/samples_and_specimens.rst new file mode 100644 index 00000000..f90408db --- /dev/null +++ b/docs/source/design/concepts/samples_and_specimens.rst @@ -0,0 +1,50 @@ +.. _samples_and_specimens: + +======================== +Samples and Biospecimens +======================== + +Although similarly named, samples and biospecimens refer to different concepts: + + * A **sample** represents a physical piece of tissue, blood, or other + biologically distinct material taken from a patient. + * A **biospecimen** represents a portion or a part of that sample, e.g. + an aliquot of a sample. + +While samples and biospecimens are distinct concepts, they share much in +common. In fact, when the ingest library was first written, its primary target +API, the Kids First Data Service, only had a table for biospecimens. As a +result, the ingest library's architecture provides for a biospecimen to share +*all* the qualities of a sample. In fact, biospecimen is a child class of +sample! + +This architecture allows the ingest library to be used against target APIs +that, like the older versions of the Kids First Data Service, only have a table +for biospecimens. 
+ +A sample has qualities: + + * A sample may have information about itself, such as the type of tissue it + is, the type of tumor it comes from, when the sample was collected from + the participant, its volume, etc. + * A sample may have information about shipping, such as the date it was + shipped and its shipment origin. + + +As discussed above, a biospecimen is a child class of sample, so biospecimens +may have all of the same qualities as a sample. In addition: + + * a biospecimen may have information about its concentration + * a biospecimen may have information about its analyte type (e.g. DNA vs + RNA) + * a biospecimen may have information about the consent under which it was + collected. + +Biospecimen is designed as a child class of sample to provide for +backwards-compatibility with older ingest packages that existed before the +sample concept. + +Moving forward, it is advised to use the sample class when +extracting information that is most related to the sample and use biospecimen +only when extracting information that is specific to the biospecimen +(such as concentration, analyte, and consent information). diff --git a/docs/source/design/overview.rst b/docs/source/design/overview.rst index d3cc1536..2cbd4e13 100644 --- a/docs/source/design/overview.rst +++ b/docs/source/design/overview.rst @@ -40,6 +40,7 @@ described. 
:maxdepth: 1 value_principles + concepts/index.rst extract_mapping transform load diff --git a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py index 56d1c2ff..26fbeb12 100644 --- a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py +++ b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py @@ -84,8 +84,12 @@ def build_entity(cls, record, get_target_id_from_record): secondary_components = { "kf_id": get_target_id_from_record(cls, record), "visible": record.get(CONCEPT.INVESTIGATOR.VISIBLE), - "visibility_comment": record.get(CONCEPT.INVESTIGATOR.VISIBILITY_COMMENT), - "visibility_reason": record.get(CONCEPT.INVESTIGATOR.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.INVESTIGATOR.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.INVESTIGATOR.VISIBILTIY_REASON + ), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -105,7 +109,9 @@ class Study: @classmethod def get_key_components(cls, record, get_target_id_from_record): - kfid = record.get(cls.target_id_concept) or record.get(CONCEPT.PROJECT.ID) + kfid = record.get(cls.target_id_concept) or record.get( + CONCEPT.PROJECT.ID + ) au = record.get(CONCEPT.STUDY.AUTHORITY) id = record.get(CONCEPT.STUDY.ID) assert (au and id) or kfid @@ -117,7 +123,9 @@ def query_target_ids(cls, host, key_components): if kfid: return [kfid] else: - return list(yield_kfids(host, cls.api_path, drop_none(key_components))) + return list( + yield_kfids(host, cls.api_path, drop_none(key_components)) + ) @classmethod def build_entity(cls, record, get_target_id_from_record): @@ -212,11 +220,17 @@ def build_entity(cls, record, get_target_id_from_record): "ethnicity": record.get(CONCEPT.PARTICIPANT.ETHNICITY), "gender": record.get(CONCEPT.PARTICIPANT.GENDER), "race": record.get(CONCEPT.PARTICIPANT.RACE), - "affected_status": record.get(CONCEPT.PARTICIPANT.IS_AFFECTED_UNDER_STUDY), + 
"affected_status": record.get( + CONCEPT.PARTICIPANT.IS_AFFECTED_UNDER_STUDY + ), "species": record.get(CONCEPT.PARTICIPANT.SPECIES), "visible": record.get(CONCEPT.PARTICIPANT.VISIBLE), - "visibility_comment": record.get(CONCEPT.PARTICIPANT.VISIBILITY_COMMENT), - "visibility_reason": record.get(CONCEPT.PARTICIPANT.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.PARTICIPANT.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.PARTICIPANT.VISIBILTIY_REASON + ), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -237,7 +251,9 @@ class Diagnosis: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "participant_id": not_none(get_target_id_from_record(Participant, record)), + "participant_id": not_none( + get_target_id_from_record(Participant, record) + ), "source_text_diagnosis": not_none(record[CONCEPT.DIAGNOSIS.NAME]), "age_at_event_days": flexible_age( record, @@ -254,18 +270,26 @@ def query_target_ids(cls, host, key_components): def build_entity(cls, record, get_target_id_from_record): secondary_components = { "kf_id": get_target_id_from_record(cls, record), - "source_text_tumor_location": record.get(CONCEPT.DIAGNOSIS.TUMOR_LOCATION), + "source_text_tumor_location": record.get( + CONCEPT.DIAGNOSIS.TUMOR_LOCATION + ), "mondo_id_diagnosis": record.get(CONCEPT.DIAGNOSIS.MONDO_ID), "icd_id_diagnosis": record.get(CONCEPT.DIAGNOSIS.ICD_ID), "uberon_id_tumor_location": record.get( CONCEPT.DIAGNOSIS.UBERON_TUMOR_LOCATION_ID ), "ncit_id_diagnosis": record.get(CONCEPT.DIAGNOSIS.NCIT_ID), - "spatial_descriptor": record.get(CONCEPT.DIAGNOSIS.SPATIAL_DESCRIPTOR), + "spatial_descriptor": record.get( + CONCEPT.DIAGNOSIS.SPATIAL_DESCRIPTOR + ), "diagnosis_category": record.get(CONCEPT.DIAGNOSIS.CATEGORY), "visible": record.get(CONCEPT.DIAGNOSIS.VISIBLE), - "visibility_comment": record.get(CONCEPT.DIAGNOSIS.VISIBILITY_COMMENT), - "visibility_reason": 
record.get(CONCEPT.DIAGNOSIS.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.DIAGNOSIS.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.DIAGNOSIS.VISIBILTIY_REASON + ), "external_id": record.get(CONCEPT.DIAGNOSIS.ID), } return { @@ -287,7 +311,9 @@ class Phenotype: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "participant_id": not_none(get_target_id_from_record(Participant, record)), + "participant_id": not_none( + get_target_id_from_record(Participant, record) + ), "source_text_phenotype": not_none(record[CONCEPT.PHENOTYPE.NAME]), "observed": not_none(record[CONCEPT.PHENOTYPE.OBSERVED]), "age_at_event_days": flexible_age( @@ -308,8 +334,12 @@ def build_entity(cls, record, get_target_id_from_record): "hpo_id_phenotype": record.get(CONCEPT.PHENOTYPE.HPO_ID), "snomed_id_phenotype": record.get(CONCEPT.PHENOTYPE.SNOMED_ID), "visible": record.get(CONCEPT.PHENOTYPE.VISIBLE), - "visibility_comment": record.get(CONCEPT.PHENOTYPE.VISIBILITY_COMMENT), - "visibility_reason": record.get(CONCEPT.PHENOTYPE.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.PHENOTYPE.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.PHENOTYPE.VISIBILTIY_REASON + ), "external_id": record.get(CONCEPT.PHENOTYPE.ID), } return { @@ -352,7 +382,9 @@ def get_key_components(cls, record, get_target_id_from_record): # Skip anything without a status, but don't consider it a key field. 
not_none(record[CONCEPT.OUTCOME.VITAL_STATUS]) return { - "participant_id": not_none(get_target_id_from_record(Participant, record)) + "participant_id": not_none( + get_target_id_from_record(Participant, record) + ) } @classmethod @@ -375,7 +407,9 @@ def build_entity(cls, record, get_target_id_from_record): "kf_id": get_target_id_from_record(cls, record), "disease_related": record.get(CONCEPT.OUTCOME.DISEASE_RELATED), "visible": record.get(CONCEPT.OUTCOME.VISIBLE), - "visibility_comment": record.get(CONCEPT.OUTCOME.VISIBILITY_COMMENT), + "visibility_comment": record.get( + CONCEPT.OUTCOME.VISIBILITY_COMMENT + ), "visibility_reason": record.get(CONCEPT.OUTCOME.VISIBILTIY_REASON), "age_at_event_days": flexible_age( record, @@ -399,7 +433,8 @@ class Sample: class_name = "sample" api_path = "samples" target_id_concept = ( - CONCEPT.SAMPLE.TARGET_SERVICE_ID or CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID + CONCEPT.SAMPLE.TARGET_SERVICE_ID + or CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID ) service_id_fields = {"kf_id", "participant_id"} @@ -438,8 +473,12 @@ def build_entity(cls, record, get_target_id_from_record): record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT) or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) ), - "participant_id": not_none(get_target_id_from_record(Participant, record)), - "preservation_method": record.get(CONCEPT.SAMPLE.PRESERVATION_METHOD), + "participant_id": not_none( + get_target_id_from_record(Participant, record) + ), + "preservation_method": record.get( + CONCEPT.SAMPLE.PRESERVATION_METHOD + ), "sample_event_key": record.get(CONCEPT.SAMPLE.EVENT_ID), "sample_type": ( record.get(CONCEPT.SAMPLE.COMPOSITION) @@ -491,7 +530,9 @@ def build_entity(cls, record, get_target_id_from_record): "sequencing_center_id": record.get( CONCEPT.SEQUENCING.CENTER.TARGET_SERVICE_ID ), - "participant_id": not_none(get_target_id_from_record(Participant, record)), + "participant_id": not_none( + get_target_id_from_record(Participant, record) + ), "external_sample_id": 
( record.get(CONCEPT.SAMPLE.ID) or record.get(CONCEPT.BIOSPECIMEN_GROUP.ID) @@ -558,8 +599,12 @@ def build_entity(cls, record, get_target_id_from_record): or record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) ), "visible": record.get(CONCEPT.BIOSPECIMEN.VISIBLE), - "visibility_comment": record.get(CONCEPT.BIOSPECIMEN.VISIBILITY_COMMENT), - "visibility_reason": record.get(CONCEPT.BIOSPECIMEN.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.BIOSPECIMEN.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.BIOSPECIMEN.VISIBILTIY_REASON + ), "method_of_sample_procurement": ( record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT) or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) @@ -629,17 +674,25 @@ def hashes(record): "urls": str_to_obj(record.get(CONCEPT.GENOMIC_FILE.URL_LIST)), "acl": [], "authz": str_to_obj(record.get(CONCEPT.GENOMIC_FILE.ACL)), - "reference_genome": record.get(CONCEPT.GENOMIC_FILE.REFERENCE_GENOME), + "reference_genome": record.get( + CONCEPT.GENOMIC_FILE.REFERENCE_GENOME + ), "worflow_type": record.get(CONCEPT.GENOMIC_FILE.WORKFLOW_TYPE), "worflow_tool": record.get(CONCEPT.GENOMIC_FILE.WORKFLOW_TOOL), - "workflow_version": record.get(CONCEPT.GENOMIC_FILE.WORKFLOW_VERSION), + "workflow_version": record.get( + CONCEPT.GENOMIC_FILE.WORKFLOW_VERSION + ), "data_category": record.get(CONCEPT.GENOMIC_FILE.DATA_CATEGORY), "file_version_descriptor": record.get( CONCEPT.GENOMIC_FILE.FILE_VERSION_DESCRIPTOR ), "visible": record.get(CONCEPT.GENOMIC_FILE.VISIBLE), - "visibility_comment": record.get(CONCEPT.GENOMIC_FILE.VISIBILITY_COMMENT), - "visibility_reason": record.get(CONCEPT.GENOMIC_FILE.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.GENOMIC_FILE.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.GENOMIC_FILE.VISIBILTIY_REASON + ), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -676,8 +729,12 @@ def build_entity(cls, record, get_target_id_from_record): "kf_id": 
get_target_id_from_record(cls, record), "quality_scale": record.get(CONCEPT.READ_GROUP.QUALITY_SCALE), "visible": record.get(CONCEPT.READ_GROUP.VISIBLE), - "visibility_comment": record.get(CONCEPT.READ_GROUP.VISIBILITY_COMMENT), - "visibility_reason": record.get(CONCEPT.READ_GROUP.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.READ_GROUP.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.READ_GROUP.VISIBILTIY_REASON + ), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -718,7 +775,9 @@ def build_entity(cls, record, get_target_id_from_record): "experiment_strategy": record.get(CONCEPT.SEQUENCING.STRATEGY), "library_strand": record.get(CONCEPT.SEQUENCING.LIBRARY_STRAND), "library_prep": record.get(CONCEPT.SEQUENCING.LIBRARY_PREP), - "library_selection": record.get(CONCEPT.SEQUENCING.LIBRARY_SELECTION), + "library_selection": record.get( + CONCEPT.SEQUENCING.LIBRARY_SELECTION + ), "is_paired_end": record.get(CONCEPT.SEQUENCING.PAIRED_END), "platform": record.get(CONCEPT.SEQUENCING.PLATFORM), "instrument_model": record.get(CONCEPT.SEQUENCING.INSTRUMENT), @@ -727,13 +786,23 @@ def build_entity(cls, record, get_target_id_from_record): "mean_depth": record.get(CONCEPT.SEQUENCING.MEAN_DEPTH), "total_reads": record.get(CONCEPT.SEQUENCING.TOTAL_READS), "mean_read_length": record.get(CONCEPT.SEQUENCING.MEAN_READ_LENGTH), - "target_capture_kit": record.get(CONCEPT.SEQUENCING.TARGET_CAPTURE_KIT), + "target_capture_kit": record.get( + CONCEPT.SEQUENCING.TARGET_CAPTURE_KIT + ), "read_pair_number": record.get(CONCEPT.SEQUENCING.READ_PAIR_NUMBER), - "is_adapter_trimmed": record.get(CONCEPT.SEQUENCING.IS_ADAPTER_TRIMMED), - "adapter_sequencing": record.get(CONCEPT.SEQUENCING.ADAPTER_SEQUENCING), + "is_adapter_trimmed": record.get( + CONCEPT.SEQUENCING.IS_ADAPTER_TRIMMED + ), + "adapter_sequencing": record.get( + CONCEPT.SEQUENCING.ADAPTER_SEQUENCING + ), "visible": record.get(CONCEPT.SEQUENCING.VISIBLE), - 
"visibility_comment": record.get(CONCEPT.SEQUENCING.VISIBILITY_COMMENT), - "visibility_reason": record.get(CONCEPT.SEQUENCING.VISIBILTIY_REASON), + "visibility_comment": record.get( + CONCEPT.SEQUENCING.VISIBILITY_COMMENT + ), + "visibility_reason": record.get( + CONCEPT.SEQUENCING.VISIBILTIY_REASON + ), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -757,9 +826,13 @@ def transform_records_list(cls, records_list): original = DataFrame(records_list) # Convert participant, mother, father to generic family relationships - df = convert_relationships_to_p1p2(original, infer_genders=True, bidirect=True) + df = convert_relationships_to_p1p2( + original, infer_genders=True, bidirect=True + ) # Add back in the visibility info from original df - vis_df = original[[FR.PERSON1.ID, FR.VISIBILTIY_REASON, FR.VISIBILITY_COMMENT]] + vis_df = original[ + [FR.PERSON1.ID, FR.VISIBILTIY_REASON, FR.VISIBILITY_COMMENT] + ] df = merge(df, vis_df, how="left", on=FR.PERSON1.ID) return df.to_dict("records") @@ -829,8 +902,12 @@ class BiospecimenGenomicFile: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "biospecimen_id": not_none(get_target_id_from_record(Biospecimen, record)), - "genomic_file_id": not_none(get_target_id_from_record(GenomicFile, record)), + "biospecimen_id": not_none( + get_target_id_from_record(Biospecimen, record) + ), + "genomic_file_id": not_none( + get_target_id_from_record(GenomicFile, record) + ), } @classmethod @@ -871,8 +948,12 @@ class BiospecimenDiagnosis: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "biospecimen_id": not_none(get_target_id_from_record(Biospecimen, record)), - "diagnosis_id": not_none(get_target_id_from_record(Diagnosis, record)), + "biospecimen_id": not_none( + get_target_id_from_record(Biospecimen, record) + ), + "diagnosis_id": not_none( + get_target_id_from_record(Diagnosis, record) + ), } @classmethod @@ -913,8 +994,12 @@ class 
ReadGroupGenomicFile: @classmethod def get_key_components(cls, record, get_target_id_from_record): return { - "read_group_id": not_none(get_target_id_from_record(ReadGroup, record)), - "genomic_file_id": not_none(get_target_id_from_record(GenomicFile, record)), + "read_group_id": not_none( + get_target_id_from_record(ReadGroup, record) + ), + "genomic_file_id": not_none( + get_target_id_from_record(GenomicFile, record) + ), } @classmethod @@ -958,7 +1043,9 @@ def get_key_components(cls, record, get_target_id_from_record): "sequencing_experiment_id": not_none( get_target_id_from_record(SequencingExperiment, record) ), - "genomic_file_id": not_none(get_target_id_from_record(GenomicFile, record)), + "genomic_file_id": not_none( + get_target_id_from_record(GenomicFile, record) + ), } @classmethod @@ -1087,7 +1174,9 @@ def coerce_types(host, entity_class, body): seen_overmax_int[entity_class.class_name] = set() if k not in seen_overmax_int[entity_class.class_name]: try: - max_value = (2 ** (int(properties[k]["format"][-2:]) - 1)) - 1 + max_value = ( + 2 ** (int(properties[k]["format"][-2:]) - 1) + ) - 1 if v > max_value: logger.info( f"The server indicates that {entity_class.class_name}" @@ -1151,8 +1240,12 @@ def submit(host, entity_class, body): resp = _GET(host, api_path, body) result = resp.json()["results"][0] if extid != result["external_id"]: - resp = _PATCH(host, api_path, result["kf_id"], {"external_id": extid}) + resp = _PATCH( + host, api_path, result["kf_id"], {"external_id": extid} + ) result = resp.json()["results"] return result["kf_id"] else: - raise RequestException(f"Sent to /{api_path}:\n{body}\nGot:\n{resp.text}") + raise RequestException( + f"Sent to /{api_path}:\n{body}\nGot:\n{resp.text}" + ) From 32069405f9368d431bf5b415dbe8a995b1b404f2 Mon Sep 17 00:00:00 2001 From: chris-s-friedman Date: Thu, 30 May 2024 15:16:49 -0400 Subject: [PATCH 6/6] =?UTF-8?q?=F0=9F=93=9D=20Document=20why=20biospecimen?= =?UTF-8?q?=20group=20is=20still=20around?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kf_lib_data_ingest/common/concept_schema.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kf_lib_data_ingest/common/concept_schema.py b/kf_lib_data_ingest/common/concept_schema.py index 3370489e..f46eb2ba 100644 --- a/kf_lib_data_ingest/common/concept_schema.py +++ b/kf_lib_data_ingest/common/concept_schema.py @@ -149,6 +149,16 @@ class BIOSPECIMEN_GROUP(PropertyMixin): pass class SAMPLE(PropertyMixin): + """Sample + + `BIOSPECIMEN_GROUP` is a previously existing concept used in the ingest + library. It is being replaced by `SAMPLE` to better reflect that + BIOSPECIMENS can be organized within multiple hierarchical groups with + relationships to one another. The `BIOSPECIMEN_GROUP` is still in the + ingest library to support historical ingest packages. It is recommended + to use `SAMPLE` for new ingest packages. + """ + TISSUE_TYPE = None NCIT_TISSUE_TYPE_ID = None ANATOMY_SITE = None