diff --git a/docs/source/design/concepts/index.rst b/docs/source/design/concepts/index.rst new file mode 100644 index 00000000..1d23deff --- /dev/null +++ b/docs/source/design/concepts/index.rst @@ -0,0 +1,19 @@ +.. _about_concepts: + +============== +Concept Schema +============== + +A key part of the ingest library is the intermediate model that raw data is +mapped to. The intermediate model is called the "concept schema". The concept +schema is essentially just a list of column names that follow a standard that +denotes the type/concept and the attribute (e.g. ``PARTICIPANT.GENDER``). Data +that has been mapped to the concept schema is later transformed into the final +schema that is used to create the tables in the target API. + + + +.. toctree:: + :maxdepth: 2 + + samples_and_specimens.rst diff --git a/docs/source/design/concepts/samples_and_specimens.rst b/docs/source/design/concepts/samples_and_specimens.rst new file mode 100644 index 00000000..f90408db --- /dev/null +++ b/docs/source/design/concepts/samples_and_specimens.rst @@ -0,0 +1,50 @@ +.. _samples_and_specimens: + +======================== +Samples and Biospecimens +======================== + +Although similarly named, samples and biospecimens refer to different concepts: + + * A **sample** represents a physical piece of tissue, blood, or other + biologically distinct material taken from a patient. + * A **biospecimen** represents a portion or a part of that sample, e.g. + an aliquot of a sample. + +While samples and biospecimens are distinct concepts, they share much in +common. In fact, when the ingest library was first written, its primary target +API, the Kids First Data Service, only had a table for biospecimens. As a +result, the ingest library's architecture provides for a biospecimen to share +*all* the qualities of a sample. In fact, biospecimen is a child class of +sample! 
+ +This architecture allows the ingest library to be used against target APIs +that, like the older versions of the Kids First Data Service, only have a table +for biospecimens. + +A sample has the following qualities: + + * A sample may have information about itself, such as the type of tissue it + is, the type of tumor it comes from, when the sample was collected from + the participant, its volume, etc. + * A sample may have information about shipping, such as the date it was + shipped and its shipment origin. + + +As discussed above, a biospecimen is a child class of sample, so biospecimens +may have all of the same qualities as a sample. In addition: + + * A biospecimen may have information about its concentration. + * A biospecimen may have information about its analyte type (e.g. DNA vs + RNA). + * A biospecimen may have information about the consent under which it was + collected. + +Biospecimen is designed as a child class of sample to provide for +backwards-compatibility with older ingest packages that existed before the +sample concept. + +Moving forward, it is advised to use the sample class when +extracting information that is most related to the sample and use biospecimen +only when extracting information that is specific to the biospecimen +(such as concentration, analyte, and consent information). diff --git a/docs/source/design/overview.rst b/docs/source/design/overview.rst index d3cc1536..2cbd4e13 100644 --- a/docs/source/design/overview.rst +++ b/docs/source/design/overview.rst @@ -40,6 +40,7 @@ described. 
:maxdepth: 1 value_principles + concepts/index.rst extract_mapping transform load diff --git a/kf_lib_data_ingest/common/concept_schema.py b/kf_lib_data_ingest/common/concept_schema.py index 47195347..f46eb2ba 100644 --- a/kf_lib_data_ingest/common/concept_schema.py +++ b/kf_lib_data_ingest/common/concept_schema.py @@ -148,32 +148,49 @@ class EVENT_AGE(QuantityMixin): class BIOSPECIMEN_GROUP(PropertyMixin): pass - class BIOSPECIMEN(PropertyMixin): + class SAMPLE(PropertyMixin): + """Sample + + `BIOSPECIMEN_GROUP` is a previously existing concept used in the ingest + library. It is being replaced by `SAMPLE` to better reflect that + BIOSPECIMENS can be organized within multiple hierarchical groups with + relationships to one another. The `BIOSPECIMEN_GROUP` is still in the + ingest library to support historical ingest packages. It is recommended + to use `SAMPLE` for new ingest packages. + """ + TISSUE_TYPE = None NCIT_TISSUE_TYPE_ID = None ANATOMY_SITE = None NCIT_ANATOMY_SITE_ID = None UBERON_ANATOMY_SITE_ID = None - TUMOR_DESCRIPTOR = None COMPOSITION = None + TUMOR_DESCRIPTOR = None + EVENT_ID = None EVENT_AGE_DAYS = None class EVENT_AGE(QuantityMixin): pass + class VOLUME(QuantityMixin): + pass + + SPATIAL_DESCRIPTOR = None + SHIPMENT_ORIGIN = None + SHIPMENT_DATE = None + VOLUME_UL = None + SAMPLE_PROCUREMENT = None + PRESERVATION_METHOD = None + + class BIOSPECIMEN(SAMPLE): class QUANTITY(QuantityMixin): pass class CONCENTRATION(QuantityMixin): pass - SPATIAL_DESCRIPTOR = None - SHIPMENT_ORIGIN = None - SHIPMENT_DATE = None ANALYTE = None CONCENTRATION_MG_PER_ML = None - VOLUME_UL = None - SAMPLE_PROCUREMENT = None DBGAP_STYLE_CONSENT_CODE = None CONSENT_SHORT_NAME = None @@ -342,6 +359,7 @@ class C: CONCEPT.FAMILY_RELATIONSHIP.PERSON2, CONCEPT.BIOSPECIMEN_GROUP, CONCEPT.BIOSPECIMEN, + CONCEPT.SAMPLE, CONCEPT.DIAGNOSIS, CONCEPT.PHENOTYPE, CONCEPT.DIAGNOSIS, diff --git a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py 
b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py index edd093fe..26fbeb12 100644 --- a/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py +++ b/kf_lib_data_ingest/target_api_plugins/kids_first_dataservice.py @@ -429,6 +429,83 @@ def submit(cls, host, body): return submit(host, cls, body) +class Sample: + class_name = "sample" + api_path = "samples" + target_id_concept = ( + CONCEPT.SAMPLE.TARGET_SERVICE_ID + or CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID + ) + service_id_fields = {"kf_id", "participant_id"} + + @classmethod + def get_key_components(cls, record, get_target_id_from_record): + return { + "study_id": get_target_id_from_record(Study, record), + "external_id": not_none(record[CONCEPT.SAMPLE.ID]), + } + + @classmethod + def query_target_ids(cls, host, key_components): + return list(yield_kfids(host, cls.api_path, drop_none(key_components))) + + @classmethod + def build_entity(cls, record, get_target_id_from_record): + secondary_components = { + "kf_id": get_target_id_from_record(cls, record), + "age_at_event_days": ( + flexible_age( + record, + CONCEPT.SAMPLE.EVENT_AGE_DAYS, + CONCEPT.SAMPLE.EVENT_AGE, + ) + or flexible_age( + record, + CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, + CONCEPT.BIOSPECIMEN.EVENT_AGE, + ) + ), + "anatomical_location": ( + record.get(CONCEPT.SAMPLE.ANATOMY_SITE) + or record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE) + ), + "method_of_sample_procurement": ( + record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT) + or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) + ), + "participant_id": not_none( + get_target_id_from_record(Participant, record) + ), + "preservation_method": record.get( + CONCEPT.SAMPLE.PRESERVATION_METHOD + ), + "sample_event_key": record.get(CONCEPT.SAMPLE.EVENT_ID), + "sample_type": ( + record.get(CONCEPT.SAMPLE.COMPOSITION) + or record.get(CONCEPT.BIOSPECIMEN.COMPOSITION) + ), + "tissue_type": ( + record.get(CONCEPT.SAMPLE.TISSUE_TYPE) + or record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE) + ), + 
"visible": record.get(CONCEPT.SAMPLE.VISIBLE), + "visibility_comment": record.get(CONCEPT.SAMPLE.VISIBILITY_COMMENT), + "visibility_reason": record.get(CONCEPT.SAMPLE.VISIBILTIY_REASON), + "volume_ul": ( + record.get(CONCEPT.SAMPLE.VOLUME_UL) + or record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) + ), + } + return { + **cls.get_key_components(record, get_target_id_from_record), + **secondary_components, + } + + @classmethod + def submit(cls, host, body): + return submit(host, cls, body) + + class Biospecimen: class_name = "biospecimen" api_path = "biospecimens" @@ -457,43 +534,70 @@ def build_entity(cls, record, get_target_id_from_record): get_target_id_from_record(Participant, record) ), "external_sample_id": ( - record.get(CONCEPT.BIOSPECIMEN_GROUP.ID) + record.get(CONCEPT.SAMPLE.ID) + or record.get(CONCEPT.BIOSPECIMEN_GROUP.ID) or not_none(record[CONCEPT.BIOSPECIMEN.ID]) ), - "source_text_tissue_type": record.get( - CONCEPT.BIOSPECIMEN.TISSUE_TYPE + "source_text_tissue_type": ( + record.get(CONCEPT.SAMPLE.TISSUE_TYPE) + or record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE) ), - "composition": record.get(CONCEPT.BIOSPECIMEN.COMPOSITION), - "source_text_anatomical_site": record.get( - CONCEPT.BIOSPECIMEN.ANATOMY_SITE + "composition": ( + record.get(CONCEPT.SAMPLE.COMPOSITION) + or record.get(CONCEPT.BIOSPECIMEN.COMPOSITION) ), - "age_at_event_days": flexible_age( - record, - CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, - CONCEPT.BIOSPECIMEN.EVENT_AGE, + "source_text_anatomical_site": ( + record.get(CONCEPT.SAMPLE.ANATOMY_SITE) + or record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE) + ), + "age_at_event_days": ( + flexible_age( + record, + CONCEPT.SAMPLE.EVENT_AGE_DAYS, + CONCEPT.SAMPLE.EVENT_AGE, + ) + or flexible_age( + record, + CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS, + CONCEPT.BIOSPECIMEN.EVENT_AGE, + ) ), - "source_text_tumor_descriptor": record.get( - CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR + "source_text_tumor_descriptor": ( + record.get(CONCEPT.SAMPLE.TUMOR_DESCRIPTOR) + or 
record.get(CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR) ), - "ncit_id_tissue_type": record.get( - CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID + "ncit_id_tissue_type": ( + record.get(CONCEPT.SAMPLE.NCIT_TISSUE_TYPE_ID) + or record.get(CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID) ), - "ncit_id_anatomical_site": record.get( - CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID + "ncit_id_anatomical_site": ( + record.get(CONCEPT.SAMPLE.NCIT_ANATOMY_SITE_ID) + or record.get(CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID) ), - "uberon_id_anatomical_site": record.get( - CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID + "uberon_id_anatomical_site": ( + record.get(CONCEPT.SAMPLE.UBERON_ANATOMY_SITE_ID) + or record.get(CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID) ), - "spatial_descriptor": record.get( - CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR + "spatial_descriptor": ( + record.get(CONCEPT.SAMPLE.SPATIAL_DESCRIPTOR) + or record.get(CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR) + ), + "shipment_origin": ( + record.get(CONCEPT.SAMPLE.SHIPMENT_ORIGIN) + or record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_ORIGIN) + ), + "shipment_date": ( + record.get(CONCEPT.SAMPLE.SHIPMENT_DATE) + or record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_DATE) ), - "shipment_origin": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_ORIGIN), - "shipment_date": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_DATE), "analyte_type": record.get(CONCEPT.BIOSPECIMEN.ANALYTE), "concentration_mg_per_ml": record.get( CONCEPT.BIOSPECIMEN.CONCENTRATION_MG_PER_ML ), - "volume_ul": record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL), + "volume_ul": ( + record.get(CONCEPT.SAMPLE.VOLUME_UL) + or record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) + ), "visible": record.get(CONCEPT.BIOSPECIMEN.VISIBLE), "visibility_comment": record.get( CONCEPT.BIOSPECIMEN.VISIBILITY_COMMENT @@ -501,13 +605,15 @@ def build_entity(cls, record, get_target_id_from_record): "visibility_reason": record.get( CONCEPT.BIOSPECIMEN.VISIBILTIY_REASON ), - "method_of_sample_procurement": record.get( - CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT + 
"method_of_sample_procurement": ( + record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT) + or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT) ), "dbgap_consent_code": record.get( CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE ), "consent_type": record.get(CONCEPT.BIOSPECIMEN.CONSENT_SHORT_NAME), + "sample_id": get_target_id_from_record(Sample, record), } return { **cls.get_key_components(record, get_target_id_from_record), @@ -1003,6 +1109,7 @@ def _GET(host, api_path, body): Diagnosis, Phenotype, Outcome, + Sample, Biospecimen, GenomicFile, ReadGroup,