kids-first · chris-s-friedman · May 30, 2024 · May 21, 2024 · May 21, 2024 · May 21, 2024
@@ -0,0 +1,19 @@
+.. _about_concepts:
+
+==============
+Concept Schema
+==============
+
+A key part of the ingest library is the intermediate model that raw data is
+mapped to. The intermediate model is called the "concept schema". The concept
+schema is essentially just a list of column names that follow a standard that
+denotes the type/concept and the attribute (e.g. `PARTICIPANT.GENDER`). Data
+that has been mapped to the concept schema is later transformed into the final
+schema that is used to create the tables in the target API.
+
+
+
+.. toctree::
+    :maxdepth: 2
+
+    samples_and_specimens.rst
@@ -0,0 +1,50 @@
+.. _samples_and_specimens:
+
+========================
+Samples and Biospecimens
+========================
+
+Although similarly named, samples and biospecimens refer to different concepts:
+
+    * A **sample** represents a physical piece of tissue, blood, or other
+      biologically distinct material taken from a patient.
+    * A **biospecimen** represents a portion or part of that sample, e.g. an
+      aliquot of a sample.
+
+While samples and biospecimens are distinct concepts, they share much in
+common. When the ingest library was first written, its primary target API, the
+Kids First Data Service, only had a table for biospecimens. As a result, the
+ingest library's architecture allows a biospecimen to share *all* the
+qualities of a sample. In fact, in the concept schema, biospecimen is a child
+class of sample!
+
+This architecture allows the ingest library to be used against target APIs
+that, like the older versions of the Kids First Data Service, only have a table
+for biospecimens.
+
+A sample has the following qualities:
+
+    * A sample may have information about itself, such as the type of tissue it
+      is, the type of tumor it comes from, when the sample was collected from
+      the participant, its volume, etc.
+    * A sample may have information about shipping, such as the date it was
+      shipped and its shipment origin.
+
+
+As discussed above, a biospecimen is a child class of sample, so biospecimens
+may have all of the same qualities as a sample. In addition:
+
+    * A biospecimen may have information about its concentration.
+    * A biospecimen may have information about its analyte type (e.g. DNA vs
+      RNA).
+    * A biospecimen may have information about the consent under which it was
+      collected.
+
+Biospecimen is designed as a child class of sample to provide for
+backwards-compatibility with older ingest packages that existed before the
+sample concept.
+
+Moving forward, it is advised to use the sample class when
+extracting information that is most related to the sample and use biospecimen
+only when extracting information that is specific to the biospecimen
+(such as concentration, analyte, and consent information).
@@ -40,6 +40,7 @@ described.
    :maxdepth: 1
 
    value_principles
+   concepts/index.rst
    extract_mapping
    transform
    load

@@ -148,32 +148,49 @@ class EVENT_AGE(QuantityMixin):
     class BIOSPECIMEN_GROUP(PropertyMixin):
         pass
 
-    class BIOSPECIMEN(PropertyMixin):
+    class SAMPLE(PropertyMixin):
+        """Sample
+
+        `BIOSPECIMEN_GROUP` is a previously existing concept used in the ingest
+        library. It is being replaced by `SAMPLE` to better reflect that
+        biospecimens can be organized within multiple hierarchical groups with
+        relationships to one another. `BIOSPECIMEN_GROUP` is still in the
+        ingest library to support historical ingest packages. It is recommended
+        to use `SAMPLE` for new ingest packages.
+        """
+
         TISSUE_TYPE = None
         NCIT_TISSUE_TYPE_ID = None
         ANATOMY_SITE = None
         NCIT_ANATOMY_SITE_ID = None
         UBERON_ANATOMY_SITE_ID = None
-        TUMOR_DESCRIPTOR = None
         COMPOSITION = None
+        TUMOR_DESCRIPTOR = None
+        EVENT_ID = None
         EVENT_AGE_DAYS = None
 
         class EVENT_AGE(QuantityMixin):
             pass
 
+        class VOLUME(QuantityMixin):
+            pass
+
+        SPATIAL_DESCRIPTOR = None
+        SHIPMENT_ORIGIN = None
+        SHIPMENT_DATE = None
+        VOLUME_UL = None
+        SAMPLE_PROCUREMENT = None
+        PRESERVATION_METHOD = None
+
+    class BIOSPECIMEN(SAMPLE):
         class QUANTITY(QuantityMixin):
             pass
 
         class CONCENTRATION(QuantityMixin):
             pass
 
-        SPATIAL_DESCRIPTOR = None
-        SHIPMENT_ORIGIN = None
-        SHIPMENT_DATE = None
         ANALYTE = None
         CONCENTRATION_MG_PER_ML = None
-        VOLUME_UL = None
-        SAMPLE_PROCUREMENT = None
         DBGAP_STYLE_CONSENT_CODE = None
         CONSENT_SHORT_NAME = None
 
@@ -342,6 +359,7 @@ class C:
     CONCEPT.FAMILY_RELATIONSHIP.PERSON2,
     CONCEPT.BIOSPECIMEN_GROUP,
     CONCEPT.BIOSPECIMEN,
+    CONCEPT.SAMPLE,
     CONCEPT.DIAGNOSIS,
     CONCEPT.PHENOTYPE,
     CONCEPT.DIAGNOSIS,

@@ -429,6 +429,83 @@ def submit(cls, host, body):
         return submit(host, cls, body)
 
 
+# Target-entity builder for the "samples" API path: turns a record into the
+# request body for a sample and delegates lookup/submission to module-level
+# helpers (yield_kfids, submit).
+class Sample:
+    class_name = "sample"
+    api_path = "samples"
+    # NOTE(review): this `or` is evaluated once, when the class body runs, so
+    # it cannot act as a per-record fallback. If
+    # CONCEPT.SAMPLE.TARGET_SERVICE_ID is always a non-empty value (TODO
+    # confirm), the BIOSPECIMEN_GROUP alternative is never selected.
+    target_id_concept = (
+        CONCEPT.SAMPLE.TARGET_SERVICE_ID
+        or CONCEPT.BIOSPECIMEN_GROUP.TARGET_SERVICE_ID
+    )
+    # Both of these keys are filled in build_entity from
+    # get_target_id_from_record(...).
+    service_id_fields = {"kf_id", "participant_id"}
+
+    # Return the identifying fields of a sample: the study's target ID and the
+    # sample's external ID. `record[CONCEPT.SAMPLE.ID]` is a subscript lookup,
+    # not `.get`, so the sample ID column has to be present in the record; the
+    # value is additionally passed through not_none (presumably rejecting
+    # None -- verify against the helper).
+    @classmethod
+    def get_key_components(cls, record, get_target_id_from_record):
+        return {
+            "study_id": get_target_id_from_record(Study, record),
+            "external_id": not_none(record[CONCEPT.SAMPLE.ID]),
+        }
+
+    # Return, as a list, whatever yield_kfids yields for this class's api_path
+    # and the key components after they have been passed through drop_none.
+    @classmethod
+    def query_target_ids(cls, host, key_components):
+        return list(yield_kfids(host, cls.api_path, drop_none(key_components)))
+
+    # Build the full request body: the key components merged with the
+    # remaining ("secondary") fields.
+    #
+    # For fields that exist on both concepts, the SAMPLE concept is read first
+    # and the BIOSPECIMEN concept is used as a fallback.
+    # NOTE(review): the fallback uses `or`, which falls through on ANY falsy
+    # value, not only on a missing one. A legitimate 0 (e.g. for
+    # "age_at_event_days" or "volume_ul") under the SAMPLE concept would be
+    # discarded in favour of the BIOSPECIMEN value -- confirm this is intended.
+    @classmethod
+    def build_entity(cls, record, get_target_id_from_record):
+        secondary_components = {
+            "kf_id": get_target_id_from_record(cls, record),
+            "age_at_event_days": (
+                flexible_age(
+                    record,
+                    CONCEPT.SAMPLE.EVENT_AGE_DAYS,
+                    CONCEPT.SAMPLE.EVENT_AGE,
+                )
+                or flexible_age(
+                    record,
+                    CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS,
+                    CONCEPT.BIOSPECIMEN.EVENT_AGE,
+                )
+            ),
+            "anatomical_location": (
+                record.get(CONCEPT.SAMPLE.ANATOMY_SITE)
+                or record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE)
+            ),
+            "method_of_sample_procurement": (
+                record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT)
+                or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT)
+            ),
+            # The participant's target ID is wrapped in not_none, unlike the
+            # optional fields around it.
+            "participant_id": not_none(
+                get_target_id_from_record(Participant, record)
+            ),
+            # These two fields are read from the SAMPLE concept only; no
+            # BIOSPECIMEN fallback is applied.
+            "preservation_method": record.get(
+                CONCEPT.SAMPLE.PRESERVATION_METHOD
+            ),
+            "sample_event_key": record.get(CONCEPT.SAMPLE.EVENT_ID),
+            "sample_type": (
+                record.get(CONCEPT.SAMPLE.COMPOSITION)
+                or record.get(CONCEPT.BIOSPECIMEN.COMPOSITION)
+            ),
+            "tissue_type": (
+                record.get(CONCEPT.SAMPLE.TISSUE_TYPE)
+                or record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE)
+            ),
+            "visible": record.get(CONCEPT.SAMPLE.VISIBLE),
+            "visibility_comment": record.get(CONCEPT.SAMPLE.VISIBILITY_COMMENT),
+            # The attribute spelling "VISIBILTIY" is the same one the
+            # Biospecimen builder uses; presumably it mirrors the attribute's
+            # definition, so do not "correct" it in this place alone.
+            "visibility_reason": record.get(CONCEPT.SAMPLE.VISIBILTIY_REASON),
+            "volume_ul": (
+                record.get(CONCEPT.SAMPLE.VOLUME_UL)
+                or record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL)
+            ),
+        }
+        return {
+            **cls.get_key_components(record, get_target_id_from_record),
+            **secondary_components,
+        }
+
+    # Inside this method the bare name `submit` resolves to the module-level
+    # function (class attributes are not in a method's name-lookup scope), so
+    # this delegates rather than recursing.
+    @classmethod
+    def submit(cls, host, body):
+        return submit(host, cls, body)
+
+
 class Biospecimen:
     class_name = "biospecimen"
     api_path = "biospecimens"
@@ -457,57 +534,86 @@ def build_entity(cls, record, get_target_id_from_record):
                 get_target_id_from_record(Participant, record)
             ),
             "external_sample_id": (
-                record.get(CONCEPT.BIOSPECIMEN_GROUP.ID)
+                record.get(CONCEPT.SAMPLE.ID)
+                or record.get(CONCEPT.BIOSPECIMEN_GROUP.ID)
                 or not_none(record[CONCEPT.BIOSPECIMEN.ID])
             ),
-            "source_text_tissue_type": record.get(
-                CONCEPT.BIOSPECIMEN.TISSUE_TYPE
+            "source_text_tissue_type": (
+                record.get(CONCEPT.SAMPLE.TISSUE_TYPE)
+                or record.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE)
             ),
-            "composition": record.get(CONCEPT.BIOSPECIMEN.COMPOSITION),
-            "source_text_anatomical_site": record.get(
-                CONCEPT.BIOSPECIMEN.ANATOMY_SITE
+            "composition": (
+                record.get(CONCEPT.SAMPLE.COMPOSITION)
+                or record.get(CONCEPT.BIOSPECIMEN.COMPOSITION)
             ),
-            "age_at_event_days": flexible_age(
-                record,
-                CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS,
-                CONCEPT.BIOSPECIMEN.EVENT_AGE,
+            "source_text_anatomical_site": (
+                record.get(CONCEPT.SAMPLE.ANATOMY_SITE)
+                or record.get(CONCEPT.BIOSPECIMEN.ANATOMY_SITE)
+            ),
+            "age_at_event_days": (
+                flexible_age(
+                    record,
+                    CONCEPT.SAMPLE.EVENT_AGE_DAYS,
+                    CONCEPT.SAMPLE.EVENT_AGE,
+                )
+                or flexible_age(
+                    record,
+                    CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS,
+                    CONCEPT.BIOSPECIMEN.EVENT_AGE,
+                )
             ),
-            "source_text_tumor_descriptor": record.get(
-                CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR
+            "source_text_tumor_descriptor": (
+                record.get(CONCEPT.SAMPLE.TUMOR_DESCRIPTOR)
+                or record.get(CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR)
             ),
-            "ncit_id_tissue_type": record.get(
-                CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID
+            "ncit_id_tissue_type": (
+                record.get(CONCEPT.SAMPLE.NCIT_TISSUE_TYPE_ID)
+                or record.get(CONCEPT.BIOSPECIMEN.NCIT_TISSUE_TYPE_ID)
             ),
-            "ncit_id_anatomical_site": record.get(
-                CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID
+            "ncit_id_anatomical_site": (
+                record.get(CONCEPT.SAMPLE.NCIT_ANATOMY_SITE_ID)
+                or record.get(CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID)
             ),
-            "uberon_id_anatomical_site": record.get(
-                CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID
+            "uberon_id_anatomical_site": (
+                record.get(CONCEPT.SAMPLE.UBERON_ANATOMY_SITE_ID)
+                or record.get(CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID)
             ),
-            "spatial_descriptor": record.get(
-                CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR
+            "spatial_descriptor": (
+                record.get(CONCEPT.SAMPLE.SPATIAL_DESCRIPTOR)
+                or record.get(CONCEPT.BIOSPECIMEN.SPATIAL_DESCRIPTOR)
+            ),
+            "shipment_origin": (
+                record.get(CONCEPT.SAMPLE.SHIPMENT_ORIGIN)
+                or record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_ORIGIN)
+            ),
+            "shipment_date": (
+                record.get(CONCEPT.SAMPLE.SHIPMENT_DATE)
+                or record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_DATE)
             ),
-            "shipment_origin": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_ORIGIN),
-            "shipment_date": record.get(CONCEPT.BIOSPECIMEN.SHIPMENT_DATE),
             "analyte_type": record.get(CONCEPT.BIOSPECIMEN.ANALYTE),
             "concentration_mg_per_ml": record.get(
                 CONCEPT.BIOSPECIMEN.CONCENTRATION_MG_PER_ML
             ),
-            "volume_ul": record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL),
+            "volume_ul": (
+                record.get(CONCEPT.SAMPLE.VOLUME_UL)
+                or record.get(CONCEPT.BIOSPECIMEN.VOLUME_UL)
+            ),
             "visible": record.get(CONCEPT.BIOSPECIMEN.VISIBLE),
             "visibility_comment": record.get(
                 CONCEPT.BIOSPECIMEN.VISIBILITY_COMMENT
             ),
             "visibility_reason": record.get(
                 CONCEPT.BIOSPECIMEN.VISIBILTIY_REASON
             ),
-            "method_of_sample_procurement": record.get(
-                CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT
+            "method_of_sample_procurement": (
+                record.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT)
+                or record.get(CONCEPT.BIOSPECIMEN.SAMPLE_PROCUREMENT)
             ),
             "dbgap_consent_code": record.get(
                 CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE
             ),
             "consent_type": record.get(CONCEPT.BIOSPECIMEN.CONSENT_SHORT_NAME),
+            "sample_id": get_target_id_from_record(Sample, record),
         }
         return {
             **cls.get_key_components(record, get_target_id_from_record),
@@ -1003,6 +1109,7 @@ def _GET(host, api_path, body):
     Diagnosis,
     Phenotype,
     Outcome,
+    Sample,
     Biospecimen,
     GenomicFile,
     ReadGroup,