From 36703c7957d56542237dda43e74603c62e07e81a Mon Sep 17 00:00:00 2001
From: chris-s-friedman
Date: Fri, 31 May 2024 12:12:22 -0400
Subject: [PATCH] =?UTF-8?q?=E2=9C=85=20Add=20sample=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../data/sample_manifest.csv                  | 13 +++++
 .../extract_configs/sample.py                 | 53 +++++++++++++++++++
 .../extract_configs/sample_relationship.py    | 25 +++++++++
 .../ingest_package_config.py                  | 11 ++++
 .../sample_registration/transform_module.py   | 23 ++++++++
 5 files changed, 125 insertions(+)
 create mode 100644 tests/data/sample_registration/data/sample_manifest.csv
 create mode 100644 tests/data/sample_registration/extract_configs/sample.py
 create mode 100644 tests/data/sample_registration/extract_configs/sample_relationship.py
 create mode 100644 tests/data/sample_registration/ingest_package_config.py
 create mode 100644 tests/data/sample_registration/transform_module.py

diff --git a/tests/data/sample_registration/data/sample_manifest.csv b/tests/data/sample_registration/data/sample_manifest.csv
new file mode 100644
index 00000000..491a93f2
--- /dev/null
+++ b/tests/data/sample_registration/data/sample_manifest.csv
@@ -0,0 +1,13 @@
+sample_id,aliquot_id,participant,tissue_type,composition,age_at_collection,analyte,parent_sample
+s001,a001,p01,tumor,tissue,32,DNA,na
+s001,a002,p01,tumor,tissue,32,DNA,na
+s001,a003,p01,tumor,tissue,32,DNA,na
+s001,a004,p01,tumor,tissue,32,RNA,na
+s002,a005,p01,normal,blood,42,DNA,na
+s002,a006,p01,normal,blood,42,DNA,na
+s003,a007,p01,tumor,tissue,32,RNA,s001
+s003,a008,p01,tumor,tissue,32,DNA,s001
+s004,na,p01,tumor,tissue,32,na,s003
+s005,a009,p02,tumor,bone,56,DNA,na
+s006,a010,p02,normal,blood,58,DNA,na
+s007,na,p03,tumor,tissue,61,NA,na
\ No newline at end of file
diff --git a/tests/data/sample_registration/extract_configs/sample.py b/tests/data/sample_registration/extract_configs/sample.py
new file mode 100644
index 00000000..594e2f2a
--- /dev/null
+++ b/tests/data/sample_registration/extract_configs/sample.py
@@ -0,0 +1,53 @@
+"""
+Auto-generated extract config module.
+
+See documentation at
+https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
+information on writing extract config files.
+"""
+
+from kf_lib_data_ingest.common import constants
+from kf_lib_data_ingest.etl.extract.operations import keep_map, value_map
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+
+# TODO - Replace this with a URL to your own data file
+source_data_url = "file://../data/sample_manifest.csv"
+
+# TODO - Replace this with operations that make sense for your own data file
+operations = [
+    keep_map(in_col="participant", out_col=CONCEPT.PARTICIPANT.ID),
+    keep_map(in_col="sample_id", out_col=CONCEPT.SAMPLE.ID),
+    value_map(
+        in_col="aliquot_id",
+        m=lambda x: None if x == "na" else x,
+        out_col=CONCEPT.BIOSPECIMEN.ID,
+    ),
+    value_map(
+        in_col="tissue_type",
+        m={
+            "tumor": constants.SPECIMEN.TISSUE_TYPE.TUMOR,
+            "normal": constants.SPECIMEN.TISSUE_TYPE.NORMAL,
+        },
+        out_col=CONCEPT.SAMPLE.TISSUE_TYPE,
+    ),
+    value_map(
+        in_col="composition",
+        m={
+            "tissue": constants.SPECIMEN.COMPOSITION.TISSUE,
+            "blood": constants.SPECIMEN.COMPOSITION.BLOOD,
+            "bone": constants.SPECIMEN.COMPOSITION.BONE,
+        },
+        out_col=CONCEPT.SAMPLE.COMPOSITION,
+    ),
+    keep_map(in_col="age_at_collection", out_col=CONCEPT.SAMPLE.EVENT_AGE_DAYS),
+    value_map(
+        in_col="analyte",
+        m={
+            "DNA": constants.SEQUENCING.ANALYTE.DNA,
+            "RNA": constants.SEQUENCING.ANALYTE.RNA,
+            "NA": None,
+            "na": None,
+        },
+        out_col=CONCEPT.BIOSPECIMEN.ANALYTE,
+    ),
+]
diff --git a/tests/data/sample_registration/extract_configs/sample_relationship.py b/tests/data/sample_registration/extract_configs/sample_relationship.py
new file mode 100644
index 00000000..761ab1f3
--- /dev/null
+++ b/tests/data/sample_registration/extract_configs/sample_relationship.py
@@ -0,0 +1,25 @@
+"""
+Auto-generated extract config module.
+
+See documentation at
+https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
+information on writing extract config files.
+"""
+
+from kf_lib_data_ingest.common import constants
+from kf_lib_data_ingest.etl.extract.operations import keep_map, value_map
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+
+# TODO - Replace this with a URL to your own data file
+source_data_url = "file://../data/sample_manifest.csv"
+
+# TODO - Replace this with operations that make sense for your own data file
+operations = [
+    keep_map(in_col="participant", out_col=CONCEPT.PARTICIPANT.ID),
+    keep_map(in_col="sample_id", out_col=CONCEPT.SAMPLE.ID),
+    value_map(
+        in_col="parent_sample",
+        m=lambda x: None if x == "na" else x,
+        out_col=CONCEPT.BIOSPECIMEN.ID,
+    ),
+]
diff --git a/tests/data/sample_registration/ingest_package_config.py b/tests/data/sample_registration/ingest_package_config.py
new file mode 100644
index 00000000..a693358f
--- /dev/null
+++ b/tests/data/sample_registration/ingest_package_config.py
@@ -0,0 +1,11 @@
+""" Ingest Package Config """
+
+# The list of entities that will be loaded into the target service
+target_service_entities = ["sample", "biospecimen", "sample_relationship"]
+
+# All paths are relative to the directory this file is in
+extract_config_dir = "extract_configs"
+
+transform_function_path = "transform_module.py"
+
+study = "SD_ME0WME0W"
diff --git a/tests/data/sample_registration/transform_module.py b/tests/data/sample_registration/transform_module.py
new file mode 100644
index 00000000..4d7586d9
--- /dev/null
+++ b/tests/data/sample_registration/transform_module.py
@@ -0,0 +1,23 @@
+"""
+Auto-generated transform module
+
+Replace the contents of transform_function with your own code
+
+See documentation at
+https://kids-first.github.io/kf-lib-data-ingest/ for information on
+implementing transform_function.
+"""
+
+import os
+
+# Use these merge funcs, not pandas.merge
+from kf_lib_data_ingest.common.pandas_utils import outer_merge
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+from kf_lib_data_ingest.config import DEFAULT_KEY
+
+
+def transform_function(mapped_df_dict):
+    sr = mapped_df_dict["sample_relationship.py"]
+    sr = sr.dropna()
+
+    return {DEFAULT_KEY: mapped_df_dict["sample.py"], "sample_relationship": sr}