From 34b16d80431056bdc0c066ff680ce339312ac784 Mon Sep 17 00:00:00 2001 From: Chao Pang Date: Thu, 12 Sep 2024 10:08:07 -0400 Subject: [PATCH] added patient_id to the pretraining data for debugging --- src/cehrbert/data_generators/hf_data_generator/hf_dataset.py | 1 + .../data_generators/hf_data_generator/hf_dataset_mapping.py | 2 +- src/cehrbert/utils/stat_utils.py | 0 3 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 src/cehrbert/utils/stat_utils.py diff --git a/src/cehrbert/data_generators/hf_data_generator/hf_dataset.py b/src/cehrbert/data_generators/hf_data_generator/hf_dataset.py index 6212a522..e06d1176 100644 --- a/src/cehrbert/data_generators/hf_data_generator/hf_dataset.py +++ b/src/cehrbert/data_generators/hf_data_generator/hf_dataset.py @@ -12,6 +12,7 @@ from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments CEHRBERT_COLUMNS = [ + "person_id", "concept_ids", "ages", "dates", diff --git a/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py b/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py index f5cf17d0..f3377e9c 100644 --- a/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py +++ b/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py @@ -115,7 +115,7 @@ def __init__(self, data_args: DataTrainingArguments, is_pretraining: bool = True def remove_columns(self): if self._is_pretraining: - return ["visits", "patient_id", "birth_datetime", "index_date"] + return ["visits", "birth_datetime", "index_date"] else: return [ "visits", diff --git a/src/cehrbert/utils/stat_utils.py b/src/cehrbert/utils/stat_utils.py new file mode 100644 index 00000000..e69de29b