Skip to content

Commit

Permalink
Used the existing subject_split to split the cohort automatically instead of doing it manually (#63)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChaoPang authored Oct 10, 2024
1 parent 1517c3d commit fa6d0ae
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/cehrbert/data_generators/hf_data_generator/meds_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,8 +436,12 @@ def _create_cehrbert_data_from_meds(
assert split in ["held_out", "train", "tuning"]
batches = []
if data_args.cohort_folder:
cohort = pd.read_parquet(os.path.join(os.path.expanduser(data_args.cohort_folder), split))
for cohort_row in cohort.itertuples():
# Load the entire cohort
cohort = pd.read_parquet(os.path.expanduser(data_args.cohort_folder))
patient_split = get_subject_split(os.path.expanduser(data_args.data_folder))
subject_ids = patient_split[split]
cohort_split = cohort[cohort.subject_id.isin(subject_ids)]
for cohort_row in cohort_split.itertuples():
subject_id = cohort_row.subject_id
prediction_time = cohort_row.prediction_time
label = int(cohort_row.boolean_value)
Expand Down

0 comments on commit fa6d0ae

Please sign in to comment.