-
Notifications
You must be signed in to change notification settings - Fork 0
/
MIMIC-IV-Ext-Creation.py
456 lines (329 loc) · 17 KB
/
MIMIC-IV-Ext-Creation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
## PREPROCESSING MIMIC_IV
## import libraries
import pandas as pd
import numpy as np
from io import StringIO
from tqdm import tqdm
import re
## Import datasets from MIMIC-IV, MIMIC-IV-ED, MIMIC-IV-Note
## MIMIC-IV-ED tables (rows pandas cannot parse are skipped for triage, vitalsign and diagnosis)
triage = pd.read_csv(
    "/mimic-iv-ed-2.2/ed/triage.csv",
    on_bad_lines='skip',
    low_memory=False,
)
vitalsigns = pd.read_csv(
    "/mimic-iv-ed-2.2/ed/vitalsign.csv",
    on_bad_lines='skip',
    low_memory=False,
)
ed_stays = pd.read_csv("/mimic-iv-ed-2.2/ed/edstays.csv")
diagnostics = pd.read_csv(
    '/mimic-iv-ed-2.2/ed/diagnosis.csv',
    on_bad_lines='skip',
)
## MIMIC-IV core table (gzip-compressed patients file)
patients = pd.read_csv(
    "/mimic-iv/mimic-iv-3.0/hosp/patients.csv.gz",
    compression='gzip',
    low_memory=False,
)
## Load Discharge from MIMIC-IV-Note
# The raw file is rewritten as text before pandas parses it with '|' as the
# record terminator (see the read_csv call at the end of this section).
# Read the whole file through a context manager so the handle is closed
# as soon as the content is in memory.
with open('/mimic-iv-note/discharge.csv') as discharge_file:
    txt = discharge_file.read()
# Existing '|' characters become ',<vl>' so that '|' can serve as the record terminator below
txt = txt.replace('|', ',<vl>')
# ',""""\n' is rewritten to ',<br>' (presumably the opening of a quoted note body - verify against the raw file)
txt = txt.replace(',""""\n', ',<br>')
# The closing 'Followup Instructions:\n___\n""""' gets a '</br>' marker plus the '|' record terminator
txt = txt.replace('Followup Instructions:\n___\n""""','Followup Instructions:\n___\n</br>|')
# Between <br> and </br>, mask every ',' as '<comma>' so it is not read as a field separator.
# The pattern only matches when the enclosed text contains no '<' character.
txt = re.sub(r'<br>([^<]*)</br>', lambda x: x.group(0).replace(',', '<comma>'), txt)
# Remove all occurrences of double quotes '"' from the text
txt = txt.replace('"', '')
# Replace 'text\n' with 'text|' (presumably so the header row ends with the same terminator - verify)
txt = txt.replace('text\n', 'text|')
# Parse the rewritten content as CSV with '|' as the line terminator; bad lines only raise a warning
df = pd.read_csv(StringIO(txt), lineterminator='|', on_bad_lines='warn')
## merge different datasets
## Add "stay_id" from the edstays dataset by matching on hadm_id
# Build one hadm_id -> stay_id lookup instead of rescanning ed_stays for every
# note. Keeping the first row per hadm_id reproduces the previous `.iloc[0]` pick.
stay_by_hadm = (
    ed_stays.dropna(subset=['hadm_id'])
    .drop_duplicates(subset=['hadm_id'], keep='first')
    .set_index('hadm_id')['stay_id']
)
# hadm_id may contain non-numeric residue from the text parsing above; coerce
# those values to NaN (they can never match) rather than raising per row.
hadm_numeric = pd.to_numeric(df['hadm_id'], errors='coerce').astype('float64')
unparsable = hadm_numeric.isna() & df['hadm_id'].notna()
if unparsable.any():
    print(f"{int(unparsable.sum())} rows have a non-numeric hadm_id and were not matched")
# Always creates the column, so the filter below works even when nothing matches
df['stay_id'] = hadm_numeric.map(stay_by_hadm)
# Keep only the notes that could be linked to an ED stay
df = df[df['stay_id'].notnull()]
## Add all columns from Triage dataset
# triage is the left frame of the inner join because it is unique on stay_id
df = triage.merge(df, how="inner", on="stay_id")
## Add the edstays columns (gender, race, ...) for each stay
df = df.merge(ed_stays, on='stay_id')
# Keep a single row per subject_id_x (the first one encountered)
unique_df = df.drop_duplicates(subset=['subject_id_x'])
## Add the patients columns (age, ...) by subject_id
unique_df = unique_df.merge(patients, on='subject_id')
## Extract Relevant Information from the Clinical Text
## EXTRACT: Tests
def get_tests(text):
    """Return the lower-cased "pertinent results" section of a discharge note.

    The section runs from the first "pertinent results:" header up to the
    next "brief hospital course:" header (or the next "pertinent results:"
    header, whichever comes first) and is cut at the first "discharge labs"
    mention when one is present. Matching is case-insensitive because the
    note is lower-cased first.

    Returns None when *text* is not a string or has no "pertinent results:"
    header.
    """
    if not isinstance(text, str):
        return None
    pieces = text.lower().split("pertinent results:")
    if len(pieces) < 2:
        return None
    section = pieces[1].split('brief hospital course:')[0]
    # split()[0] leaves the section untouched when "discharge labs" is absent
    return section.split('discharge labs')[0]
# One "tests" value per note; None where the section could not be found
unique_df["tests"] = unique_df["text"].map(get_tests)
## EXTRACT: Past medication
def get_medication(text):
    """Return the lower-cased admission-medication section of a discharge note.

    The section is the text after the first "medications on admission:"
    header, up to the next "discharge medications:" header (or the next
    "medications on admission:" header, whichever comes first). Matching is
    case-insensitive because the note is lower-cased first.

    Returns None when *text* is not a string or the admission header is
    missing.
    """
    if not isinstance(text, str):
        return None
    pieces = text.lower().split("medications on admission:")
    if len(pieces) < 2:
        return None
    return pieces[1].split('discharge medications:')[0]
# One "past_medication" value per note; None where the section could not be found
unique_df["past_medication"] = unique_df["text"].map(get_medication)
## EXTRACT: History of Present Illness (to be continued and refined later on in the code)
def get_HPI(text):
    """Return the raw text between the HPI header and the physical exam.

    The parsing placeholders are undone first ('<comma>' -> ',', '<br>' and
    '</br>' removed). The result is whatever follows the last
    'History of Present Illness:' header, cut at the first 'Physical Exam:'
    header after it. Both headers are matched case-sensitively; a missing
    header simply leaves that side of the text uncut.
    """
    for placeholder, restored in (('<comma>', ','), ('<br>', ''), ('</br>', '')):
        text = text.replace(placeholder, restored)
    after_header = text.rpartition('History of Present Illness:')[2]
    return after_header.partition('Physical Exam:')[0]
unique_df['preprocessed_text'] = unique_df['text'].map(get_HPI)
## Cleaning and organizing the Dataframe for clarity
## Drop the redundant suffixed merge columns, then rename the remaining ones for consistency
unique_df = (
    unique_df
    .drop(columns=['subject_id_x', 'subject_id_y', 'hadm_id_x', 'gender_y'])
    .rename(columns={
        'hadm_id_y': 'hadm_id',
        'gender_x': 'gender',
        'symptoms': 'preprocessed_text'
    })
)
df = unique_df.copy()
## Merge ICD information from diagnostics dataset
# Keep only the diagnostics rows whose "seq_num" equals 1
diagnostics = diagnostics.loc[diagnostics["seq_num"] == 1]
# Left-join 'icd_code', 'icd_title' and 'icd_version' onto df,
# then discard the rows that did not receive an 'icd_code'
df = df.merge(
    diagnostics[['stay_id', 'icd_code', 'icd_title', "icd_version"]],
    how='left',
    on='stay_id',
).dropna(subset=['icd_code'])
# Drop columns that are not used by the remaining processing steps
df = df.drop(columns=[
    "note_id", "note_type", "note_seq", "charttime", "storetime",
    "intime", "outtime", "arrival_transport", "disposition",
    "anchor_year", "anchor_year_group", "dod",
])
## Create Initial Vitals from Temperature, Heartrate, respiration rate, o2 saturation, bloodpressure (dbp, sbp)
def create_vitals(row):
    """Format the non-missing triage vitals of *row* as one comma-separated string.

    Each available value is rendered as "<label>: <value>" in the fixed order
    temperature, heartrate, resprate, o2sat, sbp, dbp; values for which
    pd.isna is true are left out. Returns "" when every vital is missing.
    """
    labelled_columns = (
        ("Temperature", "temperature"),
        ("Heartrate", "heartrate"),
        ("resprate", "resprate"),
        ("o2sat", "o2sat"),
        ("sbp", "sbp"),
        ("dbp", "dbp"),
    )
    return ", ".join(
        f"{label}: {row[column]}"
        for label, column in labelled_columns
        if pd.notna(row[column])
    )
# Row-wise summary string of the triage vitals
df['initial_vitals'] = df.apply(create_vitals, axis=1)
## Create Patient Info from Gender, Race and Year
def create_patient_info(row):
    """Format gender, race and age of *row* as "Gender: ..., Race: ..., Age: ...".

    The gender codes "F" and "M" are spelled out as "Female" and "Male";
    any other value is inserted unchanged. Race and anchor_age are inserted
    as they are.
    """
    gender = row["gender"]
    readable_gender = {"F": "Female", "M": "Male"}.get(gender, gender)
    fields = (
        f"Gender: {readable_gender}",
        f"Race: {row['race']}",
        f"Age: {row['anchor_age']}",
    )
    return ", ".join(fields)
df['patient_info'] = df.apply(create_patient_info, axis=1)
## Cleaning and organizing the Dataframe for clarity
# The raw demographic and vital columns are now folded into the two summary strings
df = df.drop(columns=[
    "gender", "race", "anchor_age",
    "temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp",
])
# Fix the column order
df = df[[
    'stay_id', 'subject_id', 'hadm_id', "text",
    'patient_info', 'initial_vitals', 'pain', 'chiefcomplaint',
    'preprocessed_text', 'past_medication', 'tests',
    'acuity', 'icd_code', 'icd_title', 'icd_version',
]]
## acuity is the prediction target, so rows without it carry no useful information;
## rows without extracted tests are removed as well
df = df.dropna(subset=['acuity', 'tests'])
## convert nans to empty strings
for text_column in ("pain", "chiefcomplaint", "past_medication"):
    df[text_column] = df[text_column].fillna("")
## convert numpy.float64 to numpy.int64
df = df.astype({'acuity': np.int64, 'hadm_id': np.int64, 'icd_version': np.int64})
## rename acuity to triage
df = df.rename(columns={"acuity": "triage"})
## keep only the rows whose "text" mentions "history of present illness" (case-insensitive)
hpi = df['text'].str.contains('history of present illness', case=False, na=False)
hpi_index = hpi[hpi].index
df = df.loc[hpi_index]
## EXTRACT: HPI
def extract_hpi(text):
    """Cut *text* at the first following-section header that is present.

    The headers are tried in a fixed priority order: "past medical history:",
    then "social history:", then "family history:" (matched
    case-insensitively). The text before the first header found is returned
    with surrounding whitespace stripped. When none of the headers occurs the
    text is returned unchanged (not stripped).
    """
    lowered = text.lower()
    # The family-history fallback used to re-test the social-history position,
    # so it could never trigger; each header is now checked against its own position.
    for header in ('past medical history:', 'social history:', 'family history:'):
        cut = lowered.find(header)
        if cut != -1:
            return text[:cut].strip()
    return text
# Trim the preprocessed text down to the HPI part
df["HPI"] = df["preprocessed_text"].map(extract_hpi)
## EXTRACT: DIAGNOSIS
def extract_diagnosis(text):
    """Return the discharge-diagnosis section, prefixed with "Discharge Diagnosis: ".

    The section is the text after the last "Discharge Diagnosis:" header, cut
    at the first "Discharge Condition:" header after it (both matched
    case-sensitively). Every '<comma>' placeholder becomes ', '.
    """
    after_header = text.rpartition("Discharge Diagnosis:")[2]
    section = after_header.partition("Discharge Condition:")[0]
    return "Discharge Diagnosis: " + section.replace('<comma>', ', ')
df["diagnosis"] = df["text"].map(extract_diagnosis)
## PROCESS HPI
## keep rows whose HPI has more than 50 and fewer than 2000 characters
string_lengths = df['HPI'].str.len()
mask = (string_lengths < 2000) & (string_lengths > 50)
df = df[mask]
## keep rows whose tests have fewer than 3000 characters
string_lengths = df['tests'].str.len()
mask = string_lengths < 3000
df = df[mask]
## drop rows whose HPI is missing or empty
df = df[df['HPI'].notna() & (df['HPI'] != "")]
## HPI preprocess
def extract_only_hpi(text):
    """Truncate *text* where the ED course or initial vitals are first mentioned.

    Each pattern below is applied in turn (case-insensitive, '.' also matching
    newlines) and deletes everything from its first match to the end of the
    text.
    """
    cutoff_patterns = (
        r"in the ED, initial vital.*",
        r"in the ED initial vital.*",
        r"\bED Course.*",
        r"\bIn ED initial VS.*",
        r"in the ED, initial VS.*",
        r"\binitial VS.*",
        r"in the ED.*",
    )
    for pattern in cutoff_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.DOTALL)
    return text
# Register tqdm's progress_apply on pandas objects, then truncate every HPI
tqdm.pandas()
df["HPI"] = df["HPI"].progress_apply(extract_only_hpi)
## Drop the rows whose HPI still contains a word starting with "ED" (case-insensitive)
mask = df["HPI"].str.contains(r'\bED', case=False, na=False)
df = df.loc[~mask]
## drop rows without tests to be able to compare between normal user and expert
df = df[df['tests'].notna()]
## PROCESS DIAGNOSIS
## Removing Specific Headers, Unwanted Sections, and Irrelevant Records
## remove the header "discharge diagnosis"
def remove_header(text, header):
text = re.sub(re.compile(header, re.IGNORECASE), "", text)
return text
## Strip the "discharge diagnosis:" header from the diagnosis column
df['diagnosis'] = df['diagnosis'].map(lambda entry: remove_header(entry, "discharge diagnosis:"))
## Remove all content before and including the "Facility:\n___" marker
def delete_before_string(text):
    """Drop everything up to and including the last "Facility:\\n___" marker.

    Matching is case-insensitive and spans newlines; text without the marker
    is returned unchanged.
    """
    facility_marker = re.compile(r".*Facility:\n___", re.IGNORECASE | re.DOTALL)
    return facility_marker.sub("", text)
# Apply the marker removal defined just above to every diagnosis
df['diagnosis'] = df['diagnosis'].map(delete_before_string)
## Remove all content before and including the "___ Diagnosis:" marker
def delete_before_string(text):
    """Drop everything up to and including the last "___ Diagnosis:" marker.

    Matching is case-insensitive and spans newlines; text without the marker
    is returned unchanged. This definition replaces the earlier function of
    the same name from this point in the script onward.
    """
    diagnosis_marker = re.compile(r".*___ Diagnosis:", re.IGNORECASE | re.DOTALL)
    return diagnosis_marker.sub("", text)
# Apply the marker removal defined just above to every diagnosis
df['diagnosis'] = df['diagnosis'].map(delete_before_string)
## Remove all content after the "PMH" marker (Past Medical History)
def delete_after_string(text):
    """Drop everything from the first "PMH" occurrence to the end of *text*.

    Matching is case-insensitive and spans newlines; text without "PMH" is
    returned unchanged.
    """
    pmh_tail = re.compile(r"PMH.*", re.IGNORECASE | re.DOTALL)
    return pmh_tail.sub("", text)
df['diagnosis'] = df['diagnosis'].map(delete_after_string)
## FILTER ROWS with excessive Information to preserve prediction integrity
# Rows are removed when the given column contains one of the listed terms (case-insensitive):
#  - HPI: ' ER ', 'Emergency room', 'Emergency department', 'impression'
#    (likely references to the emergency setting that should not be in the text)
#  - diagnosis: 'deceased', 'died'
for leak_column, leak_terms in (
    ("HPI", (' ER ', 'Emergency room', 'Emergency department', 'impression')),
    ("diagnosis", ('deceased', 'died')),
):
    for leak_term in leak_terms:
        mask = df[leak_column].str.contains(leak_term, case=False, na=False)
        df = df[~mask]
# Also remove rows whose diagnosis contains 'history of present illness',
# so diagnosis fields don't inadvertently carry HPI content
mask_hpi = df["diagnosis"].str.contains('history of present illness', case=False, na=False)
df = df[~mask_hpi]
print(len(df))
## CREATE PRIMARY AND SECONDARY DIAGNOSIS
## Drop rows that mention "primary" somewhere but do not open with it
## (optional whitespace, a newline, then "primary"; case-insensitive)
mask = df["diagnosis"].str.contains('primary', case=False, na=False)
mask2 = df['diagnosis'].str.contains(r'^\s*\nprimary', flags=re.IGNORECASE, regex=True)
ind_drop = df.index[mask & ~mask2]
df = df.drop(index=ind_drop)
## Drop rows that mention "secondary" somewhere but never directly after a newline
mask = df["diagnosis"].str.contains('secondary', case=False, na=False)
mask2 = df['diagnosis'].str.contains('\nsecondary', flags=re.IGNORECASE, regex=True)
ind_drop = df.index[mask & ~mask2]
df = df.drop(index=ind_drop)
## Segregate Discharge Diagnosis into Primary and Secondary Categories with Post-Processing and Filtering
## Split each diagnosis at the first "secondary" (case-insensitive): the text before it is the
## primary diagnosis, the rest is the secondary diagnosis; without "secondary" the whole text
## is primary and the secondary diagnosis is ""
primary_parts = []
secondary_parts = []
for diagnosis_text in df["diagnosis"]:
    split_at = diagnosis_text.lower().find('secondary')
    if split_at == -1:
        primary_parts.append(diagnosis_text)
        secondary_parts.append("")
    else:
        primary_parts.append(diagnosis_text[:split_at])
        secondary_parts.append(diagnosis_text[split_at:])
df["primary_diagnosis"] = primary_parts
df["secondary_diagnosis"] = secondary_parts
## Remove any text after "___ Condition:"
def delete_after_string(text):
    """Drop everything from the first "___ Condition:" marker to the end of *text*.

    Matching is case-insensitive and spans newlines; text without the marker
    is returned unchanged. This definition replaces the earlier function of
    the same name from this point in the script onward.
    """
    condition_tail = re.compile(r"___ Condition:.*", re.IGNORECASE | re.DOTALL)
    return condition_tail.sub("", text)
# Apply the tail removal defined just above to every primary diagnosis
df['primary_diagnosis'] = df['primary_diagnosis'].map(delete_after_string)
## Filter rows in the DataFrame where 'primary_diagnosis' has fewer than 16 single newlines (less than 16 diagnoses)
def count_single_newlines(text):
    """Count the newlines in *text* that are neither preceded nor followed by another newline."""
    isolated_newline = r'(?<!\n)\n(?!\n)'
    return sum(1 for _ in re.finditer(isolated_newline, text))
# Count the single newlines of every primary diagnosis and keep the rows with fewer than 16
newline_counts = [count_single_newlines(entry) for entry in df['primary_diagnosis']]
mask = [single_newlines < 16 for single_newlines in newline_counts]
# The raw text columns are dropped once the rows have been filtered
df = df[mask].drop(columns=['text', 'preprocessed_text', 'past_medication'])
## convert primary and secondary diagnoses into a list of diagnoses for each patient
## replace colon without \n to colon with \n
def colon_replacement(text):
    """Make every colon be followed by exactly one newline.

    A colon together with any whitespace directly after it (including
    newlines) is replaced by ':' plus a single newline, so whatever follows
    the colon starts on its own line.
    """
    colon_with_whitespace = re.compile(r":\s*(?!\n)")
    return colon_with_whitespace.sub(':\n', text)
df['primary_diagnosis'] = df['primary_diagnosis'].map(colon_replacement)
df['secondary_diagnosis'] = df['secondary_diagnosis'].map(colon_replacement)
## make diagnosis into a list for each row: one entry per non-blank line
liste = df['primary_diagnosis'].apply(lambda x: [s for s in x.split('\n') if s.strip()] if pd.notna(x) else x)
## drop header lines (anything containing "primary", which also covers
## "primary diagnosis"/"primary diagnoses"), "====" separator lines and empty strings
liste = liste.apply(
    lambda lst: [
        item for item in lst
        if "primary" not in item.lower()
        and "====" not in item.lower()
        and item.lower() != ""
    ]
)
def remove_number_prefix(item):
    """Strip a leading "1)" to "8)" enumeration marker and the whitespace after it.

    Items that do not start with a single digit 1-8 directly followed by ')'
    are returned unchanged.
    """
    enumeration_prefix = re.compile(r'^[1-8]\)\s*')
    return enumeration_prefix.sub('', item)
# Remove the "n)" enumeration markers and store the primary diagnoses as lists
liste = liste.apply(lambda lst: list(map(remove_number_prefix, lst)))
df["primary_diagnosis"] = liste
# Same list construction for the secondary diagnoses (missing values become "")
df['secondary_diagnosis'] = df['secondary_diagnosis'].fillna("")
liste = df['secondary_diagnosis'].apply(lambda x: [s for s in x.split('\n') if s.strip()])
# drop header lines (anything containing "secondary", which also covers
# "secondary diagnosis"/"secondary diagnoses"), "====" separator lines and empty strings
liste = liste.apply(
    lambda lst: [
        item for item in lst
        if "secondary" not in item.lower()
        and "====" not in item.lower()
        and item.lower() != ""
    ]
)
def remove_number_prefix(item):
    """Strip a leading "1)" to "8)" enumeration marker and the whitespace after it.

    Items that do not start with a single digit 1-8 directly followed by ')'
    are returned unchanged. This is a second definition under a name that is
    already defined earlier in the script, with the same pattern.
    """
    enumeration_prefix = re.compile(r'^[1-8]\)\s*')
    return enumeration_prefix.sub('', item)
# Remove the "n)" enumeration markers and store the secondary diagnoses as lists
liste = liste.apply(lambda lst: list(map(remove_number_prefix, lst)))
df["secondary_diagnosis"] = liste
## Keep the first 2200 rows (the goal is to predict 2000; the extra 200 are a reserve in case
## rows need to be removed - see postprocessing and additional postprocessing)
df = df.head(2200)
## save file
df.to_csv(
    'MIMIC-IV-Ext-Triage-Specialty-Diagnosis-Decision-Support.csv',
    index=False,
)