-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_bloodcells.R
156 lines (139 loc) · 6.77 KB
/
get_bloodcells.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
library(data.table)
library(dplyr)
input.file1 <- file.path("/gpfs/data/xhe-lab/uk-biobank/data/phenotypes",
"12-feb-2019","ukb26140.csv.gz")
input.file2 <- file.path("/gpfs/data/xhe-lab/uk-biobank/data/phenotypes",
"11-jun-2019","ukb32141.csv.gz")
input.file3 <- file.path("/gpfs/data/xhe-lab/uk-biobank/data/phenotypes",
"16-oct-2020","ukb44231.csv.gz")
output.file <- file.path("/gpfs/data/stephens-lab/finemap-uk-biobank",
"data/raw/BloodCells/bloodcells.csv")
cols <- c("eid","31-0.0","54-0.0","21022-0.0","22006-0.0",
"22001-0.0","22000-0.0","22005-0.0",paste0("22009-0.",1:10),
"22021-0.0", "22027-0.0",
"30000-0.0", "30010-0.0", "30020-0.0", "30040-0.0",
"30070-0.0", "30080-0.0", "30090-0.0",
"30110-0.0", "30180-0.0", "30190-0.0", "30200-0.0",
"30210-0.0", "30220-0.0", "30240-0.0",
"30270-0.0", "30290-0.0",
paste0("41202-0.", 0:379), "3140-0.0")
col_names <- c("id","sex","assessment_centre","age","ethnic_genetic",
"sex_genetic","genotype_measurement_batch","missingness",
paste0("pc_genetic",1:10),
"kinship_genetic", "outliers",
"WBC_count", "RBC_count", "Haemoglobin", "MCV",
"RDW", "Platelet_count", "Plateletcrit",
"PDW", "Lymphocyte_perc", "Monocyte_perc", "Neutrophill_perc",
"Eosinophill_perc", "Basophill_perc", "Reticulocyte_perc",
"MSCV", "HLR_perc",
paste0('ICD10.',0:379), "pregnancy")
cat("Reading data from the CSV files.\n")
out <- system.time({
dat1 <- fread(input.file1,sep = ",",header = TRUE,verbose = FALSE,
showProgress = FALSE,colClasses = "character");
dat2 <- fread(input.file2,sep = ",",header = TRUE,verbose = FALSE,
showProgress = FALSE,colClasses = "character");
dat3 <- fread(input.file3,sep = ",",header = TRUE,verbose = FALSE,
showProgress = FALSE,colClasses = "character")
})
class(dat1) <- "data.frame"
class(dat2) <- "data.frame"
class(dat3) <- "data.frame"
cat(sprintf("Data loading step took %d seconds.\n",round(out["elapsed"])))
dat12 <- inner_join(dat1,dat2,by = "eid")
dat <- inner_join(dat12, dat3, by='eid')
rm(dat1,dat2,dat3,dat12)
cat(sprintf("Merged table contains %d rows.\n",nrow(dat)))
## There are 502492 samples
# PREPARE DATA
# ------------
# Select the requested columns.
cat("Preparing data.\n")
dat <- dat[,cols]
names(dat) <- col_names
# Convert numerical columns except the first one (the first column contains
# the sample ids) to numeric values, and set all empty strings to NA.
n <- length(dat)
for (i in 2:n) {
x <- dat[,i]
x[x == ""] <- as.character(NA)
if(grepl('ICD10', colnames(dat)[i])){
dat[,i] <- x
}else{
dat[,i] <- as.numeric(x)
}
}
# Remove any samples that are not marked as being "White British".
# This step should filter out 92887 rows.
dat = dat %>% filter(ethnic_genetic == 1)
cat(sprintf("After removing non White British, %d rows remain.\n",nrow(dat)))
# Remove all rows in which one or more of the values are missing,
# aside from the in the "outlier", "ICD10", "pregnancy" columns.
# The "outliers" have value 1 when it is an outlier, NA otherwise.
# The "pregnancy" have value NA for males.
# This step should filter out 18578 rows.
cols <- !(grepl(paste(c('ICD10', "outliers", "pregnancy"),collapse = '|'), names(dat)))
rows <- which(rowSums(is.na(dat[,cols])) == 0)
dat <- dat[rows,]
cat(sprintf("After removing rows with NAs, %d rows remain.\n",nrow(dat)))
# Remove rows with mismatches between self-reported and genetic sex
# This step should filter out 287 rows.
dat <- dat %>% filter(sex == sex_genetic)
cat(sprintf("After removing sex mismatches, %d rows remain.\n",nrow(dat)))
# Remove "missingness" and "heterozygosity" outliers as defined by UK
# Biobank. This step should filter out 665 rows. Note that this step
# will remove any samples in which the "missingness" column is greater
# than 5%.
dat <- dat %>% filter(is.na(outliers))
cat(sprintf("After removing outliers, %d rows remain.\n",nrow(dat)))
# Remove any individuals have at leat one relative based on the
# kinship calculations. This step should filter out 126,236 rows.
dat <- dat %>% filter(kinship_genetic == 0)
cat(sprintf(paste("After removing relatedness individuals based on kinship,",
"%d rows remain.\n"),nrow(dat)))
# Remove any pregnant individuals
# This step should filter out 164 rows.
dat <- dat %>% filter(!(pregnancy %in% c(1,2)))
cat(sprintf(paste("After removing pregnant individuals,",
"%d rows remain.\n"),nrow(dat)))
# Remove any individuals with blood related diseases
# This step should filter out 6070 rows
icd10 = c('C94', 'C95', 'Z856', "C901", "C914", "C82", "C83", 'C84', "C85", "Z948",
"Z511", "Z512", "Z542", "D46", paste0("D", 55:64), paste0("B", 20:24),
"N180", "Z992", "Z491", "Z492", "K74", "C88", "C900", "C902", "C91", "C92",
"D45", "D47", "E831")
daticd10 = dat %>% select(which(grepl('ICD10', colnames(dat)))) %>% as.matrix
icd_status = matrix(grepl(paste(icd10, collapse='|'), daticd10), nrow(daticd10), ncol(daticd10))
dat = dat %>% filter(rowSums(icd_status) == 0)
cat(sprintf(paste("After removing individuals with blood diseases,",
"%d rows remain.\n"),nrow(dat)))
# Remove individuals with "abnormal" measurements.
# This step should filter out 8625 rows
pheno_names = c("WBC_count", "RBC_count", "Haemoglobin", "MCV", "RDW", "Platelet_count",
"Plateletcrit", "PDW", "Lymphocyte_perc", "Monocyte_perc",
"Neutrophill_perc", "Eosinophill_perc", "Basophill_perc",
"Reticulocyte_perc", "MSCV", "HLR_perc")
## RINT
for(name in pheno_names){
id = which(colnames(dat) == name)
dat[,id] = qnorm((rank(dat[,id],na.last="keep")-0.5)/sum(!is.na(dat[,id])))
}
## compute empirical covariance matrix
covy = dat %>% select(pheno_names) %>% cov
D2 = stats::mahalanobis(dat %>% select(pheno_names), center=0, cov=covy) ## mahalanobis distance
dat = dat[D2 < qchisq(0.01, df=16, lower.tail = F),]
cat(sprintf(paste("After removing individuals with abnormal measurements,",
"%d rows remain.\n"),nrow(dat)))
# Finally, remove the columns that are no longer needed for subsequent
# analyses.
cols.to.remove <- c("sex_genetic","ethnic_genetic",
"missingness",
"kinship_genetic","outliers","pregnancy",paste0("ICD10.", 0:379))
cols <- which(!is.element(names(dat),cols.to.remove))
dat <- dat[,cols]
# SUMMARIZE DATA
# --------------
# Double-check that everything looks okay.
summary(dat)
cat("Writing prepared data to CSV file.\n")
write.csv(dat,output.file,row.names = FALSE,quote = FALSE, na='NA')