hla_binding_preferences.Rmd

---
title: "Self and foreign protein presentation preferences of HLA alleles"
output: html_document
---

Setup

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

if (!startsWith(as.character(getRversion()), "3.6")) {
  stop("R v3.6 required in order to execute this Rmd file")
}

library(tidyverse)
library(broom)
library(data.table)
library(limma) #bioc
library(topGO)
library(org.Hs.eg.db) #bioc
library(ontologyIndex)
library(ggbeeswarm)
library(ggplot2)
library(Biostrings) #bioc
library(ggrepel)
library(circlize)
library(multipanelfigure)
library(ggsci)
library(latex2exp)
library(ggdist)
library(ggpubr)
library(ComplexHeatmap)
library(rstatix)
library(ontologySimilarity) 
library(gridExtra)
library(cowplot)

# Identify the host operating system.
# Returns a lower-cased, length-one character value: "osx" when Sys.info()
# reports "Darwin", otherwise the lower-cased sysname. If Sys.info() returns
# NULL, falls back on .Platform$OS.type refined by R.version$os.
get_os <- function() {
  info <- Sys.info()
  if (is.null(info)) {
    ## mystery machine: no Sys.info(), rely on build-time platform details
    platform <- .Platform$OS.type
    if (grepl("^darwin", R.version$os)) {
      platform <- "osx"
    }
    if (grepl("linux-gnu", R.version$os)) {
      platform <- "linux"
    }
    return(tolower(platform))
  }
  platform <- info["sysname"]
  if (platform == "Darwin") {
    platform <- "osx"
  }
  tolower(platform)
}

# read_gz(x): read a gzip-compressed table by streaming it through a shell
# decompressor into data.table::fread(). "gzcat" is used when get_os() reports
# "osx", "zcat" otherwise.
#   x  path to the .gz file (length-one character)
# The command is passed through `cmd =` so fread never has to guess whether its
# input is a file name or a shell command, and the path is shell-quoted so that
# file names containing spaces or shell metacharacters are handled safely.
read_gz <- local({
  decompressor <- if (get_os() == "osx") "gzcat" else "zcat"
  function(x) fread(cmd = paste(decompressor, shQuote(x)))
})

rename <- dplyr::rename
select <- dplyr::select
```

## 1. HLA ligand-enriched genes and their GO analysis

### Load and preprocess binding predictions

Read in netMHCpan binders

```{r}
data.pred.full <- read_gz("data/human_proteome_netmhcpan_summary.txt.gz")

data.pred.full %>%
  group_by(hla, ligand.len) %>%
  summarize(count = sum(count)) %>%
  arrange(-count) %>%
  head()
```

Select data for pilot analysis

```{r}
data.pred.full %>%
  filter(ligand.len == 9, hla %in% c("HLA-A1101", "HLA-A0201", 
                                     "HLA-A0101", "HLA-A0301", 
                                     "HLA-B0702", "HLA-B2705",
                                     "HLA-B0801", "HLA-B5701", # HIV
                                     "HLA-C0702", "HLA-C1502", # SARS
                                     "HLA-C0202", "HLA-C0801")) -> data.pred.sel

data.pred.sel  %>%
  group_by(hla) %>%
  summarize(count = sum(count)) %>%
  arrange(-count)
```

Get protein lengths and Entrez IDs

```{r}
human_prots <- readAAStringSet("data/UP000005640_9606.fasta") %>%
  as.character

# ! this provides uniprot <> entrez conversions
meta.prot <- data.table(uniprot.id2 = str_split_fixed(names(human_prots), "[\\| ]", 4)[,3],
                        protein.len = nchar(human_prots)) %>%
  mutate(name = str_split_fixed(uniprot.id2, "_", 2)[,1]) %>%
  merge(fread("data/uniprot_to_entrez.txt"))
```

Compute base statistics - observed and expected number of presented peptides per gene. Expected number is an average across a given HLA

```{r}
# Per-protein ligand enrichment statistics for each selected allele.
# Adds to every (protein, allele) row:
#   P0          - allele-wide background rate: total ligand count divided by
#                 the summed (protein.len - ligand.len) over all proteins
#   odds        - log2 ratio of the protein's pseudocounted ligand rate to P0
#   p.value     - two-sided exact binomial test of the pseudocounted count
#                 against P0
#   p.value.adj - Benjamini-Hochberg adjustment within each allele
data.pred.sel.s <- data.pred.sel %>%
  as.data.frame %>%
  # no `by` given: joins on all columns shared with meta.prot
  merge(meta.prot) %>%
  group_by(hla) %>%
  # NOTE(review): the denominator here uses protein.len - ligand.len, while the
  # per-protein trial count below uses protein.len + 1 - ligand.len - confirm
  # the difference is intended
  mutate(P0 = sum(count) / sum(protein.len - ligand.len)) %>%
  ungroup %>%
  # binom.test is not vectorised, so evaluate it one row at a time
  rowwise() %>%
  # NOTE(review): binom.test() needs successes <= trials and trials > 0;
  # presumably no protein is shorter than the ligand or saturated with
  # ligands - verify against the input data
  mutate(odds = log2((count + 1) / (protein.len + 1 - ligand.len) / P0),
         p.value = binom.test(count + 1, protein.len + 1 - ligand.len, 
                              p = P0, alternative = "two.sided")$p.value) %>%
  # regrouping by hla also drops the rowwise state before p.adjust
  group_by(hla) %>%
  mutate(p.value.adj = p.adjust(p.value, method = "BH")) %>%
  ungroup
```

### Human proteins enriched and depleted in HLA ligands

```{r}
#! hla.gene column is not used further
# Classify every (protein, allele) row and bin proteins by length:
#   sel           -  1 : ligand-enriched (odds >=  1 and adjusted P < 0.05)
#                   -1 : ligand-depleted (odds <= -1 and adjusted P < 0.05)
#                    0 : otherwise
#   protein.len.q - protein-length quartile (factor with four levels)
data.pred.sel.s <- data.pred.sel.s %>%
  mutate(hla.gene = substr(hla, 5, 5),
         sel = ifelse(odds >= 1 & p.value.adj < 0.05, 1,
                      ifelse(odds <= -1 & p.value.adj < 0.05, -1,
                             0)),
         # include.lowest = TRUE keeps the shortest protein(s) in the first
         # quartile; by default cut() uses (lo, hi] intervals, so a value equal
         # to the minimum break would otherwise become NA
         protein.len.q = cut(protein.len, quantile(protein.len),
                             include.lowest = TRUE))
```

Motifs of surveyed HLA alleles
```{r FigS1, fig.width=10, fig.height=12}
pfigs1 <- ggdraw() + 
  draw_image("motifs/FigS1/HLA-A0201.png", x = .5, y = 1.05, hjust = 1, vjust = 1, width = 0.5, height = .45) +
  draw_image("motifs/FigS1/HLA-A1101.png", x = 1, y = 1.05, hjust = 1, vjust = 1, width = 0.5, height = .45) +
  draw_image("motifs/FigS1/HLA-B0702.png", x = .5, y = .72, hjust = 1, vjust = 1, width = 0.5, height = .45) +
  draw_image("motifs/FigS1/HLA-B2705.png", x = 1, y = .72, hjust = 1, vjust = 1, width = 0.5, height = .45) +
  draw_image("motifs/FigS1/HLA-C0202.png", x = .5, y = .39, hjust = 1, vjust = 1, width = 0.5, height = .45) +
  draw_image("motifs/FigS1/HLA-C1502.png", x = 1, y = .39, hjust = 1, vjust = 1, width = 0.5, height = .45) +
  draw_plot_label(c("HLA-A0201", "HLA-A1101", "HLA-B0702", "HLA-B2705", "HLA-C0202", "HLA-C1502"), 
                  x = c(.15, .65, .15, .65, .15, .65), y = c(1, 1, .67, .67, .34, .34))

pfigs1

pdf("figures/FigS1.pdf", height = 12, width = 10)
pfigs1
dev.off()
```

Plot over- and under-represented proteins according to HLA ligand frequency

```{r Fig2a, fig.width = 6, fig.height = 10}
#! excl_for_now may be excessive
# Alleles loaded for the pilot analysis but left out of the main figures
excl_for_now <- c(
  "HLA-A0301","HLA-A0101",
  "HLA-B5701", "HLA-B0801", 
  "HLA-C0801", "HLA-C0702")

# Alleles shown in the figures; this ordering is used for the facet / axis
# levels of Fig. 2a and Fig. 2b
hla_incl_vert <- c("HLA-A0201", "HLA-A1101", 
                   "HLA-B0702", "HLA-B2705", 
                   "HLA-C0202", "HLA-C1502")

# Same six alleles in a different order, used elsewhere for filtering and
# as factor levels
hla_incl <- c("HLA-A0201", "HLA-B0702", "HLA-C0202",
              "HLA-A1101", "HLA-B2705", "HLA-C1502")

# Fig. 2a: volcano plot per allele - log2 observed/expected ligand ratio (x)
# against -log10 P capped at 20 (y); point size is the ligand count and
# colour is the `sel` class.
# NOTE(review): the y aesthetic uses the raw `p.value`, while the axis label
# reads "Pvalue_adj" and `sel` is derived from `p.value.adj` - confirm which
# one is intended.
# NOTE(review): colour labels are matched to `sel` levels by position
# (-1, 0, 1); this assumes all three classes are present - verify.
pfig2a <- data.pred.sel.s %>%
  filter(!(hla %in% excl_for_now)) %>%
  mutate(hla = factor(hla, levels = hla_incl_vert)) %>%
  ggplot(aes(x = odds, y = pmin(-log10(p.value), 20), size = count,
             color = sel %>% as.factor())) +
  geom_point() +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey") +
  #geom_hline(yintercept = 3, linetype = "dashed", color = "grey") +
  scale_size_continuous("Number of\npeptides", breaks = c(1, 10, 100, 1000)) +
  scale_color_manual("Ligand -", 
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted", "No change", "Enriched")) +
  # x axis is limited to [-5, 5] via the scale limits
  scale_x_continuous(TeX("$log_{2}\\,\\left[\\frac{ligands_{obs}}{ligands_{exp}}\\right]$"), limits = c(-5, 5)) +
  ylab(TeX("$-log_{10}\\~Pvalue_{adj}$")) +
  facet_wrap(~hla, ncol = 2) + #facet_wrap(~hla, nrow = 2) +
  theme_pubclean() + 
  theme(legend.position = "bottom", 
        legend.box = "vertical")
pfig2a
```

```{r TableS1}
pst1 <- data.pred.sel.s %>%
  filter(hla %in% hla_incl,
         sel == 1) %>%
  group_by(hla) %>%
  summarize(enriched.genes = length(unique(entrez.id))) %>%
  merge(data.pred.sel.s %>%
          filter(hla %in% hla_incl,
                 sel == -1) %>%
    group_by(hla) %>%
          summarize(depleted.genes = length(unique(entrez.id)))) %>%
  merge(data.pred.sel %>%
          group_by(hla) %>%
          summarize(ligand.count = sum(count)) ) %>%
  arrange(hla) %>% 
  setNames(., c("Allele", "Number of\n enriched genes", 
                "Number of\n depleted genes", "Ligand count")) 

ggtexttable(pst1, rows = NULL,
                  theme = ttheme("classic"))
pst1 
```

### Presentation preferences based on source human protein length

```{r}
data.pred.sel.s.len <- data.pred.sel.s %>%
  filter(!(hla %in% excl_for_now)) %>%
  group_by(hla, sel, protein.len.q) %>%
  summarize(count = n()) %>%
  group_by(hla, sel) %>%
  mutate(total = sum(count)) %>%
  mutate(p = count / total)
```

```{r FigS2}
# Fig. S2: for each allele and `sel` class, the fraction of proteins falling
# into each protein-length quartile, drawn as dodged bars with an upper error
# bar of sqrt(p * (1 - p) / total).
pfigs2 <- data.pred.sel.s.len %>%
  # full outer join with the complete (allele, class, quartile) grid so that
  # combinations without any protein get an explicit row (p set to 0 below)
  merge(expand.grid(hla = data.pred.sel.s.len$hla %>% unique,
                    sel = data.pred.sel.s.len$sel %>% unique,
                    protein.len.q = data.pred.sel.s.len$protein.len.q %>% unique),
        all = TRUE) %>%
  mutate(p = ifelse(is.na(p), 0, p)) %>%
  mutate(hla = factor(hla, levels = hla_incl)) %>%
  ggplot(aes(x = protein.len.q %>% as.integer,
             y = p,
             fill = sel %>% as.factor,
             color = sel %>% as.factor)) +
  geom_errorbar(aes(ymin = p, ymax = p + sqrt(p * (1-p) / total)),
                position = "dodge") +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual("",
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted","No change","Enriched")) +
  scale_color_manual("",
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted","No change","Enriched")) +
  xlab("Protein length quartile") + ylab("Fraction of proteins") +
  facet_wrap(~hla, nrow = 2) +
  theme_pubclean() +
  # spell out aspect.ratio instead of relying on partial argument matching
  theme(aspect.ratio = 1, legend.position = "bottom")

pfigs2

pdf("figures/FigS2.pdf", height = 4, width = 7)
pfigs2
dev.off()
```


### GO analysis

http://web.mit.edu/~r/current/arch/i386_linux26/lib/R/library/limma/html/goana.html

```{r}
# Over-representation analysis of a gene set against a background universe.
#   .set1       gene IDs of interest (duplicates are dropped)
#   .set2       background gene IDs; .set1 is always added to the universe.
#               Defaults to all Entrez IDs in meta.prot
#   .threshold  keep only terms whose BH-adjusted P.DE is below this value
#   .fun        enrichment function, called as
#               .fun(genes, universe = , FDR = ); limma::goana by default
# Returns a tibble: the table produced by .fun with its row names moved into a
# leading `Term.ID` column, plus
#   total.N / total.DE  sizes of the universe and of the gene set
#   fold                log2((DE / total.DE) / (N / total.N))
#   P.DE.adj            BH-adjusted P.DE
# sorted by P.DE, then by decreasing fold.
get_annots <- function(.set1, .set2 = meta.prot$entrez.id,
                       .threshold = 0.01, .fun = goana) {
  .set1 <- unique(.set1)
  .set2 <- unique(c(.set1, .set2))
  .fun(.set1, universe = .set2,
       FDR = 1.01) %>% # we'll do our own FDR filtering
    # add_rownames() is deprecated; this is its documented replacement,
    # followed by as_tibble() to keep the tibble return type
    tibble::rownames_to_column("Term.ID") %>%
    as_tibble() %>%
    mutate(total.N = length(.set2),
           total.DE = length(.set1),
           fold = log2(DE / total.DE / N * total.N),
           P.DE.adj = p.adjust(P.DE, method = "BH")) %>%
    filter(P.DE.adj < .threshold) %>%
    arrange(P.DE, -fold)
}

# Enrichment of the proteins of one `sel` class for one allele, using that
# allele's unchanged (sel == 0) proteins as background.
#   .sel        class to test (value compared against the `sel` column)
#   .hla        allele name (value compared against the `hla` column)
#   data        table with hla, sel and entrez.id columns
#   .threshold  forwarded to get_annots()
#   .fun        forwarded to get_annots()
# Returns the get_annots() table with `hla` and `sel` columns set to the inputs.
get_annots1 <- function(.sel, .hla, data = data.pred.sel.s,
                        .threshold = 0.01, .fun = goana) {
  allele_rows <- filter(data, hla == .hla)
  foreground <- filter(allele_rows, sel == .sel)$entrez.id
  background <- filter(allele_rows, sel == 0)$entrez.id
  annotated <- get_annots(foreground, background, .threshold, .fun)
  mutate(annotated, hla = .hla, sel = .sel)
}

# Run get_annots() for every allele and every non-zero `sel` class present in
# the data; the allele's sel == 0 proteins serve as the background set.
# The result stays grouped by (sel, hla).
go.enr.1 <- data.pred.sel.s %>%
  distinct(hla, sel) %>%
  filter(sel != 0) %>%
  group_by(sel, hla) %>%
  group_modify(function(.x, .y) {
    # .y holds the group keys (one row with sel and hla)
    allele_rows <- filter(data.pred.sel.s, hla == .y$hla)
    get_annots(filter(allele_rows, sel == .y$sel)$entrez.id,
               filter(allele_rows, sel == 0)$entrez.id)
  })
```

```{r Fig2b, fig.height=14, fig.width=10}
# Shorten space-separated labels to at most `len` tokens.
#   x    character vector of labels
#   len  maximum number of tokens to keep (ellipsis included)
# Labels with more than `len` words are cut to their first len - 1 words
# followed by "..."; labels with `len` words or fewer are returned unchanged,
# so a label that already fits never loses its last word.
# Returns a character vector the same length as x (character(0) for empty
# input).
trunc_str <- function(x, len = 6) {
  vapply(strsplit(x, " "), function(words) {
    if (length(words) > len) {
      words <- c(words[seq_len(len - 1)], "...")
    }
    paste0(words, collapse = " ")
  }, character(1))
}

go.top.1 <- go.enr.1 %>%
  group_by(sel, hla) %>%
  filter(rank(P.DE.adj) <= 20) %>%
  .$Term.ID %>% unique

pfig2b <- go.enr.1 %>%
  filter(!(hla %in% excl_for_now),
         Term.ID %in% go.top.1) %>%
  mutate(Term = trunc_str(Term, 4),
         direction = ifelse(sel > 0, "enriched", "depleted")) %>%
  ggplot(aes(x = factor(hla, levels = hla_incl_vert),
             y = fct_reorder2(paste(Ont, Term), fold, paste(sel, hla)))) +
  geom_quasirandom(aes(size = fold, fill = direction), shape = 21, width = 0.2) + 
  #geom_text(aes(label = paste0(round(100 * DE / total.DE, 0), "%"))) +
  scale_y_discrete("", position = "right") + 
  #scale_x_discrete("", position = "top") + 
  xlab("") +
  scale_fill_manual("HLA ligand -", values = c("#56b4df", "#e69d00")) +
  #scale_color_distiller("-log10 Padj", palette = "Reds", direction = 1) +
  scale_size_continuous("GO term enrinchment\nfold, log2", breaks = c(1, 2 ,4)) +
  theme_pubclean() +
  theme(legend.position = "bottom", 
        legend.box = "vertical",
        axis.text.x = element_text(angle = 90, vjust = 0.5),
        axis.text.y = element_text(family = "mono", size = 8))
pfig2b
```

```{r Fig2, fig.height = 14, fig.width = 14}
pfig2 <- ggdraw() +
  draw_plot(pfig2a, x = 0, y = 0, width = .42, height = 1) +
  draw_plot(pfig2b, x = .43, y = -.1, width = .57, height = 1.1) +
  draw_plot_label(label = c("A", "B"), size = 11,
                  x = c(0, .42), y = c(1, 1))

pfig2

pdf("figures/Fig2.pdf", height = 14, width = 14)
pfig2
dev.off()
```


### Amino acid composition of HLEPs

```{r Fig3a}
# Amino acid composition of a single protein.
#   prot.seq  protein sequence (coerced to character, split into residues)
#   id_name   identifier stored in the `id` column of every output row
# Residues "X" and "U" are discarded before frequencies are computed, so
# `freq` sums to 1 over the remaining residues.
# Returns a tibble with columns aminoacid, freq, id.
aa_freq_count <- function(prot.seq, id_name) {
  residues <- unlist(strsplit(as.character(prot.seq), split = ""))
  # naming the table dimension yields the `aminoacid` column directly
  counts <- as_tibble(table(aminoacid = residues))
  counts <- filter(counts, !aminoacid %in% c("X", "U"))
  counts <- mutate(counts, freq = n / sum(n), id = id_name)
  select(counts, aminoacid, freq, id)
}

#amino acid composition of all proteins from human proteome
# One aa_freq_count() table per protein, stacked into a single table with
# columns aminoacid, freq, uniprot.id2. Sequences and IDs are iterated in
# parallel rather than via apply() on a data.table, which would first coerce
# the whole table to a character matrix. USE.NAMES = FALSE avoids labelling
# the intermediate list with the full sequences.
aa.freq.prots <- mapply(aa_freq_count,
                        human_prots,
                        str_split_fixed(names(human_prots), "[\\| ]", 4)[, 3],
                        SIMPLIFY = FALSE, USE.NAMES = FALSE) %>%
  rbindlist() %>%
  rename(uniprot.id2 = id)
  
#amino acid composition of proteins enriched or depleted for selected alleles
data.pred.sel.s.aafreq <- data.pred.sel.s %>%
  filter(hla %in% hla_incl) %>%
  select(uniprot.id2, hla, sel) %>%
  distinct() %>%
  merge(aa.freq.prots) %>% 
  group_by(hla, sel, aminoacid) %>%
  summarise(freq.mean = mean(freq),
            sem = sd(freq)/sqrt(n())) %>% 
  ungroup()

hla.anchor.res <- tibble(hla = c(rep("HLA-A0201", 2), "HLA-A1101", "HLA-B0702", "HLA-B2705", rep("HLA-C0202", 3), rep("HLA-C1502", 3)), aminoacid = c("L", "V", "K", "P", "R", "F", "Y", "L", "L", "I", "V"), anchor = 1)

data.pred.sel.s.aafreq <-  data.pred.sel.s.aafreq %>% 
  merge(hla.anchor.res, all = T) %>%
  mutate(anchor = replace_na(anchor, -1))

aa.levels <- c("L","F","I","M","V","W","Y","C","H","A","G","P","T","S","Q","N","D","E","R","K")

pfig3a <- data.pred.sel.s.aafreq %>%
  mutate(sel = factor(sel, levels = c("-1", "0", "1"))) %>% 
  filter(sel != "-1") %>% 
  ggplot(aes(x = factor(aminoacid, levels = aa.levels), color = sel, group = sel)) +
  geom_bar(aes(y = anchor), stat = "identity", fill = "grey", color = 'white', alpha = 0.2) +
  geom_line(aes(y = freq.mean)) +
  geom_errorbar(aes(ymin=freq.mean-1.96*sem, ymax=freq.mean+1.96*sem), width=.1) +
  facet_wrap(~hla, ncol = 2, scales = "free_x") +
  scale_color_manual("", 
                     values = c("#000000", "#e69d00"),
                     labels=c("Human proteome", "HLA ligand-enriched\n proteins")) +
  xlab("") + ylab("Frequency of amino acid") +
  coord_cartesian(ylim = c(0.005,0.15)) +
  theme_pubclean() +
  theme(legend.position = "bottom", 
        legend.box = "vertical") 
  
pfig3a
```

### Amino acid composition of proteins of GO terms depleted for all alleles but HLA-B0702

```{r Fig3b}
data(gene_GO_terms)

#Which genes correspond to particular GO terms
go_genes <- plyr::ldply(gene_GO_terms, rbind) %>%
  melt(id=1, value.name = "Term.ID") %>%
  select(uniprot.id2 = .id, Term.ID) %>%
  filter(Term.ID != '')

#Which GO terms are depleted for most of alleles
go.enr.1 %>% 
  filter(hla %in% hla_incl) %>% 
  group_by(Term.ID) %>% 
  summarise(sel.sum = sum(sel),
            alleles = paste0(substring(hla, 6,10), collapse = ' ')) %>%
  arrange(sel.sum)

#Note that there is a group of 14 GO terms that are common for HLDPs of all alleles but HLA-B0702

go.depl <- go.enr.1 %>% 
  filter(hla %in% hla_incl,
         hla != "HLA-B0702") %>% 
  group_by(Term.ID) %>% 
  summarise(sel.go = sum(sel)) %>% 
  filter(sel.go == min(sel.go))

pfig3b <- go.enr.1 %>%
  ungroup() %>% 
  filter(Term.ID %in% go.depl$Term.ID) %>% 
  select(Ontology = Ont, Term) %>% 
  distinct() %>% 
  mutate(Term = str_wrap(Term, 40)) %>% 
  arrange(Ontology) %>% 
  ggtexttable(rows = NULL,
                  theme = ttheme("classic", base_size = 8, padding = unit(c(2, 2), "mm")))


go.depl.genes <- go.depl %>%
  merge(go_genes) %>% 
  merge(data.pred.sel.s %>% 
          mutate(uniprot.id2 = str_split_fixed(uniprot.id2, "_", 4)[,1]) %>%
          filter(hla %in% hla_incl,
                 hla != "HLA-B0702",
                 sel < 0) %>%
          distinct(uniprot.id2))

go.depl.aafreq <- go.depl.genes %>% 
  merge(aa.freq.prots %>% 
          mutate(uniprot.id2 = str_split_fixed(uniprot.id2, "_", 4)[,1])) %>% 
  bind_rows(aa.freq.prots %>% 
          mutate(sel.go = 0)) %>% 
  group_by(sel.go, aminoacid) %>%
  summarise(freq.mean = mean(freq),
            sem = sd(freq)/sqrt(n())) 


pfig3c <- go.depl.aafreq %>%
  mutate(sel.go = factor(sel.go)) %>% 
  ggplot(aes(x = factor(aminoacid, levels = aa.levels), color = sel.go, group = sel.go)) +
  geom_line(aes(y = freq.mean)) +
  geom_errorbar(aes(ymin=freq.mean-1.96*sem, ymax=freq.mean+1.96*sem), width=.1) +
  scale_color_manual("", 
                     values = c("#56b4df", "#000000"),
                     labels=c("Proteins for GO terms depleted\n in all HLAs but HLA-B0702", "Human proteome")) +
  xlab("") + ylab("Frequency of amino acid") +
  coord_cartesian(ylim = c(0.005,0.15)) +
  theme_pubclean() +
  theme(legend.position = "bottom", 
        legend.box = "vertical") 

pfig3c 
```

```{r Fig3, fig.width=7, fig.height=8}
pfig3 <- ggdraw() +
  draw_plot(pfig3a, x = 0, y = .43, width = 1, height = .57) +
  draw_plot(pfig3b, x = .15, y = 0, width = .1, height = .4) +
  draw_plot(pfig3c, x = .4, y = 0, width = .58, height = .4) +
  draw_plot_label(label = c("A", "B", "C"), size = 11,
                  x = c(0, 0, .4), y = c(1, .42, .42))

pfig3

pdf("figures/Fig3.pdf", height = 8, width = 7)
pfig3
dev.off()
```


### Different amino acid composition of proteins of different length quartiles

```{r FigS6a}
# Map each amino acid to a coarse residue group
aa.short.2 <- tibble(
  aminoacid = c("L","F","I","M","V","W","Y","C","H","A",
                "G","P","T","S","Q","N","D","E","R","K"),
  short = c(rep("LFIMV",5), rep("other", 6), "P", rep("other", 4),
            "DE","DE","RK","RK"))

# Facet order of the residue groups
levels.anchors <- c("LFIMV", "RK", "DE", "P", "other")

# Per-protein amino acid frequencies annotated with the protein-length quartile
aa.freq.prots.q <- aa.freq.prots %>%
  merge(meta.prot %>%
          select(uniprot.id2, protein.len) %>%
          distinct() %>%
          # include.lowest = TRUE keeps the shortest protein(s) in the first
          # quartile instead of turning them into NA
          mutate(protein.len.q = cut(protein.len, quantile(protein.len),
                                     include.lowest = TRUE)))

# Fig. S6a: mean aggregated frequency of each residue group per length
# quartile, with error bars of 1.96 * SEM
pfigs6a <- aa.freq.prots.q %>%
  merge(aa.short.2, by.x = "aminoacid", by.y = "aminoacid") %>%
  # total frequency of each residue group within every protein
  group_by(short, uniprot.id2, protein.len.q) %>%
  summarise(freq = sum(freq)) %>%
  group_by(short, protein.len.q) %>%
  summarise(freq.mean = mean(freq),
            sd = sd(freq),
            sem = sd(freq)/sqrt(n())) %>%
  drop_na() %>%
  ggplot(aes(x = protein.len.q %>% as.integer,
             group = protein.len.q,
             y = freq.mean)) +
  facet_wrap(~factor(short, levels = levels.anchors), scales = "free") +
  geom_point() +
  geom_errorbar(aes(ymin=freq.mean-1.96*sem, ymax=freq.mean+1.96*sem), width=.1) +
  xlab("Protein length quartile") + ylab("Aggregated frequency of amino acids") +
  theme_pubr()

pfigs6a
```

### Re-check GO differences with Seph data 

```{r}
data.seph <- fread("data/seph_hla_ligands.txt") %>%
  merge(meta.prot, by = "uniprot.id")

data.seph.ann <- data.seph %>%
  group_by(hla, ligand.len) %>%
  group_modify(~get_annots(.x$entrez.id, .threshold = 1.01))
```

```{r Fig4a}
# Top-20 (by adjusted P) enriched GO terms predicted for HLA-A0201 and
# HLA-A1101, joined with the GO annotation of the experimental ligandomes
# (data.seph.ann) on the shared term columns. Columns suffixed `.orig` carry
# the values from the predicted data.
data.seph.ann.sel <- go.enr.1 %>%
  filter(hla %in% c("HLA-A0201", "HLA-A1101"),
         sel == 1) %>%
  group_by(hla) %>%
  filter(rank(P.DE.adj) <= 20) %>%
  mutate(fold.orig = fold, P.DE.adj.orig = P.DE.adj,
         hla.orig = hla) %>%
  ungroup %>%
  select(Term.ID, Ont, Term, fold.orig, P.DE.adj.orig, hla.orig) %>%
  unique %>%
  merge(data.seph.ann)

# Fig. 4a: for CC terms and 9-mers, the fraction of presented proteins
# annotated with each term (p = DE / total.DE) in every experimental
# ligandome, normalised by the mean of p within each (predicted allele, term)
# group. Error bars are the binomial SD of p on the same normalised scale.
pfig4a <- data.seph.ann.sel %>%
  filter(Ont == "CC", ligand.len == 9) %>%
  mutate(hla.orig = paste0(hla.orig, "-assoc. GO"),
         p = DE / total.DE, psd = sqrt(p * (1 - p) / total.DE)) %>%
  arrange(p) %>%
  group_by(hla.orig, Term) %>%
  mutate(p.norm = p / mean(p), psd.norm = psd / mean(p), delta = abs(p[1] - p[2])) %>%
  #filter(Ont == "CC", Term == "ribosome" | Term == "membrane" | Term == "nucleosome") %>%
  ggplot(aes(x = fct_reorder(Term, P.DE),
             y = p.norm, color = hla)) +
  # `width` is not a parameter of geom_pointrange()/geom_point(); it was
  # ignored with a warning, so it is dropped here
  geom_pointrange(aes(ymin = p.norm - psd.norm, ymax = p.norm + psd.norm)) +
  geom_point(aes(y = p.norm, size = DE)) +
  geom_line(aes(group = hla)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "grey30", size = 0.5) +
  scale_color_manual(name = "Experimental\n ligandome", values = c("#dfc27d", "#80cdc1")) +
  facet_wrap(~hla.orig, scales = "free") + scale_size("Number of\nproteins", breaks = c(100, 500, 1000)) +
  xlab("") + ylab("Fraction of presented proteins / \n mean for two HLAs") +
  theme_pubr() +
  theme(legend.position = "right", #c(0.1, 0.7),
        axis.text.x = element_text(angle = -45, vjust = 0.5, hjust = 0))
pfig4a
```


---

## 2. Human proteins are differentially presented by HLA ligands of different lengths

### An example A11

A0101 - bias in depleted genes for 
B0702 - another example, with bias in depleted/enriched
A0201 - no effect
B0801 - no effect
B2701 - no effect

```{r}
# Ligand enrichment statistics for HLA-A1101, computed separately for every
# ligand length: background rate P0, log2 odds, two-sided binomial P value and
# BH adjustment within each (allele, ligand length) group.
data.pred.len <- data.pred.full %>% # note some duplicates due to entrez id
  filter(hla == "HLA-A1101") %>%
  merge(meta.prot, by = "uniprot.id") %>%
  group_by(hla, ligand.len) %>%
  mutate(P0 = sum(count) / sum(protein.len - ligand.len)) %>%
  ungroup %>%
  # binom.test is not vectorised, so evaluate it one row at a time
  rowwise() %>%
  mutate(odds = log2((count + 1) / (protein.len + 1 - ligand.len) / P0),
         p.value = binom.test(count + 1, protein.len + 1 - ligand.len,
                              p = P0, alternative = "two.sided")$p.value) %>%
  group_by(hla, ligand.len) %>%
  mutate(p.value.adj = p.adjust(p.value, method = "BH")) %>%
  ungroup

#! hla.gene column isn't used further
# sel: 1 = enriched, -1 = depleted (|odds| >= 1 and adjusted P < 0.05), else 0
data.pred.len <- data.pred.len %>%
  mutate(hla.gene = substr(hla, 5, 5),
         sel = ifelse(odds >= 1 & p.value.adj < 0.05, 1,
                      ifelse(odds <= -1 & p.value.adj < 0.05, -1,
                             0)),
         # include.lowest = TRUE keeps the shortest protein(s) in the first
         # quartile instead of turning them into NA
         protein.len.q = cut(protein.len, quantile(protein.len),
                             include.lowest = TRUE))

```{r TableS2}
pst2 <- data.pred.len %>%
  filter(sel == 1) %>%
  group_by(hla, ligand.len) %>%
  summarize(enriched.genes = length(unique(entrez.id))) %>%
  merge(data.pred.len %>%
          filter(sel == -1) %>%
          group_by(hla, ligand.len) %>%
          summarize(depleted.genes = length(unique(entrez.id)))) %>%
  merge(data.pred.full %>%
          filter(hla == "HLA-A1101") %>%
          group_by(hla, ligand.len) %>%
          summarize(ligand.count = sum(count)) ) %>%
  select(-hla) %>%  
  arrange(ligand.len) %>% 
  setNames(., c("Ligand length", "Number of\n enriched genes", 
                "Number of\n depleted genes", "Ligand count")) 

ggtexttable(pst2, rows = NULL,
                  theme = ttheme("classic"))
```

```{r FigS3}
pfigs3 <- data.pred.len %>%
  ggplot(aes(x = odds, y = pmin(-log10(p.value), 20), size = count,
             color = sel %>% as.factor())) +
  geom_point()  +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey") +
  #geom_hline(yintercept = 3, linetype = "dashed", color = "grey") +
  scale_color_manual("", 
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted", "No change", "Enriched")) +
  scale_x_continuous(TeX("$log_2 (observed / expected)$"), limits = c(-3, 3)) +
  ylab(TeX("$-log_{10}(P-value)$")) +
  scale_size_continuous("Number of\npeptides", breaks = c(1, 10, 100, 1000)) +
  facet_wrap(~ligand.len, nrow = 2) +
  theme_pubclean() + 
  theme(aspect = 1, legend.position = "right")

pfigs3

pdf("figures/FigS3.pdf", height = 4, width = 7)
pfigs3
dev.off()

data.pred.len.plen <- data.pred.len %>%
  group_by(ligand.len, sel, protein.len.q) %>%
  summarize(count = n()) %>%
  group_by(ligand.len, sel) %>%
  mutate(total = sum(count)) %>%
  mutate(p = count / total) %>%
  merge(expand.grid(sel = data.pred.len$sel %>% unique,
                    ligand.len = data.pred.len$ligand.len %>% unique,
                    protein.len.q = data.pred.len$protein.len.q %>% unique), all = T) %>%
  mutate(p = ifelse(is.na(p), 0, p))

#! this figure is not included in the article
data.pred.len.plen %>%
  ggplot(aes(x = protein.len.q %>% as.integer, 
             y = count / total,
             fill = sel %>% as.factor,
             color = sel %>% as.factor)) +
  geom_errorbar(aes(ymin = p, ymax = p + sqrt(p * (1-p) / total)),
                position = "dodge") +
  geom_bar(stat = "identity", position = "dodge") +
  scale_color_manual("Direction", 
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted","No change","Enriched")) +
  scale_fill_manual("Direction", 
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted","No change","Enriched")) +
  xlab("Protein length quartile") + ylab("Fraction of peptides") +
  facet_wrap(~ligand.len, nrow = 2) +
  theme_minimal()

# Enrichment of the proteins of one `sel` class for one ligand length, using
# the unchanged (sel == 0) proteins of the same ligand length as background.
#   .sel         class to test (value compared against the `sel` column)
#   .ligand.len  ligand length (value compared against `ligand.len`)
#   data         table with ligand.len, sel and entrez.id columns
#   .threshold   forwarded to get_annots()
#   .fun         forwarded to get_annots()
# Returns the get_annots() table with `ligand.len` and `sel` set to the inputs.
get_annots2 <- function(.sel, .ligand.len, data = data.pred.len,
                        .threshold = 0.01, .fun = goana) {
  length_rows <- filter(data, ligand.len == .ligand.len)
  foreground <- filter(length_rows, sel == .sel)$entrez.id
  background <- filter(length_rows, sel == 0)$entrez.id
  annotated <- get_annots(foreground, background, .threshold, .fun)
  mutate(annotated, ligand.len = .ligand.len, sel = .sel)
}

# Run get_annots2() for both directions (enriched / depleted) at every ligand
# length present in data.pred.len and stack the resulting tables
go.enr.len <- expand.grid(sel = c(1, -1),
                          ligand.len = unique(data.pred.len$ligand.len)) %>%
  rowwise() %>%
  do(get_annots2(.$sel, .$ligand.len))

go.top.len <- go.enr.len %>%
  group_by(sel, ligand.len) %>%
  filter(rank(P.DE.adj) <= 20) %>%
  .$Term.ID %>% unique
```

```{r FigS4, fig.height=14, fig.width=10}
pfigs4 <- go.enr.len %>%
  filter(Term.ID %in% go.top.len) %>%
  mutate(Term = trunc_str(Term),
         direction = ifelse(sel > 0, "Ligand enriched", "Ligand depleted")) %>%
  ggplot(aes(x = paste0(ligand.len, "-mer") %>% factor(levels = paste0(8:12, "-mer")),
             y = fct_reorder2(paste(Ont, Term), fold, paste(sel, ligand.len)))) +
  geom_quasirandom(aes(size = fold, fill = direction), shape = 21, width = 0.2) + 
  #geom_point(aes(size = fold, fill = direction), shape = 21) + 
  #geom_text(aes(label = paste0(round(100 * DE / total.DE, 0), "%"))) +
  scale_y_discrete("", position = "right") + xlab("") +
  scale_size_continuous("GO enr. fold (log2)", breaks = c(1, 2 ,4)) +
  scale_fill_manual("", values = c("#56b4df", "#e69d00")) +
  #scale_color_distiller("-log10 Padj", palette = "Reds", direction = 1) +
  #scale_size_continuous(guide = F, breaks = c(1,2,4,6)) +
  theme_minimal() +
  theme(legend.position = "top",
        axis.text.x = element_text(angle = 90, vjust = 0.5))

pfigs4

pdf("figures/FigS4.pdf", height = 14, width = 10)
pfigs4
dev.off()
```

### Re-check ligand length bias with Seph data 

```{r}
# Enriched GO terms predicted for HLA-A1101 at each ligand length (restricted
# to the top terms in go.top.len), joined with the GO annotation of the
# experimental ligandomes (data.seph.ann). Columns suffixed `.orig` carry the
# values from the predicted data; max.fold.len is the ligand length at which
# the term reaches its highest predicted fold enrichment.
data.seph.ann.len <- go.enr.len %>%
  mutate(hla = "HLA-A1101") %>%
  filter(sel == 1) %>%
  filter(Term.ID %in% go.top.len) %>%
  group_by(Term.ID) %>%
  # which.max() always yields a single index (the first maximum), so a tie in
  # `fold` between two ligand lengths no longer breaks the grouped mutate
  mutate(max.fold.len = ligand.len[which.max(fold)]) %>%
  #group_by(hla, ligand.len) %>%
  #filter(rank(P.DE.adj) <= 20) %>%
  mutate(fold.orig = fold, P.DE.adj.orig = P.DE.adj,
         DE.orig = DE, total.DE.orig = total.DE,
         ligand.len.orig = ligand.len) %>%
  ungroup %>%
  select(Term.ID, Ont, Term,
         DE.orig, total.DE.orig,
         fold.orig, P.DE.adj.orig, hla, ligand.len.orig, max.fold.len) %>%
  unique %>%
  merge(data.seph.ann)
```

```{r Fig4b, fig.width=7, fig.height=7}
pfig4b <- data.seph.ann.len %>%
  filter(ligand.len == ligand.len.orig) %>%
  filter(Ont == "CC") %>%
  mutate(ligand.len = factor(paste0(ligand.len, "-mer"), levels = c("9-mer", "10-mer", "11-mer", "12-mer")),
         max.fold.len = factor(paste0(max.fold.len, "-mer"), levels = c("9-mer", "10-mer", "11-mer", "12-mer"))) %>%
  #filter(ligand.len %in% c(10, 11), ligand.len.orig %in% c(10, 11)) %>%
  ggplot(aes(x = DE.orig / total.DE.orig, y = DE / total.DE)) +
  geom_smooth(method = "lm", se = F, color = "grey30", linetype = "dashed", size = 0.5) +
  geom_point(aes(color = max.fold.len %>% as.factor), size = 3) +
  geom_text_repel(aes(label = Term), seed = 42, force = 5,
                  #nudge_x = 0.4, 
                  #nudge_y = 0.01, 
                  segment.alpha = 0.7) +
  facet_wrap(~ligand.len, scales = "free") +
  scale_color_manual(name = "Max enrichment for", values = c("#542788", "#998ec3", "#f1a340")) +
  scale_x_continuous("Fraction of presented proteins (synthetic)", expand = c(0.1, 0.01)) +
  scale_y_continuous("Fraction of presented proteins (experimental)", expand = c(0.1, 0.01)) +
  theme_pubr() +
  theme(legend.position = "bottom")

pfig4b
```

```{r Fig4, fig.width=8, fig.height=12}
pfig4 <- ggdraw() +
  draw_plot(pfig4a, x = 0, y = .58, width = 1, height = .42) +
  draw_plot(pfig4b, x = 0, y = 0, width = 1, height = .57) +
  draw_plot_label(label = c("A", "B"), size = 11,
                  x = c(0, 0), y = c(1, .57))
pfig4

pdf("figures/Fig4.pdf", height = 12, width = 8)
pfig4
dev.off()
```


## 3. Analysis of the extended dataset of in silico ligandomes for 93 HLA alleles

```{r}
#allele list and their frequencies were taken from Sarkizova et al., Nature Biotechnology (2020)

#see netMHCpan_output_preproc.Rmd for details about anchor residue inference

# Allele table: the code below relies on an `hla` column and on five columns
# in total (they are relabelled for display)
hla.freq <- fread("data/HLA_freq.tsv")

# Names of all alleles in the extended dataset
hla_extended <- hla.freq$hla

# One formatted text table per HLA locus (A, B, C), each holding the alleles
# whose name starts with that locus prefix
pst3.tables <- lapply(c("HLA-A", "HLA-B", "HLA-C"), function(locus) {
  hla.freq %>%
    setNames(., c("Allele", "Frequency", "P2\n anchor", "P9\n anchor", "Anchor\n group")) %>%
    filter(substring(Allele, 1, 5) == locus) %>%
    ggtexttable(rows = NULL,
                theme = ttheme("classic"))
})

pst3a <- pst3.tables[[1]]
pst3b <- pst3.tables[[2]]
pst3c <- pst3.tables[[3]]
```

```{r TableS3, fig.width=14, fig.height=12}
pst3 <- ggdraw() +
  draw_plot(pst3a, x = -.3, y = .107, width = 1, height = 1) +
  draw_plot(pst3b, x = 0, y = .007, width = 1, height = 1) +
  draw_plot(pst3c, x = .3, y = .212, width = 1, height = 1) 

pst3

pdf("figures/TableS3.pdf", height = 12, width = 14)
pst3
dev.off()
```

### Explore protein length bias for 93 HLA alleles

```{r FigS6abc}
data.pred.sel.s.all <- read_gz("data/genes_enr_extended.tsv.gz")

data.pred.sel.s.all.len <- data.pred.sel.s.all %>%
  filter(hla %in% hla_extended) %>%
  merge(aa.freq.prots.q %>% 
          select(uniprot.id2, protein.len.q) %>% distinct()) %>% 
  group_by(hla, sel, protein.len.q) %>%
  summarize(count = n()) %>%
  group_by(hla, sel) %>%
  mutate(total = sum(count)) %>%
  mutate(p = count / total) %>%
  ungroup()

pfigs6b <- data.pred.sel.s.all.len %>%
  filter(sel != 0) %>% 
  drop_na() %>% 
  ggplot(aes(x = protein.len.q %>% as.integer() %>% factor(), y = p, fill = factor(sel))) + 
  geom_boxplot(position = "dodge", outlier.shape = NaN) +
  geom_quasirandom(size = .5, dodge.width = 0.75) +
  xlab("Protein length quartile") + ylab("Fraction of HLE(D)Ps") +
  scale_fill_manual("", 
                     values = c("#56b4df", "#e69d00"),
                     labels=c("Depleted", "Enriched")) +
  theme_pubclean()

pfigs6b

pfigs6c <- data.pred.sel.s.all.len %>%
  select(hla, sel, p, protein.len.q) %>% 
  filter(sel == 1) %>%
  group_by(hla, sel) %>% 
  filter(rank(p) == max(rank(p))) %>% 
  group_by(protein.len.q, sel) %>%
  summarise(n = n(),
            fraction = n() / length(unique(data.pred.sel.s.all.len$hla))) %>% 
  ggplot(aes(x = factor(protein.len.q %>% as.integer()), y = n)) + 
  geom_bar(stat = 'identity', fill = "#e69d00") +
  theme_pubclean() +
  xlab("Protein length quantile corresponding\n to maximum of HLEPs distribution") +
  ylab("Number of alleles (out of 93)") 
  
pfigs6c
```


```{r FigS6, fig.width=8, fig.height=10}
pfigs6 <- ggdraw() +
  draw_plot(pfigs6a, x = .15, y = .65, width = .7, height = .35) +
  draw_plot(pfigs6b, x = 0, y = .23, width = .6, height = .4) +
  draw_plot(pfigs6c, x = .6, y = .23, width = .4, height = .35) +
  draw_text("Alleles with maximum of HLEPs in Q1", x = .25, y = .18, fontface = "bold") +
  draw_text("Alleles with maximum of HLEPs in Q4", x = .75, y = .18, fontface = "bold") +
  draw_image("motifs/FigS6/A3001.png", 
             x = .02, y = 0, width = .15, height = .15) +
  draw_image("motifs/FigS6/A3101.png", 
             x = .17, y = 0, width = .15, height = .15) +
  draw_image("motifs/FigS6/A3303.png", 
             x = .32, y = 0, width = .15, height = .15) +
  draw_image("motifs/FigS6/B4402.png", 
             x = .6, y = 0, width = .15, height = .15) +
  draw_image("motifs/FigS6/B4403.png", 
             x = .75, y = 0, width = .15, height = .15) +
  draw_text("HLA-A3001", x = .1, y = .14) +
  draw_text("HLA-A3101", x = .25, y = .14) +
  draw_text("HLA-A3303", x = .4, y = .14) +
  draw_text("HLA-B4402", x = .68, y = .14) +
  draw_text("HLA-B4403", x = .83, y = .14) +
  draw_plot_label(label = c("A", "B", "C", "D", "E"), size = 11,
                  x = c(.1, 0, .6, 0, .5), y = c(1, .6, .6, .22, .22)) 

pfigs6

pdf("figures/FigS6.pdf", height = 10, width = 8)
pfigs6
dev.off()
```


### GO enrichment for 93 HLA alleles 

```{r}
# The GO enrichment analysis was performed analogously to section 1; its
# result is read here from a precomputed table instead of being recomputed.
go.enr.all <- fread("data/GO_enr_extended.tsv")
```


Which terms are most frequently depleted? 
In which alleles are they not depleted?
```{r FigS5abc}
# GO terms depleted in the majority of alleles: count, per term and
# direction, the alleles (restricted by the merge to those in hla.freq)
# in which the term is reported, then keep the 20 most widespread
# depleted (sel == -1) terms.
go.depl.all <- go.enr.all %>%
  select(sel, hla, Term.ID, Term, Ont) %>%
  merge(hla.freq) %>%
  group_by(sel, Term.ID, Term, Ont) %>%
  summarise(
    n.allele = n(),
    frac.allele = n.allele / length(hla_extended)
  ) %>%
  ungroup() %>%
  filter(sel == -1) %>%
  slice_max(order_by = n.allele, n = 20)

# Panel A of Fig. S5: the same terms rendered as a text table, with the
# allele count shown as "n (percent%)".
pfigs5a <- go.depl.all %>%
  mutate(
    Alleles = paste0(n.allele, " (", round(frac.allele * 100, 0), "%)")
  ) %>%
  select(Ontology = Ont, Term, Alleles) %>%
  ggtexttable(
    rows = NULL,
    theme = ttheme("classic", base_size = 9, padding = unit(c(2, 2), "mm"))
  )

# Alleles-exceptions: alleles for which fewer than a third (relative to the
# best-covered allele) of the widely depleted terms show up in go.enr.all.
# NOTE(review): rows are counted regardless of `sel`, so a term that is
# enriched for an allele also counts here; confirm this is intended.
go.depl.all.except <- go.enr.all %>%
  filter(hla %in% hla_extended & Term.ID %in% go.depl.all$Term.ID) %>%
  group_by(hla) %>%
  summarise(n = n()) %>%
  merge(hla.freq) %>%
  filter(n < 0.33 * max(n)) %>%
  arrange(n, -Frequency)

# Genes differentially depleted in alleles-exceptions and in other alleles.
# For every protein, sum `sel` over the alleles of each group, spread the
# two groups into columns, and keep proteins that are not net-depleted in
# the exceptions (sum >= 0) but are depleted in at least half of
# hla_extended worth of the other alleles (sum <= -0.5 * length(hla_extended)).
go.depl.all.genes <- data.pred.sel.s.all %>% 
  filter(hla %in% hla_extended) %>% 
  mutate(exception = ifelse(hla %in% go.depl.all.except$hla, "exceptions", "others")) %>% 
  select(uniprot.id2, hla, sel, exception) %>% 
  distinct() %>% 
  group_by(uniprot.id2, exception) %>% 
  summarise(sel.sum = sum(sel)) %>% 
  # name the value column explicitly instead of letting dcast() guess it
  dcast(uniprot.id2 ~ exception, value.var = "sel.sum") %>% 
  filter(exceptions >= 0,
         others <= -0.5*length(hla_extended))

# Mean amino-acid frequency (with standard error of the mean) for the
# selected proteins (sel.go = -1) versus every protein in aa.freq.prots
# (sel.go = 0).
go.depl.all.genes.aafreq <- go.depl.all.genes %>%
  mutate(sel.go = -1) %>%
  merge(aa.freq.prots) %>%
  bind_rows(mutate(aa.freq.prots, sel.go = 0)) %>%
  group_by(sel.go, aminoacid) %>%
  summarise(
    freq.mean = mean(freq),
    sem = sd(freq) / sqrt(n())
  )

# Panel C of Fig. S5: both profiles as lines with mean +/- 1.96 * SEM bars.
pfigs5c <- go.depl.all.genes.aafreq %>%
  mutate(sel.go = factor(sel.go)) %>%
  ggplot(aes(
    x = factor(aminoacid, levels = aa.levels),
    color = sel.go,
    group = sel.go
  )) +
  geom_line(aes(y = freq.mean)) +
  geom_errorbar(
    aes(ymin = freq.mean - 1.96 * sem, ymax = freq.mean + 1.96 * sem),
    width = .1
  ) +
  scale_color_manual(
    "",
    values = c("#56b4df", "#000000"),
    labels = c("Proteins for GO terms depleted\n in the majority of HLAs", "Human proteome")
  ) +
  xlab("") +
  ylab("Frequency of amino acid") +
  coord_cartesian(ylim = c(0.005, 0.15)) +
  theme_pubclean() +
  theme(legend.position = "bottom", legend.box = "vertical")

pfigs5c
```

```{r FigS5, fig.width=9, fig.height=9}
# Assemble Figure S5 on a cowplot canvas (coordinates are canvas fractions).
# Layers are drawn in the order added; several images and labels overlap,
# so the order of the calls below matters.
pfigs5 <- ggdraw() +
  # panel A (table) and panel C (amino-acid frequency plot)
  draw_plot(pfigs5a, x = .15, y = .60, width = .7, height = .35) +
  draw_plot(pfigs5c, x = .4, y = .1, width = .6, height = .4) +
  # panel B: heading plus a grid of PNG files read from motifs/FigS5b/
  draw_text("Alleles-exceptions", x = .2, y = .53, fontface = "bold") +
  draw_image("motifs/FigS5b/B0702.png", 
             x = .02, y = .36, width = .15, height = .15) +
  draw_image("motifs/FigS5b/B5401.png", 
             x = .2, y = .36, width = .15, height = .15) +
  draw_image("motifs/FigS5b/B5601.png", 
             x = .02, y = .24, width = .15, height = .15) +
  draw_image("motifs/FigS5b/B4201.png", 
             x = .2, y = .24, width = .15, height = .15) +
  draw_image("motifs/FigS5b/B5502.png", 
             x = .02, y = .12, width = .15, height = .15) +
  draw_image("motifs/FigS5b/B3503.png", 
             x = .2, y = .12, width = .15, height = .15) +
  draw_image("motifs/FigS5b/B5501.png", 
             x = .1, y = 0, width = .15, height = .15) +
  # allele name for each image
  draw_text("HLA-B0702", x = .12, y = .49, size = 11) +
  draw_text("HLA-B5401", x = .3, y = .49, size = 11) +
  draw_text("HLA-B5601", x = .12, y = .37, size = 11) +
  draw_text("HLA-B4201", x = .3, y = .37, size = 11) +
  draw_text("HLA-B5502", x = .12, y = .25, size = 11) +
  draw_text("HLA-B3503", x = .3, y = .25, size = 11) +
  draw_text("HLA-B5501", x = .2, y = .13, size = 11) +
  # panel letters
  draw_plot_label(label = c("A", "B", "C"), size = 11,
                  x = c(.2, 0, .4), y = c(1, .53, .53)) 
pfigs5

# Write the same figure to figures/FigS5.pdf; the directory must already exist.
pdf("figures/FigS5.pdf", height = 9, width = 9)
pfigs5
dev.off()
```

---

## 4. Real displayed proteins from HLA ligand atlas

```{r}
# Load the peptide/allele/tissue table and join it to the peptide-to-protein
# map. The join may legitimately multiply rows (allow.cartesian), so the
# result is reduced to distinct (peptide, class, allele, protein) records
# before protein metadata from meta.prot is attached by UniProt ID.
data.atlas <- fread("data/HLA_peptide_allele_tissue.tsv") %>%
  merge(fread("data/HLA_protein_map.tsv"), allow.cartesian = TRUE) %>%
  select(peptide.seq = peptide_sequence,
         hla.class = hla_type_class,
         hla = hla_type_name,
         uniprot.id = uniprot_id) %>%
  unique() %>%
  merge(meta.prot, by = "uniprot.id")
```

```{r}
# Run get_annots() once per (class, allele): the unique Entrez IDs observed
# for that allele are passed first, all unique Entrez IDs of meta.prot second.
data.atlas.ann <- data.atlas %>%
  group_by(hla.class, hla) %>%
  do(get_annots(unique(.$entrez.id), unique(meta.prot$entrez.id)))
```

```{r}
# Build a GO term x allele matrix of enrichment folds (0 where a term is
# not reported for an allele).
mat.atlas.ann <- data.atlas.ann %>%
  ungroup() %>%
  select(hla, Term.ID, fold) %>%
  dcast(Term.ID ~ hla, value.var = "fold", fill = 0)

rownames(mat.atlas.ann) <- mat.atlas.ann$Term.ID
mat.atlas.ann$Term.ID <- NULL
mat.atlas.ann <- as.matrix(mat.atlas.ann)

# PCA with GO terms as observations and alleles as (scaled) variables, so
# the rotation matrix holds one row of loadings per allele.
pca.atlas <- prcomp(mat.atlas.ann, scale. = TRUE)

df.pca.atlas <- pca.atlas$rotation %>% as.matrix %>% as.data.frame
df.pca.atlas$hla <- rownames(df.pca.atlas)
# Allele names starting with "D" are treated as class II; the gene label is
# the first character for class I and the first two for class II.
df.pca.atlas <- df.pca.atlas %>%
  mutate(hla.class = ifelse(substr(hla, 1, 1) == "D", "class2", "class1"),
         hla.gene = ifelse(hla.class == "class1", 
                           substr(hla, 1, 1),
                           substr(hla, 1, 2)))
```

```{r Fig5a, fig.height=8, fig.width=8}
# Fig. 5A: alleles in the PC1/PC2 loading space, coloured by HLA gene, with
# 2D density contours per gene and a repelled text label per allele.
pfig5a <- df.pca.atlas %>%
  ggplot(aes(x = PC1, y = PC2, color = hla.gene)) +
  geom_density_2d(bins = 4) +
  geom_point() +
  geom_text_repel(aes(label = hla), cex = 2.2, color = "black", segment.alpha = 0.3) +
  theme_minimal() + xlab("PC1") + ylab("PC2") +
  # five colours are supplied, so at most five distinct hla.gene values can
  # be drawn -- TODO confirm against the alleles present in the atlas data
  scale_color_manual("HLA gene", 
                     values = c("#0071b2", "#56b3e9", "#009e74", "#d55c00", "#cc79a7")) +
  # theme_pubr() is added after theme_minimal(), followed by a legend tweak
  theme_pubr() +
  theme(legend.position = "right")
pfig5a
```

Distances between genes

```{r Fig5b}
# Pairwise distances between allele GO-enrichment profiles (columns of
# mat.atlas.ann), in long format without the diagonal. gene1/gene2 are the
# first two characters of the allele name part that precedes "*".
dist.atlas.ann <- mat.atlas.ann %>%
  t() %>%
  dist() %>% # dist() default method; (method = "manhattan") is disabled
  as.matrix() %>%
  melt() %>%
  filter(Var1 != Var2) %>%
  mutate(
    gene1 = substr(str_split_fixed(Var1, fixed("*"), 2)[, 1], 1, 2),
    gene2 = substr(str_split_fixed(Var2, fixed("*"), 2)[, 1], 1, 2)
  )

# Fig. 5B: distance distributions (divided by the number of GO terms) for
# every gene1/gene2 combination, as dodged violins with box plots.
pfig5b <- dist.atlas.ann %>%
  mutate(value = value / nrow(mat.atlas.ann)) %>%
  ggplot(aes(x = gene1, y = value)) +
  geom_violin(
    aes(fill = gene2, color = gene2),
    position = position_dodge(width = 0.65)
  ) +
  geom_boxplot(
    aes(color = gene2),
    width = 0.2,
    position = position_dodge(width = 0.65),
    outlier.colour = NA
  ) +
  theme_minimal() +
  xlab("") +
  ylab("Difference between GO enrichment profiles") +
  scale_fill_manual(
    "HLA gene",
    values = c("#0071b2", "#56b3e9", "#009e74", "#d55c00", "#cc79a7")
  ) +
  scale_color_manual(
    "HLA gene",
    values = c("#0071b2", "#56b3e9", "#009e74", "#d55c00", "#cc79a7")
  ) +
  coord_flip() +
  theme_pubr() +
  theme(legend.position = "none")
pfig5b
```

```{r Fig5c}
# PC2 score of every GO term. as_tibble() drops the matrix row names, so
# the term IDs are re-attached from pca.atlas$x afterwards.
df.hla12.go <- pca.atlas$x %>% as_tibble %>%
  select(PC2)
df.hla12.go$Term.ID <- rownames(pca.atlas$x)

# Keep the 10 highest- and 10 lowest-scoring terms and attach their
# ontology and description.
df.hla12.go.2 <- df.hla12.go %>%
  melt %>%
  group_by(variable) %>%
  filter(rank(-value, ties.method = "first") <= 10 | rank(value, ties.method = "first") <= 10) %>%
  merge(data.atlas.ann %>% ungroup %>% select(Term.ID, Ont, Term) %>% unique)

# Fig. 5C: lollipop chart of the selected terms ordered by PC2 score.
# NOTE(review): the sign of a principal component is arbitrary, so the
# "class I"/"class II" direction labels assume the orientation obtained
# with the current data -- confirm after any data or package update.
pfig5c <- df.hla12.go.2 %>%
  mutate(direction = ifelse(value > 0, "More in class II", "More in class I")) %>%
  ggplot(aes(x = paste(Ont, Term) %>% fct_reorder(value), y = value, 
             fill = direction, color = direction)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey30") +
  geom_bar(stat = "identity", width = 0.1) +
  geom_point(size = 5) +
  coord_flip() +
  #geom_text(aes(label = paste0(round(100 * DE / total.DE, 0), "%"))) +
  # values outside the fixed limits below are not drawn
  scale_x_discrete("", position = "top") + scale_y_continuous("PC2 weight", 
                                                              limits = c(-22, 22)) +
  scale_fill_manual("", values = c("#009e74", "#cc79a7")) +
  scale_color_manual("", values = c("#009e74", "#cc79a7")) +
  #scale_color_distiller("-log10 Padj", palette = "Reds", direction = 1) +
  theme_pubclean() +
  theme(legend.position = "bottom")
  #scale_fill_distiller(palette = "Spectral", limits = c(-32, 32))
pfig5c
```

```{r Fig5, fig.height=10, fig.width=11}
# Assemble Figure 5: panels A and B side by side on top, panel C across
# the bottom (coordinates are fractions of the canvas).
pfig5 <- ggdraw() +
  draw_plot(pfig5a, x = 0, y = .38, width = .64, height = .62) +
  draw_plot(pfig5b, x = .64, y = .38, width = .36, height = .62) +
  draw_plot(pfig5c, x = 0, y = 0, width = 1, height = .37) +
  draw_plot_label(label = c("A", "B", "C"), size = 11,
                  x = c(0, .63, 0), y = c(1, 1, .38))
pfig5

# Write the same figure to figures/Fig5.pdf; the directory must already exist.
pdf("figures/Fig5.pdf", height = 10, width = 11)
pfig5
dev.off()
```


## 5. Protective alleles & viral proteins

### Case of HIV protective alleles

```{r}
# Contrast the enriched (sel == 1) proteins of HLA-B5701 and HLA-B0801 in
# both directions with get_annots(); the B5701-first call is tagged
# sel = 1, the B0801-first call sel = -1. For each term only the row(s)
# with the largest fold are kept.
go.enr.hiv <- rbind(
  get_annots(
    filter(data.pred.sel.s, sel == 1, hla == "HLA-B5701")$entrez.id,
    filter(data.pred.sel.s, sel == 1, hla == "HLA-B0801")$entrez.id
  ) %>%
    mutate(sel = 1),
  get_annots(
    filter(data.pred.sel.s, sel == 1, hla == "HLA-B0801")$entrez.id,
    filter(data.pred.sel.s, sel == 1, hla == "HLA-B5701")$entrez.id
  ) %>%
    mutate(sel = -1)
) %>%
  group_by(Term.ID) %>%
  filter(fold == max(fold))
```

Via UniProt / QuickGO https://www.ebi.ac.uk/QuickGO/annotations?geneProductId=P59595

```{r}
# One row per GO term, with the HIV genes annotated to that term collapsed
# into a sorted, comma-separated string.
go.hiv.prot <- fread("data/hiv_go_terms.txt") %>%
  select(-hiv.go.tmp) %>%
  unique() %>%
  group_by(Term.ID) %>%
  summarize(hiv.gene = paste(sort(hiv.gene), collapse = ","))
```

```{r FigS7, fig.width=8, fig.height=10}
# Printed summary: total of column N, per direction, over the terms that
# also appear in the HIV GO term table.
go.enr.hiv %>%
  merge(go.hiv.prot) %>%
  group_by(sel) %>%
  summarize(count = sum(N))

# Fig. S7: signed enrichment fold of every term in go.enr.hiv. Terms that
# are also annotated to HIV genes carry the gene names as text; the left
# join leaves hiv.gene as NA for all other terms.
pfigs7 <- go.enr.hiv %>%
  merge(go.hiv.prot, all.x = TRUE) %>%
  #filter(Term.ID %in% go.top.1) %>%
  #mutate(Term = trunc_str(Term)) %>%
  mutate(direction = ifelse(sel > 0, "More in B5701", "More in B0801")) %>%
  ggplot(aes(y = fold * sel,
             x = fct_reorder(paste(Ont, Term), fold * sel))) +
  geom_bar(aes(fill = direction), stat = "identity") + #, position = "dodge") + 
  geom_text(aes(label = hiv.gene), hjust = 0) +
  geom_hline(yintercept = 0, color = "grey", linetype = "dashed") +
  coord_flip() +
  #geom_text(aes(label = paste0(round(100 * DE / total.DE, 0), "%"))) +
  scale_x_discrete("", position = "top") + ylab("Log2 GO enrichment fold") +
  scale_fill_manual("Direction", values = c("#56b4df", "#e69d00")) +
  #scale_color_distiller("-log10 Padj", palette = "Reds", direction = 1) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("HIV protein associated terms")

pfigs7

# Write the same figure to figures/FigS7.pdf; the directory must already exist.
pdf("figures/FigS7.pdf", height = 10, width = 8)
pfigs7
dev.off()
```

... The fraction of HIV-derived peptides should be greater


### Analysis of presentation of viral genes by 12 alleles

```{r}
# Binding predictions for viral proteins. HLA-B57:05 entries are dropped,
# and the UniProt accession is the second "_"-separated field of ID.
data.viruses <- fread("data/cov_hiv_flu_netmhcpan.txt") %>%
  filter(hla != "HLA-B57:05") %>%
  mutate(uniprot.id = str_split_fixed(ID, "_", 3)[, 2])

# Viral protein sequences as a named character vector (names = FASTA headers).
virus_prots <- as.character(readAAStringSet("data/cov_hiv_flu_prot.fasta"))

# Per-protein metadata parsed from the FASTA headers, split on "|" or " ":
# field 2 is the accession, field 3 the "<name>_<species>" entry name.
meta.virusprot <- data.table(
  uniprot.id = str_split_fixed(names(virus_prots), "[\\| ]", 4)[, 2],
  uniprot.id2 = str_split_fixed(names(virus_prots), "[\\| ]", 4)[, 3],
  protein.len = nchar(virus_prots)
) %>%
  mutate(
    name = str_split_fixed(uniprot.id2, "_", 2)[, 1],
    species.long = str_split_fixed(uniprot.id2, "_", 2)[, 2],
    # short display labels; species codes not listed here become NA
    species = case_when(
      species.long == "I34A1" ~ "FluA",
      species.long == "HV1H2" ~ "HIV1",
      species.long == "CVHSA" ~ "CoV1",
      species.long == "SARS2" ~ "CoV2",
      species.long == "EBOZ5" ~ "Ebol"
    )
  )

# Attach the metadata to the predictions (joined on shared column names).
data.viruses <- merge(data.viruses, meta.virusprot)
```

```{r}
# Number of distinct predicted peptides per allele and viral protein,
# converted to a log2 observed/expected score.
data.viruses.s <- data.viruses %>%
  group_by(hla, species, name) %>%
  summarize(count = length(unique(Peptide))) %>%
  ungroup %>%
  # The inner merge() has no shared columns and therefore yields the full
  # (species, name) x hla grid; the outer right join keeps grid cells with
  # no peptide at all (count is NA for those until the next step).
  merge(merge(data.viruses %>% select(species, name) %>% unique %>% as.data.frame,
              data.frame(hla = data.viruses %>% .$hla %>% unique)) %>% mutate(count.x = 1),
        all.y = TRUE) %>%
  mutate(count = ifelse(is.na(count), 0, count)) %>%
  # pseudocount of 1 so that empty cells have a finite log
  mutate(count = count + 1) %>%
  mutate(total = sum(count)) %>%
  group_by(hla) %>%
  mutate(total.hla = sum(count)) %>%
  group_by(species, name) %>%
  mutate(total.prot = sum(count)) %>%
  ungroup %>%
  # observed count relative to the product of the allele and protein margins
  mutate(odds = log2(total * count / total.hla / total.prot))
```

```{r}
# Wide table: one row per viral protein, one column of scores per allele.
mat.viruses.s <- dcast(data.viruses.s, species + name ~ hla, value.var = "odds")

unique(mat.viruses.s$species)

# Numeric matrix version with "<species>_<name>" row names.
mat.viruses.s1 <- as.matrix(select(mat.viruses.s, -species, -name))
rownames(mat.viruses.s1) <- paste0(mat.viruses.s$species, "_", mat.viruses.s$name)

# Annotation colour per species label.
spec_col <- c(
  FluA = "#8dd3c7",
  HIV1 = "#ffffb3",
  CoV1 = "#80b1d3",
  CoV2 = "#bc80bd",
  Ebol = "#fb8072"
)
```

```{r Fig6a, fig.width=8, fig.height=10}
# Fig. 6A: heatmap of the protein x allele score matrix, with a text row
# annotation ("<species>|<name>") whose background is the species colour.
hm <- Heatmap(
  mat.viruses.s1,
  name = "Presentation odds, log",
  col = colorRamp2(c(-2, 0, 2), c("#2b83ba", "#ffffbf", "#d7191c")),
  width = unit(7, "cm"),
  height = unit(22, "cm"),
  heatmap_legend_param = list(legend_direction = "horizontal")
) +
  rowAnnotation(
    nm = anno_text(
      paste0(mat.viruses.s$species, "|", mat.viruses.s$name),
      gp = gpar(
        fill = spec_col[mat.viruses.s$species],
        fontfamily = "mono",
        col = "black",
        border = "white"
      )
    )
  )

# Capture the drawn heatmap as a grob so it can be placed on the composite
# figure, then show it on a fresh page.
pfig6a <- grid::grid.grabExpr(draw(hm, heatmap_legend_side = "bottom"))
grid.newpage()
grid.draw(pfig6a)
```

```{r Fig6b}
# Pearson correlation test between the viral presentation profiles of every
# unordered pair of alleles (hla.1 > hla.2 keeps each pair once). p-values
# are adjusted with the p.adjust() default method.
all.hla.t <- expand.grid(hla.1 = data.viruses.s$hla %>% unique, 
                         hla.2 = data.viruses.s$hla %>% unique) %>%
  mutate(hla.1 = as.character(hla.1),
         hla.2 = as.character(hla.2)) %>%
  filter(hla.1 > hla.2) %>%
  group_by(hla.1, hla.2) %>%
  do(cor.test(mat.viruses.s[, .$hla.1], mat.viruses.s[, .$hla.2]) %>% tidy) %>%
  ungroup %>%
  mutate(p.adj = p.adjust(p.value)) %>%
  arrange(p.value)

# Fig. 6B: observed t-statistics against 20000 random draws from a
# t-distribution with the degrees of freedom of the first test.
# NOTE(review): rt() is not seeded here, so the grey density differs
# slightly between runs.
pfig6b <- tibble(hla.1 = "none", hla.2 = "none",
       type = "Theor.",
       statistic = rt(20000, all.hla.t$parameter[1])) %>%
  rbind(all.hla.t %>%
          mutate(type = "Obs.") %>%
          select(hla.1, hla.2, type, statistic)) %>%
  mutate(type = factor(type, levels = c("Theor.", "Obs."))) %>%
  ggplot(aes(y = type, x = statistic, fill = type)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey30") +
  stat_halfeye(adjust = 2, alpha = 0.8) +
  #geom_violin(aes(fill = type, color = type), trim = F, adjust = 2) +
  #geom_boxplot(aes(color = type), width = 0.1, outlier.color = NA) +
  scale_x_continuous("T-statistic\nfor Pearson R\n\n", breaks = c(-6,-3,0,3,6)) + ylab("") +
  # guide = "none" replaces the deprecated logical form guide = F
  scale_fill_manual(guide = "none", values = c("#878787", "#d6604d")) +
  #scale_color_manual(guide = F, values = c("grey", "grey30")) +
  theme_pubr()

pfig6b
```

```{r Fig6c}
# Fig. 6C: per-protein scores of three alleles drawn as profiles over the
# viral proteins. The allele label is shortened to characters 5-7 of its
# name (e.g. "A11").
pfig6c <- data.viruses.s %>%
  filter(hla %in% c("HLA-A11:01", "HLA-A02:01", "HLA-B27:05")) %>%
  mutate(hla = substr(hla, 5, 7)) %>%
  arrange(hla, name) %>%
  ggplot(aes(x = paste(species, name) %>% fct_reorder2(odds, hla),
             y = odds, color = hla, group = hla)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey30") +
  geom_line() +
  geom_point() +
  coord_flip() +
  #scale_x_discrete("", position = "top") +
  #ylab("") +
  # values outside the fixed limits below are not drawn
  xlab("Viral proteins") + scale_y_continuous("log odds", limits = c(-2.2, 2.2)) +
  #facet_grid(species~., space = "free", scales = "free") +
  scale_color_manual("", values = c("#dfc27d", "#80cdc1", "#018571")) +
  #theme_pubclean() +
  #scale_linetype("") +
  theme_pubr() +
  # protein names are hidden: axis text and ticks are blanked
  theme(legend.position = "bottom",
        legend.box = "vertical",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

pfig6c

# Printed correlation tests between the A11:01 profile and the other two.
cor.test(mat.viruses.s$`HLA-A11:01`, mat.viruses.s$`HLA-A02:01`)
cor.test(mat.viruses.s$`HLA-A11:01`, mat.viruses.s$`HLA-B27:05`)
```

```{r Fig6e}
# GO term x allele matrix of enrichment folds for enriched (sel == 1)
# terms; duplicates are averaged and missing cells are 0.
mat.go.enr.1 <- go.enr.1 %>%
  ungroup %>%
  filter(sel == 1) %>%
  select(hla, Term.ID, fold) %>%
  dcast(Term.ID ~ hla, value.var = "fold", fun.aggregate = mean, fill = 0)

rownames(mat.go.enr.1) <- mat.go.enr.1$Term.ID
mat.go.enr.1$Term.ID <- NULL
mat.go.enr.1 <- as.matrix(mat.go.enr.1)

# For every ordered pair of alleles, pair up the distance between their GO
# enrichment profiles (dist.l) and between their viral presentation
# profiles (dist.v). Both tables are keyed by characters 5-7 of the allele
# name (e.g. "A02"), which makes the two naming schemes joinable.
dist.viruses.ann <- merge(mat.go.enr.1 %>% t %>%
        dist() %>%
        #dist(method = "manhattan") %>%
        as.matrix %>% 
        melt %>%
        filter(Var1 != Var2) %>%
        mutate(dist.l = value  / nrow(mat.go.enr.1)) %>%
        mutate(Var1 = substr(Var1, 5, 7),
               Var2 = substr(Var2, 5, 7)) %>%
        select(-value),
      mat.viruses.s1 %>% t %>% 
        dist() %>%
        #dist(method = "manhattan") %>%
        as.matrix %>% 
        melt %>%
        filter(Var1 != Var2) %>%
        mutate(Var1 = gsub(":", "", Var1, fixed = TRUE),
               Var2 = gsub(":", "", Var2, fixed = TRUE),
               dist.v = value / nrow(mat.viruses.s1))  %>%
        mutate(Var1 = substr(Var1, 5, 7),
               Var2 = substr(Var2, 5, 7)) %>%
        select(-value))

# Overall correlation, using each unordered pair once.
with(dist.viruses.ann %>% filter(Var1 > Var2), cor.test(dist.l, dist.v))

# Correlation per reference allele (Var1).
dist.viruses.ann.stat <- dist.viruses.ann %>%
  group_by(Var1) %>%
  group_modify(~cor.test(.x$dist.l, .x$dist.v) %>% tidy)

# Fig. 6E: one facet per reference allele, one labelled point per partner.
pfig6e <- dist.viruses.ann %>%
  merge(dist.viruses.ann.stat) %>%
  mutate(Var1.1 = paste0(Var1, " R=", round(estimate, 2))) %>%
  ggplot(aes(x = dist.l, y = dist.v)) +
  geom_point() +
  # geom_point(aes(color = Var1)) +
  geom_smooth(method = "lm", linetype = "dashed", color = "grey30") +
  # Var2 already holds the 3-character allele label; applying
  # substr(Var2, 5, 7) to it again returned "" and blanked every label.
  geom_text_repel(aes(label = Var2)) +
  scale_x_continuous("GO enrichment profile distance (human)", n.breaks = 4) + 
  scale_y_continuous("Viral presentation profile distance", n.breaks = 4) +
  #scale_color_brewer("HLA", palette = "Paired") +
  facet_wrap(~Var1.1, scales = "free", ncol = 4) +
  theme_pubr() +
  theme(legend.position = "bottom",
        axis.text = element_blank(),
        axis.ticks = element_blank())
        #axis.text.x = element_blank(),
        #axis.text.y = element_blank())

pfig6e
```

```{r Fig6d}
 pfig6d <- data.viruses.s %>%
  filter(species %in% c("CoV1", "CoV2")) %>%
  ggplot(aes(x = substr(hla, 5, 10) %>% fct_reorder(odds), y = odds, 
             color = species,
             group = species)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey40") +
  geom_line() +
  geom_point() +
  geom_point(data = tibble(species = "CoV1", hla = "HLA-A02:01", odds = -0.79, name = "R1A"), color = NA) +
 # coord_flip() +
  #scale_x_discrete("", position = "top") +
  #scale_y_continuous("log odds", n.breaks = 5) +
  scale_y_continuous("log odds", ) +
  xlab("HLA allele") +
  facet_wrap(~name, scales = "free_y") +
  #facet_grid(species~., space = "free", scales = "free") +
  scale_color_manual("", values = c("#80b1d3", "#bc80bd")) +
  #theme_pubclean() +
  theme_pubr() +
  theme(legend.position = "bottom",
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) #element_text(angle = 90, vjust = 0.5))

pfig6d

#t.test(mat.viruses.s$`HLA-A01:01`, mat.viruses.s$`HLA-A02:01`, paired = T)
```

```{r Fig6, fig.width=10, fig.height=12}
# Assemble Figure 6: the heatmap grob (A) fills the left column; B and C
# share the top of the right column, with D and E stacked below them
# (coordinates are fractions of the canvas).
pfig6 <- ggdraw() +
  draw_plot(pfig6a, x = 0, y = 0, width = .45, height = 1) +
  draw_plot(pfig6b, x = .46, y = .7, width = .27, height = .3) +
  draw_plot(pfig6c, x = .73, y = .7, width = .27, height = .3) +
  draw_plot(pfig6d, x = .46, y = .35, width = .54, height = .35) +
  draw_plot(pfig6e, x = .46, y = 0, width = .54, height = .35) +
  draw_plot_label(label = c("A", "B", "C", "D", "E"), size = 11,
                  x = c(0, .45, .72, .45, .45), y = c(1, 1, 1, .7, .35))
pfig6

# Write the same figure to figures/Fig6.pdf; the directory must already exist.
pdf("figures/Fig6.pdf", height = 12, width = 10)
pfig6
dev.off()
```


## 6. Compensation of HLA presentation bias in haplotypes

Populations included in the analysis
```{r TableS4}
# Haplotype table, restricted to haplotypes with frequency above 0.01.
haplotypes <- fread("data/haplotypes.tsv") %>%
  filter(Frequency > 0.01)

# Table S4: number of distinct A/B/C haplotypes retained per population,
# largest first, with display-ready column names.
pst4 <- haplotypes %>%
  select(Population, Sample.Size, HLA.A, HLA.B, HLA.C) %>%
  distinct() %>%
  group_by(Population, Sample.Size) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  setNames(c("Population", "Sample size", "Number of\n haplotypes"))

# Rendered table (printed, not stored) ...
ggtexttable(
  pst4,
  rows = NULL,
  theme = ttheme("classic", base_size = 8, padding = unit(c(2, 2), "mm"))
)

# ... followed by the underlying data frame.
pst4
```

Compute distances between GO enrichment profiles for all pairs of HLA alleles

```{r}
# Distance between the signed GO-enrichment profiles (fold * sel) of every
# ordered pair of alleles, divided by the number of distinct GO terms.
go.dist <- go.enr.all %>%
  ungroup() %>%
  mutate(fold.sign = fold * sel) %>%
  select(hla, Term.ID, fold.sign) %>%
  # term x allele table; duplicates are averaged, missing cells are 0
  dcast(Term.ID ~ hla, value.var = "fold.sign", fun.aggregate = mean, fill = 0) %>%
  select(-Term.ID) %>%
  t() %>%
  dist() %>%
  as.matrix() %>%
  melt() %>%
  filter(Var1 != Var2) %>%
  mutate(dist.l = value / length(unique(go.enr.all$Term.ID))) %>%
  select(MHC.1 = Var1, MHC.2 = Var2, dist.l)
```

Distance between alleles within frequent haplotypes

```{r Fig7}
# Allele pairs that co-occur within a haplotype: every A/B, A/C and B/C
# combination, with frequencies summed per population and pair.
haplotypes.a <- haplotypes %>% 
  select(Population, Frequency, MHC.1 = HLA.A, MHC.2 = HLA.B) %>%
  rbind(haplotypes %>% 
          select(Population, Frequency, MHC.1 = HLA.A, MHC.2 = HLA.C)) %>%
  rbind(haplotypes %>% 
          select(Population, Frequency, MHC.1 = HLA.B, MHC.2 = HLA.C)) %>%
  group_by(Population, MHC.1, MHC.2) %>%
  summarise(Frequency = sum(Frequency)) %>%
  mutate(haplotype = 1) %>% 
  ungroup()

# Control pairs: all combinations of the observed alleles whose first five
# characters differ and are in ascending order, minus every pair that
# matches an observed haplotype pair on the first seven characters.
haplotypes.control <- c(haplotypes.a$MHC.1, haplotypes.a$MHC.2) %>% 
  unique() %>% 
  expand.grid(MHC.1 = ., MHC.2 = .) %>% 
  filter(substr(MHC.1, 1, 5) < substr(MHC.2, 1, 5)) %>% 
  mutate(MHC.1.short = substr(MHC.1, 1, 7),
         MHC.2.short = substr(MHC.2, 1, 7)) %>%   
  merge(haplotypes.a %>% 
          mutate(MHC.1.short = substr(MHC.1, 1, 7),
                 MHC.2.short = substr(MHC.2, 1, 7)) %>% 
          select(MHC.1.short, MHC.2.short, haplotype), all = TRUE) %>% 
  mutate(haplotype = replace_na(haplotype, 0)) %>% 
  filter(haplotype == 0) %>% 
  select(MHC.1, MHC.2, haplotype)

# Attach the GO-profile distance to haplotype and control pairs, keeping a
# single row per allele pair. `genes` names the pair by the first five
# characters of each allele.
haplotypes.dist <- haplotypes.a %>% 
  bind_rows(haplotypes.control) %>% 
  merge(go.dist, all.x = TRUE) %>% 
  mutate(genes = paste(substr(MHC.1,1,5), substr(MHC.2,1,5), sep = "/")) %>%
  distinct(MHC.1, MHC.2, .keep_all = TRUE) %>% 
  mutate(haplotype = ifelse(haplotype == 1, "Haplotype", "Control") %>% 
           factor(levels = c("Haplotype", "Control"))) 

# Wilcoxon test of haplotype vs control distances per gene pair, with
# Bonferroni-adjusted p-values and bracket positions for the plot.
stat.test <- haplotypes.dist %>%
  group_by(genes) %>%
  wilcox_test(dist.l ~ haplotype) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance() %>% 
  add_y_position(y.trans = function(x) x + 0.005)

# Fig. 7: distance distributions per gene pair, control first.
pfig7 <- haplotypes.dist %>% 
  mutate(haplotype = factor(haplotype, levels = c("Control", "Haplotype"))) %>%
  ggplot(aes(x = haplotype, y = dist.l)) +
  facet_wrap(~genes) +
  #geom_quasirandom(size = .5, color = "grey50") +
  geom_violin(aes(fill = haplotype), adjust = 1, trim = FALSE) +
  geom_boxplot(fill = "white", width = 0.3) +
  #stat_summary(fun.y="median", geom='point', shape = 95, size = 15, color = 'red') +
  stat_pvalue_manual(stat.test, label = "p.adj.signif") +
  # guide = "none" replaces the deprecated logical form guide = F
  scale_fill_manual(guide = "none", values = c("#878787", "#d6604d")) +
  theme_pubclean() +
  xlab("") + ylab("Distance between GO enrichment profiles")

pfig7

# Write the same figure to figures/Fig7.pdf; the directory must already exist.
pdf("figures/Fig7.pdf", height = 4, width = 7)
pfig7
dev.off()
```

## 7. Supplementary Note 1
### Independent validation of HLA presentation biases using MHCflurry and DAVID software tools

```{r}
#data.pred.flurry <- fread("mhcflurry_output/mhcflurry_human_proteome_pred.csv") %>% 
#data.pred.flurry <- data.pred.flurry %>% 
#  mutate(ligand.len = nchar(peptide)) %>%
#  filter(presentation_score > 0.5)
```

HLE(D)Ps

```{r FigSN1a}
#data.pred.full.flurry <- data.pred.flurry %>%
#  mutate(hla = gsub("[\\*\\:]", "", best_allele),
#         uniprot.id = str_split_fixed(sequence_name, "[\\| ]", 3)[,2]) %>%
#  group_by(uniprot.id, hla, ligand.len) %>%
#  summarise(count = n()) 

#fwrite(data.pred.full.flurry, "data/human_proteome_mhcflurry_summary.txt", sep = '\t')

# Precomputed per-protein, per-allele, per-length ligand counts (the code
# that produced this file is kept commented out above).
data.pred.full.flurry <- fread("data/human_proteome_mhcflurry_summary.txt")

# For 9-mers: per-allele background ligand rate P0, per-protein log2
# observed/expected ratio, two-sided binomial test against P0, BH
# adjustment within each allele, and a three-way call `sel`
# (1 / -1 when |odds| >= 1 and adjusted p < 0.05, otherwise 0).
data.pred.sel.s.flurry <- data.pred.full.flurry %>%
  filter(ligand.len == 9) %>%
  merge(meta.prot, by = "uniprot.id") %>%
  group_by(hla) %>%
  # NOTE(review): the denominator here is protein.len - ligand.len, while
  # the per-protein terms below use protein.len + 1 - ligand.len; confirm
  # whether the difference is intended.
  mutate(P0 = sum(count) / sum(protein.len - ligand.len)) %>%
  ungroup %>%
  rowwise() %>%
  # a pseudocount of 1 is added to the observed count
  mutate(odds = log2((count + 1) / (protein.len + 1 - ligand.len) / P0),
         p.value = binom.test(count + 1, protein.len + 1 - ligand.len, 
                              p = P0, alternative = "two.sided")$p.value) %>%
  group_by(hla) %>%
  mutate(p.value.adj = p.adjust(p.value, method = "BH")) %>%
  ungroup %>%
  mutate(sel = ifelse(odds >= 1 & p.value.adj < 0.05, 1, 
                      ifelse(odds <= -1 & p.value.adj < 0.05, -1,
                             0)),
         # NOTE(review): cut() is called without include.lowest, so rows at
         # the minimum protein.len get NA for protein.len.q; confirm.
         protein.len.q = cut(protein.len, quantile(protein.len)))  

# Volcano-style plot per allele in hla_incl, with -log10 p capped at 20.
pfigsn1a <- data.pred.sel.s.flurry %>%
  filter(hla %in% hla_incl) %>%
  mutate(hla = factor(hla, levels = hla_incl_vert)) %>%
  # NOTE(review): y uses the unadjusted p.value although the axis label
  # below reads "Pvalue_adj"; confirm which one is meant.
  ggplot(aes(x = odds, y = pmin(-log10(p.value), 20), size = count,
             color = sel %>% as.factor())) +
  geom_point() +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey") +
  scale_size_continuous("Number of\npeptides", breaks = c(1, 10, 100, 1000)) +
  scale_color_manual("Ligand -", 
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted", "No change", "Enriched")) +
  # points with odds outside [-5, 5] are not drawn
  scale_x_continuous(TeX("$log_{2}\\,\\left[\\frac{ligands_{obs}}{ligands_{exp}}\\right]$"), limits = c(-5, 5)) +
  ylab(TeX("$-log_{10}\\~Pvalue_{adj}$")) +
  facet_wrap(~hla, ncol = 2) + 
  theme_pubclean() + 
  theme(legend.position = "bottom", 
        legend.box = "vertical")
```

Protein length bias of HLE(D)Ps
```{r FigSN2}
# Per allele and call (sel), the fraction p of proteins that falls into
# each protein-length quantile bin.
data.pred.sel.s.len.flurry <- data.pred.sel.s.flurry %>%
  filter(hla %in% hla_incl) %>%
  group_by(hla, sel, protein.len.q) %>%
  summarize(count = n()) %>%
  group_by(hla, sel) %>%
  mutate(total = sum(count)) %>%
  mutate(p = count / total)

# Fig. SN2: dodged bars per length bin with an upper error bar of
# sqrt(p * (1 - p) / total). The full join with the complete
# allele x sel x bin grid adds the combinations that never occur (p = 0).
pfigsn2 <- data.pred.sel.s.len.flurry %>%
  merge(expand.grid(hla = data.pred.sel.s.len.flurry$hla %>% unique, 
                    sel = data.pred.sel.s.len.flurry$sel %>% unique,
                    protein.len.q = data.pred.sel.s.len.flurry$protein.len.q %>% unique), all = TRUE) %>%
  mutate(p = ifelse(is.na(p), 0, p)) %>%
  mutate(hla = factor(hla, levels = hla_incl)) %>%
  ggplot(aes(x = protein.len.q %>% as.integer, 
             y = p,
             fill = sel %>% as.factor,
             color = sel %>% as.factor)) +
  geom_errorbar(aes(ymin = p, ymax = p + sqrt(p * (1-p) / total)),
                position = "dodge") +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual("", 
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted","No change","Enriched")) +
  scale_color_manual("", 
                     values = c("#56b4df", "#000000", "#e69d00"),
                     labels=c("Depleted","No change","Enriched")) +
  xlab("Protein length quartile") + ylab("Fraction of proteins") +
  facet_wrap(~hla, nrow = 2) +
  theme_pubclean() + 
  # full argument name; `aspect` only worked through partial matching
  theme(aspect.ratio = 1, legend.position = "bottom")

# Write the figure to figures/FigSN2.pdf; the directory must already exist.
pdf("figures/FigSN2.pdf", height = 4, width = 7)
pfigsn2
dev.off()
```

GO enrichment analysis
```{r}
# GO annotation per allele and direction: for every (sel, hla) with
# sel != 0, the proteins carrying that call are passed to get_annots()
# together with the sel == 0 proteins of the same allele.
go.enr.1.flurry <- data.pred.sel.s.flurry %>%
  select(hla, sel) %>%
  unique() %>%
  filter(sel != 0) %>%
  group_by(sel, hla) %>%
  group_modify(
    ~ get_annots(
      filter(data.pred.sel.s.flurry, hla == .y$hla, sel == .y$sel)$entrez.id,
      filter(data.pred.sel.s.flurry, hla == .y$hla, sel == 0)$entrez.id
    )
  )

# Union of the 20 best-ranked terms (by P.DE.adj) of every allele/direction.
go.top.1.flurry <- go.enr.1.flurry %>%
  group_by(sel, hla) %>%
  filter(rank(P.DE.adj) <= 20) %>%
  .$Term.ID %>%
  unique()

# Fig. SN1B: allele x term bubble chart of the selected terms; bubble size
# is the fold value, fill the direction.
pfigsn1b <- go.enr.1.flurry %>%
  filter(Term.ID %in% go.top.1.flurry) %>% # an excl_for_now filter is disabled
  mutate(
    Term = trunc_str(Term, 4),
    direction = ifelse(sel > 0, "enriched", "depleted")
  ) %>%
  ggplot(aes(
    x = factor(hla, levels = hla_incl_vert),
    y = fct_reorder2(paste(Ont, Term), fold, paste(sel, hla))
  )) +
  geom_quasirandom(aes(size = fold, fill = direction), shape = 21, width = 0.2) +
  scale_y_discrete("", position = "right") +
  xlab("") +
  scale_fill_manual("HLA ligand -", values = c("#56b4df", "#e69d00")) +
  scale_size_continuous("GO term enrinchment\nfold, log2", breaks = c(1, 2, 4)) +
  theme_pubclean() +
  theme(
    legend.position = "bottom",
    legend.box = "vertical",
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    axis.text.y = element_text(family = "mono", size = 8)
  )
```

```{r FigSN1, fig.width=14, fig.height=14}
# Assemble Figure SN1: panel A on the left, panel B on the right
# (coordinates are fractions of the canvas). Panel B is given a negative
# y offset and a height above 1, so it extends past the canvas edge.
pfigsn1 <- ggdraw() +
  draw_plot(pfigsn1a, x = 0, y = 0, width = .42, height = 1) +
  draw_plot(pfigsn1b, x = .43, y = -.1, width = .57, height = 1.1) +
  draw_plot_label(label = c("A", "B"), size = 11,
                  x = c(0, .42), y = c(1, 1))

pfigsn1

# Write the same figure to figures/FigSN1.pdf; the directory must already exist.
pdf("figures/FigSN1.pdf", height = 14, width = 14)
pfigsn1
dev.off()
```

---

```{r}
# Final chunk: prints a marker once every chunk above has been evaluated.
print("Done")
```