4b_scRNA-TCR_pbmc.Rmd

---
title: "scRNA/TCR analysis - PBMC"
author: "Daniel Shu"
date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`'
output: 
  html_document:
    keep_md: yes
    toc: true
    toc_float:
      collapsed: true
    toc_depth: 3
    number_sections: true
    theme: lumen
editor_options: 
  markdown: 
    wrap: 72
---

```{r setup, include=FALSE}
# Global knitr chunk defaults: echo the code, hide messages and warnings,
# disable caching, tidy the echoed source, and render figures at 600 dpi
# with a 12 x 8 inch canvas.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  cache = FALSE,
  tidy = TRUE,
  dpi = 600,
  fig.width = 12,
  fig.height = 8
)
```

# I. Setup
## A. Load libraries

```{r libraries}
library(Seurat)
library(ggplot2)
library(ggpubr)
library(patchwork)
library(ggprism)
library(tidyverse)
library(tools)
library(scRepertoire)
library(gridExtra)
library(kableExtra)
library(RColorBrewer)
library(pals)
library(ggnewscale)

library("grid")
library("gridExtra")
library("cowplot")

```

## B. Define export settings

```{r settings}
# Export settings shared by the rest of the analysis.
source <- "10x"
analysis <- "sc"
# Repertoire under study: "T" or "B"
repertoire <- "T"
# Type of sample: "pbmc" or "til"
type <- "pbmc"
# Azimuth annotation level used for the analysis
az_level <- "l3"
# Output directory, e.g. "./output/single_cell/T_pbmc_azimuth_l3/"
output.path <- paste0(
  "./output/single_cell/",
  if (repertoire == "T") "T" else "B",
  "_", type, "_azimuth_", az_level,
  "/"
)
# Create output.path if it is not already present. recursive = TRUE is needed
# because the parent directories (./output/single_cell/) may not exist yet;
# without it dir.create() only warns and the later ggsave()/pdf() calls fail.
if (!dir.exists(output.path)) {
  dir.create(output.path, recursive = TRUE)
} else {
  message(output.path, " directory already exists")
}
```

## C. Load scripts

```{r scripts}
# Source the project helper scripts used by later chunks.
helper_scripts <- c(
  "scripts/multibarHeatmap.R", # multibarHeatmap script
  "./scripts/T_cell_goi.R" # defines the genes of interest (GOI)
)
for (helper_script in helper_scripts) {
  source(helper_script)
}
```

# II. Load data

### A. First load TCR/BCR filtered contig files

```{r}
# Directory that holds the clonotype and filtered contig files
vdj.dir <- "./data/single_cell/vdj"

# Find every filtered contig file below vdj.dir (full paths). Each path
# identifies the sample, the sample type (PBMC or TIL), and the repertoire
# (T or B cell).
fc.filenames <- list.files(
  vdj.dir,
  pattern = "filtered_contig_annotations.csv",
  recursive = TRUE,
  full.names = TRUE
)

# Re-order the files so they follow the order in which the barcode suffixes
# were applied in the pre-processing code; otherwise matching TCRs to cell
# barcodes fails because of an incorrect suffix. Within each sample the
# list.files() order is kept, so the _B repertoire comes before its
# corresponding _T repertoire.
sample_order <- c(
  "PBMC_P02_", "PBMC_P03_", "PBMC_P07_", "PBMC_P08_",
  "PBMC_P12_", "PBMC_OT1_", "PBMC_OT6_", "TIL_OT6_"
)
fc.filenames <- unlist(lapply(sample_order, function(sample_tag) {
  fc.filenames[grep(sample_tag, fc.filenames)]
}))

# Read every contig file into a list of data frames keyed by file path
fc.list <- lapply(
  setNames(fc.filenames, fc.filenames),
  read.table,
  header = TRUE,
  stringsAsFactors = FALSE,
  as.is = TRUE,
  fill = TRUE,
  comment.char = "",
  sep = ","
)

# Derive a short name for each contig table from the top-level sample
# directory of its path. The names are computed from the re-ordered
# fc.filenames rather than from a fresh list.files() call: list.files()
# returns paths alphabetically, which does not match the manual sample order
# of fc.list and would attach the wrong sample name to each table.
fc.filenames.short <- fc.filenames %>%
  substring(nchar(vdj.dir) + 2) %>% # drop the leading "<vdj.dir>/"
  str_replace("/.*", "") # keep only the first path component

# Every table must receive exactly one name
stopifnot(length(fc.filenames.short) == length(fc.list))

names(fc.list) <- fc.filenames.short
names(fc.list)

# For the filtered contigs, the barcode suffix of every sample is "-1". To
# facilitate integration with the seurat object, the suffix is replaced by
# "-" followed by the number of the sample ("1" through "8"), which makes the
# barcodes of each sample distinct.

# Suffix per contig table: each sample number is repeated twice because the
# list is expected to hold two tables (B and T repertoire) per sample.
suffix <- rep(c(1:8), each = 2) %>% as.character()

# Guard against more tables than suffixes: suffix[i] would be NA and the
# barcodes would silently end in "-NA".
stopifnot(length(fc.list) <= length(suffix))

# Replace everything from the first "-" in the barcode column (column 1)
# with "-1", "-2", ... "-8" according to the sample number.
for (i in seq_along(fc.list)) {
  fc.list[[i]][, 1] <- gsub(
    "-.*",
    paste0("-", suffix[i]),
    fc.list[[i]][, 1]
  )
}

# Keep only the contig tables of the repertoire under analysis ("T" or "B"),
# selected by the "_T" / "_B" tag in the table names.
repertoire_tag <- if (repertoire == "T") "_T" else "_B"
contig_list <- fc.list[grep(repertoire_tag, names(fc.list))]
names(contig_list)
rm(fc.list)

# Combine the contigs with scRepertoire's combineTCR / combineBCR.
# Based on 'Starting work with scRepertoire v1.5.2.'
# (https://ncborcherding.github.io/vignettes/vignette.html), dated 5/12/2022,
# accessed 5/24/2022.

# Token between the first and last underscore-delimited parts of each name
# (the sample identifier).
# NOTE(review): the original notes list the expected values as "HCC02",
# "HCC08", ... "OT1", "OT6", "OT6" -- confirm against the directory names.
sample_ids <- gsub(".*[_]([^_]+)[_].*", "\\1", names(contig_list))
# Text before the first underscore of each name (e.g. "PBMC" or "TIL")
tissue_ids <- sub("\\_.*", "", names(contig_list))

combined <- if (repertoire == "T") {
  combineTCR(
    contig_list,
    samples = sample_ids,
    ID = tissue_ids,
    cells = "T-AB"
  )
} else {
  combineBCR(
    contig_list,
    samples = sample_ids,
    ID = tissue_ids
  )
}
rm(contig_list)

# Clean up the 'combined' object for subsequent analysis: the leading
# "<ID>_<sample>_" string that precedes each barcode (column 1) is replaced
# with "T-" or "B-", which facilitates later integration with the seurat
# object. seq_along() keeps the loop a no-op for an empty list, and the
# prefix is chosen once instead of duplicating the loop per repertoire.
barcode_prefix <- if (repertoire == "T") "T-" else "B-"
for (i in seq_along(combined)) {
  combined[[i]][, 1] <- gsub(
    ".*[_]([^_]+)[_]_*",
    barcode_prefix,
    combined[[i]][, 1]
  )
}

# saveRDS(combined, file = paste0(output.path,  "combined.rds"))
# combined <- readRDS(paste0(output.path,"combined.rds"))
```

### B. Load the seurat object produced from data pre-processing.

For preprocessing code, see "./scRNA-all-preprocessing.Rmd."

Note that in the `seurat` object, the scaled data in the SCT assay is not present in the refAssay. This GitHub issue (<https://github.com/satijalab/seurat/issues/5959>) notes that the refAssay lacks corrected UMI counts, so I switched the active assay to SCT. See also <https://github.com/satijalab/seurat/issues/2163>.

```{r load_seurat}
# Load the pre-processed seurat object
seurat <- readRDS("./output/single_cell/10x_scRNASeq_all_seurat_az_pbmc.rds")
# Use SCT as the active assay
DefaultAssay(seurat) <- "SCT"

# Take the cell identities from the azimuth prediction column of the
# configured level
Idents(seurat) <- paste0("predicted.celltype.", az_level)

# The identities come out of order, so define the desired level order by hand
if (az_level == "l2") {
  levels.manual <- c(
    "B naive", "B intermediate", "B memory", "Plasmablast",
    "CD4 Naive", "CD4 Proliferating", "CD4 TCM", "CD4 TEM", "CD4 CTL",
    "CD8 Naive", "CD8 Proliferating", "CD8 TCM", "CD8 TEM",
    "NK", "NK Proliferating",
    "Treg", "dnT", "gdT", "MAIT",
    "CD14 Mono", "CD16 Mono", "Platelet"
  )
} else if (az_level == "l3") {
  levels.manual <- c(
    "B naive kappa", "B naive lambda",
    "B intermediate kappa", "B intermediate lambda",
    "B memory kappa", "B memory lambda", "Plasma",
    "CD14 Mono", "CD16 Mono",
    "CD4 Naive", "CD4 Proliferating",
    "CD4 TCM_1", "CD4 TCM_2", "CD4 TCM_3",
    "CD4 TEM_1", "CD4 TEM_2", "CD4 TEM_3", "CD4 CTL",
    "CD8 Naive", "CD8 Naive_2", "CD8 Proliferating",
    "CD8 TCM_1", "CD8 TCM_2", "CD8 TCM_3",
    "CD8 TEM_1", "CD8 TEM_2", "CD8 TEM_3",
    "CD8 TEM_4", "CD8 TEM_5", "CD8 TEM_6",
    "NK Proliferating", "NK_1", "NK_2",
    "Treg Naive", "Treg Memory",
    "dnT_1", "dnT_2",
    "gdT_1", "gdT_2", "gdT_3", "gdT_4", "MAIT",
    "ILC", "Platelet"
  )
}
# Apply the manual order; identities missing from levels.manual become NA
Idents(seurat) <- factor(x = Idents(seurat), levels = levels.manual)
levels(Idents(seurat))

# Count cells per identity (columns Var1 = identity, Freq = cell count)
cells.by.type <- table(Idents(seurat)) %>% as.data.frame()

# Bar plot of cells per cluster. The legend is hidden with
# legend.position = "none"; the previous value "null" is not one of the
# values documented for ggplot2::theme() and is rejected by recent ggplot2.
ggplot(cells.by.type, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  ggprism::theme_prism() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  theme(legend.position = "none") +
  ggtitle("Single cells per cluster") +
  xlab("Cluster") +
  ylab("Cell count")
# ggsave() without a plot argument saves the most recently created plot
ggsave(paste0(output.path, "summary_all_B_and_T_asbarplot.pdf"), height = 10, width = 10)

# Same counts with an appended "Total" row (numeric columns are summed,
# all other columns are labelled "Total")
cells.by.type.total <- cells.by.type %>%
  bind_rows(summarise_all(., ~if(is.numeric(.)) {sum(.)} else "Total"))

# Save the table as a PDF ...
cells.by.type.total %>%
  kbl(caption = "Summary of all cell types (B and T)", align = 'c') %>%
  kable_classic(full_width = FALSE) %>%
  save_kable(., paste0(output.path, "summary_all_B_and_T.pdf"))

# ... and show it in the rendered document
cells.by.type.total

# UMAP of all cells with a two-column legend
DimPlot(seurat) + guides(colour = guide_legend(ncol = 2))
ggsave(paste0(output.path, "dimplot_all_T_and_B.pdf"), height = 10, width = 10)

# Restrict the object to the clusters of the repertoire under analysis and
# prepend "T-" or "B-" to every cell barcode, matching the barcode prefix
# used for the TCR/BCR contigs.
if (repertoire == "T") {
  # Identity names matching any of these tokens are kept as T-cell clusters
  T.clustOfInterest <- "CD4|CD8|Treg|dnT|gdT|ILC|MAIT|NK"
  seurat.idents <- unique(Idents(seurat))
  T.idents <- seurat.idents[grepl(T.clustOfInterest, seurat.idents)]
  seurat <- subset(seurat, idents = T.idents)
  seurat <- RenameCells(
    seurat,
    new.names = paste0("T-", rownames(seurat[[]]))
  )
} else {
  # repertoire == "B"
  B.clustOfInterest <- "B naive|B intermediate|B memory|Plasma"
  seurat.idents <- unique(Idents(seurat))
  B.idents <- seurat.idents[grepl(B.clustOfInterest, seurat.idents)]
  seurat <- subset(seurat, idents = B.idents)
  seurat <- RenameCells(
    seurat,
    new.names = paste0("B-", rownames(seurat[[]]))
  )
}

# Cells per remaining identity, with a "Total" row appended
subset_counts <- as.data.frame(table(Idents(seurat)))
bind_rows(
  subset_counts,
  summarise_all(subset_counts, ~if(is.numeric(.)) {sum(.)} else "Total")
)


## Subset to remove outlier singletons using CellSelector.
## NOTE: CellSelector() is interactive, so this part has to be run in an
## interactive session (it cannot be knitted unattended).

# UMAP before removal. The title now says PBMC (this is the PBMC analysis and
# the matching "after" plot below is labelled PBMC as well; it said TIL).
DimPlot(seurat # ,cols=cluster_colors
        ) + ggtitle(paste0("T cell clusters in PBMC prior to singletons removal (n=", nrow(seurat@meta.data), ")"))
ggsave(filename = paste0(output.path, "dimplot_before_singleton_removal.pdf"), width = 12, height = 8)

plot <- DimPlot(seurat)
likelyB <- CellSelector(plot = plot) # select cells on far right where B cell clusters were
likelyTIL <- CellSelector(plot = plot) # select singleton cells in northeast of UMAP where TIL was

# union() counts a cell only once even if it was picked in both selections;
# otherwise the expected cell count below would be too low.
cells_to_remove <- union(likelyB, likelyTIL)

# Predicted number of cells after removing the selected cells
expected <- nrow(seurat@meta.data) - length(cells_to_remove)
expected
seurat <- subset(seurat, cells = cells_to_remove, invert = TRUE)
nrow(seurat@meta.data) == expected # confirm correct subsetting
rm(plot)

# UMAP after removal
DimPlot(seurat # ,cols=cluster_colors
        ) + ggtitle(paste0("T cell clusters in PBMC after singletons removal (n=", nrow(seurat@meta.data), ")"))
ggsave(filename = paste0(output.path, "dimplot_after_singleton_removal.pdf"), width = 12, height = 8)

# saveRDS(seurat, paste0(output.path, "seurat_after_singleton_removal.rds"))
# seurat <- readRDS(file=paste0(output.path,"seurat_after_singleton_removal.rds"))
                
```

### C. Run FindAllMarkers and make heatmap to refine cluster idents

```{r}
# Positive marker genes per cluster (MAST test)
markers <- FindAllMarkers(seurat, test.use = "MAST", only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25)

# saveRDS(markers, file=paste0(output.path,"findAllMarkers.rds"))
# markers <- readRDS(paste0(output.path,"findAllMarkers.rds"))

# Show the top 3 markers per cluster by average log2 fold change
markers %>%
  group_by(cluster) %>%
  slice_max(n = 3, order_by = avg_log2FC)

# Top 10 / 5 / 3 markers per cluster, used for the heatmaps
top10 <- markers %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = avg_log2FC)
top5 <- markers %>%
  group_by(cluster) %>%
  top_n(n = 5, wt = avg_log2FC)
top3 <- markers %>%
  group_by(cluster) %>%
  top_n(n = 3, wt = avg_log2FC)

# DoHeatmap fails with
#   "No requested features found in the scale.data slot for the SCT assay."
# so the data are rescaled per https://github.com/satijalab/seurat/issues/2960.
# This makes sure that all marker genes and all genes of interest are included
# in the scale.data used for the heatmaps.
seurat <- ScaleData(seurat, features = c(markers$gene, goi.all), verbose = FALSE)

# Both marker heatmaps (top 5 and top 3) go into one PDF. Previously a stray
# dev.off() closed the PDF after the first heatmap, so the top-3 heatmap was
# never written to it and the second dev.off() closed an unrelated device.
pdf(paste0(output.path, "FindAllMarkers_before_cluster_annotation.pdf"), width = 12, height = 8)
Seurat::DoHeatmap(seurat, features = top5$gene, size = 1) +
  theme(legend.text = element_text(size = 4),
        axis.text.y = element_text(size = 5))
Seurat::DoHeatmap(seurat, features = top3$gene, size = 1) +
  theme(legend.text = element_text(size = 4),
        axis.text.y = element_text(size = 5))
dev.off()

# Heatmap of all genes of interest
pdf(paste0(output.path, "GOI_all_before_cluster_annotation.pdf"), width = 12, height = 8)
Seurat::DoHeatmap(seurat, features = goi.all, size = 1) +
  theme(legend.text = element_text(size = 4),
        axis.text.y = element_text(size = 5))
dev.off()

# saveRDS(seurat, paste0(output.path, "seurat_after_singleton_removal_findallmarkers_round1.rds"))
# seurat <- readRDS(file=paste0(output.path,"seurat_after_singleton_removal_findallmarkers_round1.rds"))
```
### D.1 Reassign cluster idents
```{r load_seurat2}
## Collapse the clusters that are not the focus of this analysis
## (e.g. Treg naive + Treg memory -> Treg). Here this has been applied to NKT.

# Write a template with one row per current identity; new_id and level are
# filled in by hand.
cluster_merge <- data.frame(old_id = levels(seurat), new_id = NA, level = NA)
write.csv(cluster_merge, "output/single_cell/cluster_merge_PBMC.csv", row.names = FALSE)
# Edit that file with manual cluster assignments, then load the edited version.
# NOTE(review): the edited copy is read from cluster_merge_DS.csv, not from the
# template written above -- presumably it is saved under that name by hand.
cluster_merge_DS <- read.csv("output/single_cell/cluster_merge_DS.csv")
# Re-order by the level column, which is used below to define the level order
cluster_merge_DS <- cluster_merge_DS[order(cluster_merge_DS$level), ]

# Set colors for clusters: request two extra colors so that the two dark ones
# ("iron", "ebony") can be dropped.
new_cluster_names <- unique(cluster_merge_DS$new_id)
cluster_colors <- pals::alphabet(n = length(new_cluster_names) + 2)
# Logical indexing instead of x[-grep(...)]: with a negative index, an empty
# grep() result would drop *every* color rather than none.
cluster_colors <- cluster_colors[!grepl("iron|ebony", names(cluster_colors))]
# Keep exactly one color per cluster (fewer than two colors are dropped when
# the palette is too short to contain both dark colors)
cluster_colors <- head(cluster_colors, length(new_cluster_names))
names(cluster_colors) <- new_cluster_names
cluster_colors %>% pal.bands()

#######################
# Assign new identities: a lookup vector whose values are the new cluster
# names and whose names are the old identities.
new.cluster.ids <- setNames(cluster_merge_DS$new_id, cluster_merge_DS$old_id)

new.cluster.ids
# Only when every identity present in the object has a new name is the lookup
# restricted to the identities that actually occur in the dataset.
every_ident_mapped <- all(unique(Idents(seurat)) %in% names(new.cluster.ids))
if (every_ident_mapped) {
  present_in_data <- names(new.cluster.ids) %in% Idents(seurat)
  new.cluster.ids <- new.cluster.ids[present_in_data]
}

seurat <- RenameIdents(seurat, new.cluster.ids)

# Store the identities in an active.ident metadata column (used below for
# stacked barplots)
seurat$active.ident <- Idents(seurat)
Idents(seurat) %>% table # check idents

# Set the level order from the manually curated table
levels.manual <- unique(cluster_merge_DS$new_id)
Idents(seurat) <- factor(x = Idents(seurat), levels = levels.manual)

# UMAP after annotation, without and with cluster labels
pdf(paste0(output.path, "DimPlot_after_annotation.pdf"))
DimPlot(seurat, cols = cluster_colors)
dev.off()

pdf(paste0(output.path, "DimPlot_after_annotation_labeled.pdf"))
DimPlot(seurat, cols = cluster_colors, label = TRUE, repel = TRUE)
dev.off()

# Remove clusters with fewer than 50 cells (in this case CD4 Proliferating
# and CD8 Proliferating), and drop them from the colors and the level order
seurat <- subset(
  x = seurat,
  idents = c("CD4 Proliferating", "CD8 Proliferating"),
  invert = TRUE
)
cluster_colors <- cluster_colors[!names(cluster_colors) %in% c("CD8 Proliferating", "CD4 Proliferating")]
levels.manual <- levels.manual[levels.manual %in% Idents(seurat)]

Idents(seurat) <- factor(x = Idents(seurat), levels = levels.manual)

# Same two UMAPs after the small clusters were removed
pdf(paste0(output.path, "DimPlot_after_annotation_lessthan50removed.pdf"))
DimPlot(seurat, cols = cluster_colors)
dev.off()

pdf(paste0(output.path, "DimPlot_after_annotation_labeled_lessthan50removed.pdf"))
DimPlot(seurat, cols = cluster_colors, label = TRUE, repel = TRUE)
dev.off()
#
# saveRDS(seurat, paste0(output.path, "seurat_after_singleton_removal_after_annotation_final.rds"))
# seurat <- readRDS(file=paste0(output.path, "seurat_after_singleton_removal_after_annotation_final.rds"))
```

### D.2 Manual curation
#### A. Check gdT
There are scattered cells in the MAIT cluster and a few other clusters with increased TRDV2 and TRGV9 expression, suggesting that these should be re-assigned to the gdT cluster. Here we make plots to identify those cells, then reassign them to the gdT cluster. We define gdT as any cell with both TRDV2 and TRGV9 > 0.

```{r}
# Shared heatmap styling: small gene labels on the y axis
small_gene_labels <- theme(
  # legend.text = element_text(size = 5),
  axis.text.y = element_text(size = 5)
)

# Heatmaps before the MAIT / gdT reassignment: top-3 markers per cluster,
# followed by the two gdT-defining genes
pdf(paste0(output.path, "Heatmap_before_MAITgdT_reassignment.pdf"), width = 12, height = 8)
Seurat::DoHeatmap(seurat, features = top3$gene, size = 2) +
  small_gene_labels & NoLegend()

Seurat::DoHeatmap(seurat, features = c("TRDV2", "TRGV9"), size = 2) +
  small_gene_labels & NoLegend()
dev.off()

# QC plots of TRDV2 / TRGV9 expression across clusters
gdt_genes <- c("TRDV2", "TRGV9")
pdf(paste0(output.path, "gdT_qc_plots.pdf"), width = 14, height = 8)
RidgePlot(seurat, features = gdt_genes)
VlnPlot(seurat, features = gdt_genes)
FeaturePlot(seurat, features = gdt_genes)
FeaturePlot(seurat, features = gdt_genes, blend = TRUE)
FeatureScatter(
  seurat,
  feature1 = "TRDV2",
  feature2 = "TRGV9",
  pt.size = 2,
  cols = cluster_colors,
  jitter = TRUE
)
dev.off()
```

#### B. Reassign non-gdT -> gdt
What we found is 3 populations:

- 1 population with TRDV2 and TRGV9 both > 0. We will pull those out and reassign them to gdT.
- Within the pre-existing gdT population, 1 population with TRDV2 and TRGV9 both at 0. We will pull those out and reassign them to another cluster.
- Within the pre-existing gdT population, other cells that have TRDV2 or TRGV9 > 0, but not both. We will leave those in the gdT population, assuming that the clustering assigned them correctly.

```{r}
# Cells expressing both TRDV2 and TRGV9, regardless of their current cluster
gdt_in <- subset(x = seurat, subset = TRDV2 > 0 & TRGV9 > 0)
gdt_in@active.ident %>% table
seurat@active.ident %>% table

select.cells <- colnames(gdt_in)
select.cells

# Park the double-positive cells in a temporary identity
Idents(seurat, cells = select.cells) <- "NewCells"

# Number of cells left in the gdT identity
length(seurat@active.ident[seurat@active.ident == "gdT"])
FeatureScatter(
  seurat,
  feature1 = "TRDV2",
  feature2 = "TRGV9",
  pt.size = 2,
  cols = cluster_colors,
  jitter = TRUE
)

# Double-check these cells by running FindMarkers against CD8 TEM_GZMB
newcells.markers <- FindMarkers(
  seurat,
  ident.1 = "NewCells",
  ident.2 = "CD8 TEM_GZMB",
  min.diff.pct = 0.3,
  only.pos = TRUE
)
arrange(newcells.markers, avg_log2FC)

# The FindMarkers output is consistent with gdT (author's observation: high
# expression of KLRC1, TRGC1, TRDC, TRGV9, TRDV2), so this population is
# assigned to gdT.
Idents(seurat, cells = select.cells) <- "gdT"

Idents(seurat) %>% table
```

#### C. Reassign gdT -> non-gdT

```{r}
###################################
#### Part 2: extract from the gdT cluster those cells that express neither
#### TRDV2 nor TRGV9
gdt_out <- subset(x = seurat, idents = "gdT",
                  subset = TRDV2 == 0 & TRGV9 == 0)
gdt_out@active.ident %>% table

select.cells <- colnames(gdt_out)
select.cells

# Park the double-negative cells in a temporary identity
Idents(seurat, cells = select.cells) <- "OutCells"

# Check that expression is 0 for both genes
FeatureScatter(gdt_out, feature1 = "TRDV2", feature2 = "TRGV9", pt.size = 2, cols = cluster_colors)

# Check whether these cells are CD8 or CD4
FeatureScatter(seurat, cells = select.cells, feature1 = "CD4", feature2 = "CD8A")
# The Treg marker was misspelled "IK2F2"; the gene symbol is IKZF2, so the
# misspelled feature could never be found in the object.
VlnPlot(seurat, idents = "OutCells",
        features = c("CCR7", "LEF1", "TCF7",
                     "LTB", "NR4A2",
                     "CD4", "CD8A", "CD8B",
                     "CXCL13", "ICOS", "PDCD1", "CTLA4",
                     "GZMK", "GZMB", "TOX", "MAF",
                     "RTKN2", "FOXP3", "IKZF2", # Treg
                     "TRDC"),
        ncol = 4)

VlnPlot(seurat, idents = "OutCells",
        features = c("TRDC", "TRGC1", "TRGC2",
                     "KLRC1", "NKG7", "TRDV2", "CD7",
                     "TRGV9", "KLRD1", "KLRG1")) # gdT markers
# Based on the above markers, these appear to be CD8 Naive or CD8 TCM, or
# gamma delta (based on high TRDC)

# Double-check these cells by running FindMarkers against each candidate
# identity. newcells.markers keeps the result of the last comparison (gdT).
for (reference_ident in c("CD8 Naive", "CD8 TCM", "gdT")) {
  newcells.markers <- FindMarkers(seurat, ident.1 = "OutCells", ident.2 = reference_ident,
                                  min.diff.pct = 0.3, only.pos = TRUE)
  print(newcells.markers %>% arrange(., avg_log2FC))
}

# Heatmap restricted to OutCells and the candidate identities
outcell_check <- subset(x = seurat,
                        idents = c(
                          "CD8 Naive", "CD8 TCM",
                          "gdT", "OutCells"))

pdf(paste0(output.path, "gdT_qc_plots_outCell_check.pdf"), width = 14, height = 8)
Seurat::DoHeatmap(outcell_check, features = top3$gene, size = 2) +
  theme(
    # legend.text = element_text(size = 5),
    axis.text.y = element_text(size = 5)) & NoLegend()
dev.off()

# Based on this heatmap, there appear to be two populations within OutCells:
# one with high TRDV1 or TRDC expression, which is likely a gdT population,
# and a second with high CD8A/CD8B expression, which is likely a naive-like
# population.
for (gd_feature in c("TRDV1", "TRDC")) {
  print(FeatureScatter(seurat, cells = select.cells, feature1 = gd_feature, feature2 = "CD8A"))
}

# Pull out the OutCells that express any gamma/delta chain gene and put them
# back with the gdTs
gdT_putBack <- subset(
  x = seurat,
  idents = "OutCells",
  subset = TRDV1 > 0 |
    TRDC > 0 |
    TRGC1 > 0 |
    TRGC2 > 0
)
select.cells_putback <- colnames(gdT_putBack)
select.cells_putback
Idents(seurat, cells = select.cells_putback) <- "gdT"

#### Re-plot the previous QC plots for the remaining OutCells
# Check whether these cells are CD8 or CD4.
# The Treg marker was misspelled "IK2F2"; the gene symbol is IKZF2, so the
# misspelled feature could never be found in the object.
VlnPlot(seurat, idents = "OutCells",
        features = c("CCR7", "LEF1", "TCF7",
                     "LTB", "NR4A2",
                     "CD4", "CD8A", "CD8B",
                     "CXCL13", "ICOS", "PDCD1", "CTLA4",
                     "GZMK", "GZMB", "TOX", "MAF",
                     "RTKN2", "FOXP3", "IKZF2"), # Treg
        ncol = 4)
VlnPlot(seurat, idents = "OutCells",
        features = c("TRDC", "TRGC1", "TRGC2",
                     "KLRC1", "NKG7", "TRDV2", "CD7",
                     "TRGV9", "KLRD1", "KLRG1")) # gdT markers

# Based on the above markers, these appear to be CD8 Naive or CD8 TCM, or
# gamma delta (based on high TRDC)

# Double-check these cells by running FindMarkers against each candidate
# identity. newcells.markers keeps the result of the last comparison (gdT).
for (reference_ident in c("CD8 Naive", "CD8 TCM", "gdT")) {
  newcells.markers <- FindMarkers(seurat, ident.1 = "OutCells", ident.2 = reference_ident,
                                  min.diff.pct = 0.3, only.pos = TRUE)
  print(newcells.markers %>% arrange(., avg_log2FC))
}

# Heatmap restricted to OutCells and the candidate identities
outcell_check <- subset(x = seurat,
                        idents = c(
                          "CD8 Naive", "CD8 TCM",
                          "gdT", "OutCells"))

pdf(paste0(output.path, "gdT_qc_plots_outCell_check_2.pdf"), width = 14, height = 8)
Seurat::DoHeatmap(outcell_check, features = top3$gene, size = 2) +
  theme(
    # legend.text = element_text(size = 5),
    axis.text.y = element_text(size = 5)) & NoLegend()
dev.off()

# Finally, assign the cells remaining in the OutCells group to CD8 Naive
outcell_check@active.ident %>% table
gdT_OutCells <- subset(x = seurat,
                       idents = c("OutCells"))
select.cells.OutCells <- colnames(gdT_OutCells)
select.cells.OutCells
Idents(seurat, cells = select.cells.OutCells) <- "CD8 Naive"

Idents(seurat) %>% table
```

#### D. Re-set levels and set active.ident column

```{r}
# The manual reassignments changed the level order, so re-set the levels
levels(seurat)
Idents(seurat) <- factor(x = Idents(seurat), levels = levels.manual)

# Refresh the active.ident metadata column with the curated identities
seurat$active.ident <- Idents(seurat)

## Make new DimPlots and save them
pdf(paste0(output.path, "DimPlot_after_annotation_gdT_reassigned.pdf"))
DimPlot(seurat, cols = cluster_colors)
dev.off()

pdf(paste0(output.path, "DimPlot_after_annotation_gdT_reassigned.pdf_noLegend.pdf"))
DimPlot(seurat, cols = cluster_colors) & NoLegend()
dev.off()

pdf(paste0(output.path, "DimPlot_after_annotation_gdT_reassigned.pdf_noLegend_noAxes.pdf"))
DimPlot(seurat, cols = cluster_colors) & NoLegend() & NoAxes()
dev.off()

pdf(paste0(output.path, "DimPlot_after_annotation_gdT_reassigned.pdf_yesLegend_noAxes.pdf"))
DimPlot(seurat, cols = cluster_colors) & NoAxes() # &NoLegend()
dev.off()

# The labeled plot gets its own "_labeled" file name (same convention as the
# earlier DimPlot_after_annotation_labeled.pdf). It previously reused
# "DimPlot_after_annotation_gdT_reassigned.pdf" and so overwrote the
# unlabeled plot written at the top of this chunk.
pdf(paste0(output.path, "DimPlot_after_annotation_gdT_reassigned_labeled.pdf"))
DimPlot(seurat, cols = cluster_colors, label = TRUE, repel = TRUE)
dev.off()

# saveRDS(seurat, paste0(output.path, "seurat_after_annotation_gdT_reassigned.rds"))
# seurat <- readRDS(file = paste0(output.path, "seurat_after_annotation_gdT_reassigned.rds"))
```

### E.1. Re-run findallmarkers and make new figures
```{r}
# Positive marker genes per curated cluster (MAST test)
markers <- FindAllMarkers(seurat, test.use = "MAST", only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25)
#
# saveRDS(markers, file=paste0(output.path,"findAllMarkers_post_clusterassignment.rds"))
# markers <- readRDS(paste0(output.path,"findAllMarkers_post_clusterassignment.rds"))

# Export the full marker table, one sheet per cluster (written before the
# mitochondrial genes are removed below)
markers.list <- markers %>% split(., .$cluster)
writexl::write_xlsx(markers.list, path = paste0(output.path, "markers_list.xlsx"))

# Remove mitochondrial genes (gene names containing "MT-").
# A logical mask is used instead of markers[-grep(...), ]: with a negative
# index, an empty grep() result would drop *every* row rather than none.
is_mito_gene <- grepl("MT-", markers$gene)
nrow(markers)
which(is_mito_gene)
markers$gene[is_mito_gene]
markers <- markers[!is_mito_gene, ]
grep("MT-", markers$gene) # should now be empty
nrow(markers)


# Show the top 3 markers per cluster by average log2 fold change
markers %>%
  group_by(cluster) %>%
  slice_max(n = 3, order_by = avg_log2FC)

# Top 10 / 5 / 3 markers per cluster, used for the heatmaps
markers_by_cluster <- markers %>% group_by(cluster)
top10 <- markers_by_cluster %>% top_n(n = 10, wt = avg_log2FC)
top5 <- markers_by_cluster %>% top_n(n = 5, wt = avg_log2FC)
top3 <- markers_by_cluster %>% top_n(n = 3, wt = avg_log2FC)

# Rescale so that all marker genes and genes of interest are in scale.data
seurat <- ScaleData(seurat, features = c(markers$gene, goi.all), verbose = FALSE)

# Heatmap styling: small legend text with size-6 or size-5 gene labels
labels_size6 <- theme(legend.text = element_text(size = 4),
                      axis.text.y = element_text(size = 6))
labels_size5 <- theme(legend.text = element_text(size = 4),
                      axis.text.y = element_text(size = 5))

# New heatmaps: top 5 and top 10 markers in one file
pdf(paste0(output.path, "FindAllMarkers_after_singleton_removal_after_cluster_annotation.pdf"), width = 12, height = 8)
Seurat::DoHeatmap(seurat, features = top5$gene, size = 2) + labels_size6
Seurat::DoHeatmap(seurat, features = top10$gene, size = 2) + labels_size5
dev.off()

# Top 3 markers
pdf(paste0(output.path, "FindAllMarkers_after_singleton_removal_after_cluster_annotation_top3.pdf"), width = 12, height = 8)
Seurat::DoHeatmap(seurat, features = top3$gene, size = 2) + labels_size6
dev.off()

# Another version of the top-3 heatmap, downsampled via subset(downsample = 150)
seurat_downsampled <- subset(x = seurat, downsample = 150)
pdf(paste0(output.path, "FindAllMarkers_after_singleton_removal_after_cluster_annotation_top3_downsampled.pdf"), width = 12, height = 8)
Seurat::DoHeatmap(seurat_downsampled, features = top3$gene, size = 2) + labels_size6
dev.off()
rm(seurat_downsampled)

# All genes of interest
pdf(paste0(output.path, "GOI_all__after_singleton_removal_after_cluster_annotation.pdf"), width = 12, height = 8)
Seurat::DoHeatmap(seurat, features = goi.all, size = 2) + labels_size5
dev.off()

```

### E.2. make figures and tables for final object

```{r load_seurat_plots}
#############################################################################
##### Make figures and tables for the final object

# DimPlot of the final clustering of all samples
DimPlot(seurat, cols = cluster_colors) +
  ggtitle(paste0(ifelse(repertoire == "T", "T", "B"),
                 " cell clusters in sc", str_to_upper(type),
                 " (n = ", nrow(seurat@meta.data), ")")) +
  ggprism::theme_prism() +
  theme(legend.position = "right", legend.text = element_text(size = 10))
ggsave(filename = paste0(output.path, "dimplot_all_including_cells_without_TCR.pdf"), width = 12, height = 8)

# Barplot and table of cells per cluster
cells.by.type.t.only <- table(Idents(seurat)) %>% as.data.frame()
# The legend is hidden with legend.position = "none"; the previous value
# "null" is not one of the values documented for ggplot2::theme() and is
# rejected by recent ggplot2.
ggplot(cells.by.type.t.only, aes(x = reorder(Var1, Freq),
                                 y = Freq, fill = Var1)) +
  geom_col() +
  scale_fill_manual(values = cluster_colors) +
  geom_text(aes(label = Freq), hjust = -0.2) +
  ggprism::theme_prism() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  theme(legend.position = "none") +
  ggtitle("Single cells per cluster") +
  xlab("") +
  ylab("Cell count") +
  coord_flip(clip = "off") + # clip = "off" lets the count labels overflow the panel
  theme(plot.margin = unit(c(1, 2, 1, 1), "lines"))

ggsave(paste0(output.path, "summary_all_T_only_asbarplot.pdf"), width = 5, height = 7)

# Table of cells per cluster with an appended "Total" row
cells.by.type.t.only %>%
  bind_rows(summarise_all(., ~if(is.numeric(.)) {sum(.)} else "Total")) %>%
  kbl(caption = "Summary of all cell types (after subsetting for T cells, low counts not removed)", align = 'c') %>%
  kable_classic(full_width = FALSE) %>%
  save_kable(., paste0(output.path, "summary_all_T_only.pdf"))

# DimPlot by patient
DimPlot(seurat, cols = cluster_colors, split.by = 'Patient', ncol = 3) +
  ggprism::theme_prism()
ggsave(filename = paste0(output.path, "dimplot_all_including_cells_without_TCR_byPatient.pdf"), width = 12, height = 8)

# Cells per patient and cluster, shown as a stacked barplot
df <- seurat@meta.data %>%
  select(Patient, active.ident) %>%
  group_by(Patient, active.ident) %>%
  summarise(activeIdent_n = n()) %>%
  ungroup() # %>% bind_rows(summarise_all(., ~if(is.numeric(.)) {sum(.)} else "Total"))
ggplot(df, aes(fill = Patient, x = reorder(active.ident, activeIdent_n), y = activeIdent_n)) +
  geom_bar(position = "stack", stat = "identity") +
  ggprism::theme_prism() +
  coord_flip() +
  scale_fill_manual(values = pals::brewer.rdylbu(7)) + # palette of 7 colors for the Patient fill
  ggtitle("Single cells per cluster, by patient") +
  xlab("") +
  ylab("Cell count")
ggsave(paste0(output.path, "dimplot_all_including_cells_without_TCR_byPatient_asbarplot.pdf"), width = 5, height = 7)
```

### F.1 Add scTCR/BCR data to seurat

```{r add_tcrbcr}
# Combine the TCR or BCR data with the seurat object.
# The barcodes in the combined object were changed earlier, per sample, to
# match the barcodes in the seurat object.
clone_size_bins <- c(Single = 1, Small = 5, Medium = 20, Large = 100, Hyperexpanded = 500)
seurat <- combineExpression(
  combined,
  seurat,
  cloneCall = "gene",
  group.by = "sample",
  proportion = FALSE,
  cloneTypes = clone_size_bins
)

# Order the clone-size categories from largest to smallest
clone_type_levels <- c(
  "Hyperexpanded (100 < X <= 500)",
  "Large (20 < X <= 100)",
  "Medium (5 < X <= 20)",
  "Small (1 < X <= 5)",
  "Single (0 < X <= 1)",
  NA
)
seurat@meta.data$cloneType <- factor(seurat@meta.data$cloneType, levels = clone_type_levels)

# Flag cells lacking a barcode (i.e. without TCR/BCR) and create two tables
# that give a sense of how many cells are affected before they are removed.
seurat@meta.data$hasTCR <- if_else(is.na(seurat@meta.data$barcode), "No", "Yes")

# Table 1: cells with and without TCR overall, plus a "Total" row
tcr_counts <- seurat@meta.data %>% count(hasTCR, sort = TRUE)
bind_rows(
  tcr_counts,
  summarise_all(tcr_counts, ~if(is.numeric(.)) {sum(.)} else "Total")
) %>%
  kbl(caption = "Summary of cells with and without TCRs", align = 'c') %>%
  kable_classic(full_width = FALSE) %>%
  save_kable(., paste0(output.path, "summary_cells_with_without_TCR.pdf"))

# Table 2: the same counts per cluster ("Yes" rows first within each
# cluster), plus a "Total" row
tcr_counts_by_cluster <- seurat@meta.data %>%
  count(active.ident, hasTCR) %>%
  group_by(active.ident) %>%
  arrange(desc(hasTCR), .by_group = TRUE) %>%
  ungroup()
bind_rows(
  tcr_counts_by_cluster,
  summarise_all(tcr_counts_by_cluster, ~if(is.numeric(.)) {sum(.)} else "Total")
) %>%
  kbl(caption = "Summary of cells with and without TCRs, by cluster", align = 'c') %>%
  kable_classic(full_width = FALSE) %>%
  save_kable(., paste0(output.path, "summary_cells_with_without_TCR_byCluster.pdf"))

#
# #Save seurat object
# seurat <- readRDS(paste0(output.path,"seurat_after_TCR.rds"))
# saveRDS(seurat, file=paste0(output.path,"seurat_after_TCR.rds"))
```

### G. Add Adaptive TLS TCR/IGH data
```{r map_clonotypes}
# Load the Adaptive TCRB object for the TLS samples. It is used below as a list
# with `$data` (one clonotype table per sample) and `$meta` (sample annotation).
adaptiveTLS <- readRDS("output/adaptive/TCRB/HCC_immdata_TLS_TCRB_final_for_seurat.rds")

# Freeze the patient order as it appears in the metadata; it is reused below
# to order the per-patient list.
adaptiveTLS$meta$Patient.ID <- factor(adaptiveTLS$meta$Patient.ID,
                                      levels = unique(adaptiveTLS$meta$Patient.ID))
names(adaptiveTLS$data)
# Strip the "J17136" tag from the sample names
names(adaptiveTLS$data) <- str_replace(names(adaptiveTLS$data), "J17136", "")
adaptiveTLS$meta$Sample <- str_replace(adaptiveTLS$meta$Sample, "J17136", "")

# Make the pseudoTIL object: pool all samples of a patient and collapse them to
# one row per (Patient, CDR3.aa) with recomputed clone counts.
adaptive <- adaptiveTLS$data %>%
  bind_rows(.id = "Patient") %>%
  mutate(Patient = str_replace(Patient, "-.*", "")) %>% # drop everything from the first "-" on
  uncount(Clones) %>% # one row per sequenced clone
  select(Patient, CDR3.aa) %>% # ,V.name,D.name,J.name,Sequence) %>%
  group_by(Patient, CDR3.aa) %>%
  mutate(Clones = n(), .after = "Patient") %>%
  distinct(CDR3.aa, .keep_all = TRUE) %>%
  ungroup() %>%
  group_by(Patient) %>%
  # Per-patient proportion. The previous `sum(.$Clones)` referred to the whole
  # piped table (all patients), not to the current group.
  mutate(Proportion = Clones / sum(Clones), .after = Clones) %>%
  arrange(desc(Clones)) %>%
  mutate(Rank = row_number(), .before = "Patient") %>% # rank within patient, largest clone first
  ungroup() %>%
  split(., .$Patient)

adaptive %>% names
adaptive <- adaptive[levels(adaptiveTLS$meta$Patient.ID)] # reorder adaptive object
adaptive %>% names

# Housekeeping: for every patient, count the rows whose CDR3.aa occurs more
# than once in the Adaptive data (all copies of a duplicated sequence count).
x <- unname(vapply(adaptive, function(tbl) {
  is_dup <- duplicated(tbl$CDR3.aa) | duplicated(tbl$CDR3.aa, fromLast = TRUE)
  nrow(tbl[is_dup, ])
}, integer(1)))

df <- tibble(
  sample = names(adaptive),
  total.clonotypes = sapply(adaptive, nrow),
  duplicated.cdr3.aa = x # , duplicated.cdr3.nt
)
df

#########
# Create adaptive.bindrows, which is used to determine which TCRs are present in TLS.
adaptive.bindrows <- adaptive %>%
  bind_rows(.id = "Sample") %>% # collapses the per-patient list into one data frame
  dplyr::select(-"Patient") # drops "Patient", leaving "Sample" (the list names) for use below

# Add a column to the seurat object holding the TCRB (or IGH) part of CTaa, i.e.
# everything after the last "_" (this allows cross-referencing of single cells
# against the Adaptive datasets)
seurat$TCRB_or_IGH <- gsub(".*_", "", seurat@meta.data$CTaa)

# Second column for cells that carry two chains separated by ";"
seurat$TCRB_or_IGH2 <- NA
# find entries that should be split by ";"
tosplit <- grep(";", seurat$TCRB_or_IGH)
# Split once, before either column is overwritten, so that the second part is
# not lost. vapply() keeps the result a character vector even when `tosplit`
# is empty, and x[2] yields NA (instead of an error) if nothing follows the ";".
chain_parts <- strsplit(seurat@meta.data[tosplit, "TCRB_or_IGH"], ";")
seurat@meta.data[tosplit, "TCRB_or_IGH2"] <- vapply(chain_parts, function(x) x[2], character(1))
seurat@meta.data[tosplit, "TCRB_or_IGH"] <- vapply(chain_parts, function(x) x[1], character(1))

# these 3 lines double-check that this worked
seurat@meta.data[tosplit, "TCRB_or_IGH"]
seurat@meta.data[tosplit, "TCRB_or_IGH2"]
grep(";", seurat$TCRB_or_IGH) %>% length

##########

# Build "<patient>_<CDR3>" keys on both sides so that a TCRB is only matched
# against the TLS data of the same patient.
# This and the line below do create keys such as HCC02_NA, but that is okay
# because there are no NAs in the CDR3.aa of the adaptive data.
seurat$patient_tcrb <- paste(seurat@meta.data$Patient, seurat@meta.data$TCRB_or_IGH, sep = "_")
seurat$patient_tcrb2 <- paste(seurat@meta.data$Patient, seurat@meta.data$TCRB_or_IGH2, sep = "_")
adaptive.bindrows$Sample_CDR3.aa <- paste(adaptive.bindrows$Sample, adaptive.bindrows$CDR3.aa, sep = "_")

# 1 if the chain is found in the TLS sequencing data of that particular patient, else 0
seurat$TLS.present1 <- ifelse(seurat@meta.data$patient_tcrb %in% adaptive.bindrows$Sample_CDR3.aa, 1, 0)
seurat$TLS.present2 <- ifelse(seurat@meta.data$patient_tcrb2 %in% adaptive.bindrows$Sample_CDR3.aa, 1, 0)

# Consolidate the two columns into one, which is 1 if either of the two TCRBs
# was found in the patient's TLS
seurat$TLS.present <- ifelse(seurat$TLS.present1 + seurat$TLS.present2 >= 1, 1, 0)

seurat@meta.data[tosplit, c("TLS.present", "TLS.present1", "TLS.present2")]

#########
# adaptive.excludeSingletons: the TLS clonotypes with more than one clone
# (Clones > 1), i.e. adaptive.bindrows without singletons.
not_singleton <- adaptive.bindrows$Clones > 1
adaptive.excludeSingletons <- adaptive.bindrows[not_singleton, ]

# 1/0 flags per chain: is the patient-specific TCRB key among the expanded
# TLS clonotypes?
expanded_keys <- adaptive.excludeSingletons$Sample_CDR3.aa
seurat$TLS.expanded1 <- as.numeric(seurat@meta.data$patient_tcrb %in% expanded_keys)
seurat$TLS.expanded2 <- as.numeric(seurat@meta.data$patient_tcrb2 %in% expanded_keys)
# Either chain expanded in TLS -> 1 (both inputs are 0/1, so the maximum is
# the logical OR)
seurat$TLS.expanded <- pmax(seurat$TLS.expanded1, seurat$TLS.expanded2)

# Metadata rows of cells whose TCR is present in TLS (clones >= 1) ...
seurat.TLS <- dplyr::filter(seurat@meta.data, TLS.present == 1)

# ... and of cells whose TCR is expanded in TLS (clones > 1)
seurat.TLS.excludeSingletons <- dplyr::filter(seurat@meta.data, TLS.expanded == 1)

# The same two tables, split into lists by orig.ident
seurat.TLS.bypatient <- split(seurat.TLS, f = seurat.TLS$orig.ident)
seurat.TLS.excludeSingletons.bypatient <- split(
  seurat.TLS.excludeSingletons,
  f = seurat.TLS.excludeSingletons$orig.ident
)

# adaptive.bindrows is no longer needed
rm(not_singleton, expanded_keys, adaptive.bindrows)

# saveRDS(seurat, file=paste0(output.path,"seurat_post_addition_of_TCR.rds"))
# seurat <- readRDS(paste0(output.path,"seurat_post_addition_of_TCR.rds"))
```

### H. Summarize cross-referenced single cell and TLS data

```{r basic statistics-1}
# Per-cluster cell counts: all cells, cells with a TCR, cells whose clonotype
# Frequency is > 1, and cells whose TCR was found in TLS.
cluster_counts <- seurat@meta.data %>%
  group_by(active.ident) %>%
  summarise(
    Total_cells = n(),
    Total_cells_with_TCR = sum(hasTCR == "Yes"),
    Total_cells_with_TCR_expanded = sum(Frequency > 1, na.rm = TRUE),
    # Total_unique_TCRB=(n_distinct(c(TCRB_or_IGH,TCRB_or_IGH2))),
    Cells_with_TCR_in_TLS = sum(TLS.present)
  )

# Closing row: numeric columns summed, factor columns labelled "Total"
cluster_totals <- summarise(
  cluster_counts,
  across(where(is.numeric), sum),
  across(where(is.factor), ~"Total")
)

# Format a count as "n (pct)", the percentage being relative to `denominator`
# and rounded to one decimal.
count_with_pct <- function(count, denominator) {
  paste0(count, " (", round(100 * count / denominator, 1), ")")
}

summaryTable_by_cluster <- bind_rows(cluster_counts, cluster_totals) %>%
  mutate(
    Total_cells_with_TCR_expanded = count_with_pct(Total_cells_with_TCR_expanded, Total_cells_with_TCR),
    Cells_with_TCR_in_TLS = count_with_pct(Cells_with_TCR_in_TLS, Total_cells_with_TCR)
  ) %>%
  rename(
    "Cluster" = active.ident,
    "Cells, n" = Total_cells,
    "Cells with TCR, n" = Total_cells_with_TCR,
    "Cells with expanded TCRs, n (%)" = Total_cells_with_TCR_expanded,
    "Cells with TCR\u03B2 in TLS, n (%)" = Cells_with_TCR_in_TLS # ,
    # 'Unique TCR\u03B2 in TLS, n (%)'= Unique_TCRs_in_TLS
  )

# Show the table, then export it as xlsx and as a PDF
summaryTable_by_cluster
writexl::write_xlsx(
  summaryTable_by_cluster,
  paste0(output.path, "summaryTable_by_cluster_final.xlsx")
)
summaryTable_by_cluster %>%
  kbl(caption = "Summary of cross-referenced single cell data by cluster", align = 'c') %>%
  kable_classic(full_width = FALSE) %>%
  save_kable(paste0(output.path, "summaryTable_by_cluster_final.pdf"))

rm(cluster_counts, cluster_totals, summaryTable_by_cluster)

#####################
# Per-patient tallies of how many TLS (Adaptive) TCRB clonotypes were also
# found in the single-cell data. All result vectors are parallel to
# names.Patient.
names.Patient <- unique(seurat@meta.data$Patient)
n.Patient <- length(names.Patient)

# Preallocate instead of growing each vector with c() inside the loop
all.TCR <- numeric(n.Patient)
all.TCR.recovered <- numeric(n.Patient)
unique.TCR <- numeric(n.Patient)
unique.TCR.recovered <- numeric(n.Patient)
unique.TCR.singleton <- numeric(n.Patient)
unique.TCR.singleton.recovered <- numeric(n.Patient)
unique.TCR.expanded <- numeric(n.Patient)
unique.TCR.expanded.recovered <- numeric(n.Patient)

top10.pct <- numeric(n.Patient)
top10.pct.recovered <- numeric(n.Patient)
top1.pct <- numeric(n.Patient)
top1.pct.recovered <- numeric(n.Patient)
top01.pct <- numeric(n.Patient)
top01.pct.recovered <- numeric(n.Patient)

for (i in seq_along(names.Patient)) {
  patient <- as.character(names.Patient[i])

  # Look the TLS repertoire up by patient NAME. Indexing `adaptive` by position
  # silently pairs the wrong repertoire with the cells whenever the order of
  # `adaptive` differs from the order of names.Patient. The names of `adaptive`
  # are the same patient labels that the patient_tcrb matching keys rely on.
  tls <- adaptive[[patient]]
  if (is.null(tls)) {
    stop("No Adaptive TLS repertoire found for patient '", patient, "'", call. = FALSE)
  }

  # Cells of this patient with a TCRB present in TLS
  cells <- seurat.TLS[seurat.TLS$Patient == patient, ]

  # TLS.present is 1 when EITHER chain matched, so both chains are considered
  # here; counting is then done on the TLS side, so a chain that is not itself
  # in TLS can never be counted as recovered.
  uniqueSharedTCR <- unique(c(cells$TCRB_or_IGH, cells$TCRB_or_IGH2))
  uniqueSharedTCR <- uniqueSharedTCR[!is.na(uniqueSharedTCR)]
  is_recovered <- tls$CDR3.aa %in% uniqueSharedTCR
  is_singleton <- tls$Clones == 1
  is_expanded <- tls$Clones > 1

  all.TCR[i] <- sum(tls$Clones)
  all.TCR.recovered[i] <- nrow(cells) # number of cells, not of clonotypes
  unique.TCR[i] <- nrow(tls)
  unique.TCR.recovered[i] <- sum(is_recovered)

  unique.TCR.singleton[i] <- sum(is_singleton)
  unique.TCR.singleton.recovered[i] <- sum(is_singleton & is_recovered)

  unique.TCR.expanded[i] <- sum(is_expanded)
  unique.TCR.expanded.recovered[i] <- sum(is_expanded & is_recovered)

  # Top 10% / 1% / 0.1% of clonotypes, taken from the first rows of `tls`
  # (built in descending order of Clones). head() returns nothing when the
  # cutoff rounds to 0, whereas 1:0 would have selected row 1.
  top10.pct[i] <- round(nrow(tls) * .1)
  top10.pct.recovered[i] <- sum(head(is_recovered, top10.pct[i]))
  top1.pct[i] <- round(nrow(tls) * .01)
  top1.pct.recovered[i] <- sum(head(is_recovered, top1.pct[i]))
  top01.pct[i] <- round(nrow(tls) * .001)
  top01.pct.recovered[i] <- sum(head(is_recovered, top01.pct[i]))
}

# Format "matched/total (pct)", with the percentage given to `digits`
# significant digits.
fmt_matched <- function(matched, total, digits = 2) {
  paste0(matched, "/", total, " (", signif(matched / total * 100, digits = digits), ")")
}

# Raw per-patient counts plus a closing "Total" row (numeric columns summed)
tls_match_counts <- data.frame("Patient" = names.Patient,
                               unique.TCR, unique.TCR.recovered,
                               top10.pct, top10.pct.recovered,
                               top1.pct, top1.pct.recovered,
                               top01.pct, top01.pct.recovered)
tls_match_counts <- bind_rows(
  tls_match_counts,
  summarise(tls_match_counts, across(everything(), ~ if (is.numeric(.x)) sum(.x) else "Total"))
)

# Keep only the patient label and the four formatted "n (%)" columns; the raw
# count columns are redundant once formatted.
df3 <- tls_match_counts %>%
  mutate(
    "Total TCR\u03B2 matched, n (%)" = fmt_matched(unique.TCR.recovered, unique.TCR),
    "Top 10% of TCR\u03B2 matched, n (%)" = fmt_matched(top10.pct.recovered, top10.pct),
    "Top 1% of TCR\u03B2 matched, n (%)" = fmt_matched(top1.pct.recovered, top1.pct, digits = 3),
    "Top 0.1% of TCR\u03B2 matched, n (%)" = fmt_matched(top01.pct.recovered, top01.pct)
  ) %>%
  select(Patient, ends_with("matched, n (%)"))
  # rename('Unique TCR\u03B2 in TLS, n' = unique.TCR#,
  # 'Top 10% of TCR\u03B2, n' = top10.pct, 'Top 1% of TCR\u03B2, n' = top1.pct, 'Top 0.1% of TCR\u03B2, n' = top01.pct
  # )

# Show the table, then export it as a PDF and as xlsx
df3
df3 %>%
  kbl(caption = "Summary of cross-referenced TLS data (v. 3)", align = 'c') %>%
  kable_classic(full_width = FALSE) %>%
  # add_header_above(c(" " = 1, "Unique" = 2, "Top 10%" = 2, "Top 1%" = 2, "Top 0.1%" = 2)) %>%
  save_kable(paste0(output.path, "summaryTable_TLS_TCRB_matching_to_singleCell_final.pdf"))

writexl::write_xlsx(
  df3,
  paste0(output.path, "summaryTable_TLS_TCRB_matching_to_singleCell_final.xlsx")
)
```
### I. Set levels
```{r}
# Turn Patient and orig.ident into factors whose levels follow the order in
# which the values first appear in the metadata.
for (meta_col in c("Patient", "orig.ident")) {
  seurat@meta.data[[meta_col]] <- factor(
    seurat@meta.data[[meta_col]],
    levels = unique(seurat@meta.data[[meta_col]])
  )
}

# Checkpoint the final object. The commented lines below reload it (and the
# identity levels) when resuming from a previous run.
saveRDS(seurat, file = paste0(output.path, "seurat_pbmc_final.rds"))
# seurat <- readRDS(paste0(output.path,"seurat_pbmc_final.rds"))
# levels.manual = Idents(seurat) %>% levels

# Checkpoint the cluster colours as well
saveRDS(cluster_colors, file = paste0(output.path, "cluster_colors_final.rds"))

# cluster_colors <- readRDS(paste0(output.path,"cluster_colors_final.rds"))

```

# III. Analysis 
### A. RNA-seq analysis
#### 1. FindAllMarkers Heatmaps
```{r plot_FindAllMarkers}
# Named colour vectors for the annotation bars of the heatmaps below. The names
# are the values of the corresponding metadata column.
hasTCR_colors <- c("Yes" = "darkblue", "No" = "lightblue")

# One colour per cloneType level, taken from the dark end of the PuRd palette
# (the three lightest shades are skipped) and reversed so that the first level
# gets the darkest colour.
n_clone_types <- length(levels(seurat@meta.data$cloneType))
cloneType_colors <- rev(pals::brewer.purd(3 + n_clone_types)[3 + seq_len(n_clone_types)])
names(cloneType_colors) <- levels(seurat@meta.data$cloneType)

TLS.present_colors <- c("0" = "gray", "1" = "gray26")

cols.use <- list(
  active.ident = cluster_colors,
  hasTCR = hasTCR_colors,
  cloneType = cloneType_colors,
  TLS.present = TLS.present_colors
)

###########################
# Build stand-alone legends (one per annotation bar); the heatmaps themselves
# are drawn without legends.
title.list <- list("Cluster", "TCR", "Clonal expansion", "TCR in TLS")

# Keep the legend keys in the order in which the colours were declared. With a
# plain character column ggplot sorts the keys alphabetically (e.g. "Single"
# ahead of "Small" for the clone sizes).
df.list <- lapply(cols.use, function(pal) {
  data.frame(LegendData = factor(names(pal), levels = names(pal)))
})

legend.list <- lapply(seq_along(df.list), function(i) {
  # Throw-away bar chart whose only purpose is to generate the legend
  gplot <- ggplot(df.list[[i]],
                  aes(LegendData, LegendData, fill = LegendData)) +
    geom_bar(stat = "identity") + # ,color="white",lwd=0.5)+
    scale_fill_manual(values = cols.use[[i]]) +
    theme(legend.title = element_text(face = "bold")) +
    guides(fill = guide_legend(title = title.list[[i]]))
  legend <- get_legend(gplot)
  return(legend)
})

grid.newpage()
# grid.draw(legend.list[[4]])
# All four legends side by side
pdf(paste0(output.path, "DoHeatMap_Labels_gridarrange.pdf"))
grid.arrange(grobs = legend.list, nrow = 1)
dev.off()

# Legends 2-4 (everything except the cluster legend), stacked vertically
pdf(paste0(output.path, "DoHeatMap_Labels_gridarrange_V2.pdf"))
grid.arrange(grobs = legend.list[2:4], ncol = 1)
dev.off()

##########################
# Draw one FindAllMarkers heatmap with annotation bars and write it to a PDF.
#
# object         Seurat object to plot
# features       genes shown as heatmap rows
# file           output file name (appended to out_dir)
# sorted         if TRUE, cells within each cluster are additionally sorted by
#                hasTCR, TLS.present and cloneType; if FALSE no extra sorting
#                argument is passed
# gene_text_size font size of the gene labels on the left axis
# cols, out_dir  colour list and output directory (default: the globals
#                cols.use and output.path)
#
# The PDF device is closed through on.exit(), so a failing plot no longer
# leaves the device open.
save_marker_heatmap <- function(object, features, file, sorted, gene_text_size,
                                cols = cols.use, out_dir = output.path) {
  pdf(paste0(out_dir, file), width = 12, height = 8)
  on.exit(dev.off(), add = TRUE)

  annotation_bars <- c("hasTCR", "cloneType", "TLS.present")
  heatmap <- if (sorted) {
    DoMultiBarHeatmap(object,
                      features = features,
                      group.by = "active.ident",
                      additional.group.by = annotation_bars,
                      additional.group.sort.by = c("hasTCR", "TLS.present", "cloneType"),
                      cols.use = cols,
                      label = TRUE,
                      size = 2,
                      angle = 45,
                      lines.width = 3,
                      group.bar.height = 0.02,
                      combine = TRUE)
  } else {
    DoMultiBarHeatmap(object,
                      features = features,
                      group.by = "active.ident",
                      additional.group.by = annotation_bars,
                      # additional.group.sort.by=c("TLS.expanded"),
                      cols.use = cols,
                      label = TRUE,
                      size = 2,
                      angle = 45,
                      lines.width = 3,
                      group.bar.height = 0.02,
                      combine = TRUE)
  }

  # Explicit print(): inside a function the plot is not auto-printed
  print(
    heatmap +
      theme(axis.text.y.left = element_text(size = gene_text_size, face = "italic"),
            plot.margin = margin(t = 20, # Top margin
                                 r = 85, # Right margin
                                 b = 15, # Bottom margin
                                 l = 20)) + # Left margin
      # legend.position="bottom")
      NoLegend()
  )
  invisible(file)
}

# Top 10 markers per cluster, sorted and not sorted
save_marker_heatmap(seurat, top10$gene, "findAllMarkers_top10.pdf",
                    sorted = TRUE, gene_text_size = 6)
save_marker_heatmap(seurat, top10$gene, "findAllMarkers_top10_notSorted.pdf",
                    sorted = FALSE, gene_text_size = 6)

# Top 5 markers per cluster
save_marker_heatmap(seurat, top5$gene, "findAllMarkers_top5.pdf",
                    sorted = TRUE, gene_text_size = 6.5)
save_marker_heatmap(seurat, top5$gene, "findAllMarkers_top5_notSorted.pdf",
                    sorted = FALSE, gene_text_size = 6.5)

# Top 3 markers per cluster
save_marker_heatmap(seurat, top3$gene, "findAllMarkers_top3.pdf",
                    sorted = TRUE, gene_text_size = 6.5)
save_marker_heatmap(seurat, top3$gene, "findAllMarkers_top3_notSorted.pdf",
                    sorted = FALSE, gene_text_size = 6.5)
```

#### 2. FindAllMarkers Heatmaps (downsampled)
```{r}
# Downsampled copy of the object (downsample = 75) so that the heatmaps stay legible
seurat_small <- subset(seurat, downsample = 75)

# Draw one FindAllMarkers heatmap with annotation bars and write it to a PDF.
#
# object         Seurat object to plot
# features       genes shown as heatmap rows
# file           output file name (appended to out_dir)
# sorted         if TRUE, cells within each cluster are additionally sorted by
#                hasTCR, TLS.present and cloneType; if FALSE no extra sorting
#                argument is passed
# gene_text_size font size of the gene labels on the left axis
# cols, out_dir  colour list and output directory (default: the globals
#                cols.use and output.path)
#
# The PDF device is closed through on.exit(), so a failing plot no longer
# leaves the device open.
save_marker_heatmap <- function(object, features, file, sorted, gene_text_size,
                                cols = cols.use, out_dir = output.path) {
  pdf(paste0(out_dir, file), width = 12, height = 8)
  on.exit(dev.off(), add = TRUE)

  annotation_bars <- c("hasTCR", "cloneType", "TLS.present")
  heatmap <- if (sorted) {
    DoMultiBarHeatmap(object,
                      features = features,
                      group.by = "active.ident",
                      additional.group.by = annotation_bars,
                      additional.group.sort.by = c("hasTCR", "TLS.present", "cloneType"),
                      cols.use = cols,
                      label = TRUE,
                      size = 2,
                      angle = 45,
                      lines.width = 3,
                      group.bar.height = 0.02,
                      combine = TRUE)
  } else {
    DoMultiBarHeatmap(object,
                      features = features,
                      group.by = "active.ident",
                      additional.group.by = annotation_bars,
                      # additional.group.sort.by=c("TLS.expanded"),
                      cols.use = cols,
                      label = TRUE,
                      size = 2,
                      angle = 45,
                      lines.width = 3,
                      group.bar.height = 0.02,
                      combine = TRUE)
  }

  # Explicit print(): inside a function the plot is not auto-printed
  print(
    heatmap +
      theme(axis.text.y.left = element_text(size = gene_text_size, face = "italic"),
            plot.margin = margin(t = 20, # Top margin
                                 r = 85, # Right margin
                                 b = 15, # Bottom margin
                                 l = 20)) + # Left margin
      # legend.position="bottom")
      NoLegend()
  )
  invisible(file)
}

# Top 10 markers per cluster, sorted and not sorted
save_marker_heatmap(seurat_small, top10$gene, "findAllMarkers_top10_small.pdf",
                    sorted = TRUE, gene_text_size = 6)
save_marker_heatmap(seurat_small, top10$gene, "findAllMarkers_top10_notSorted_small.pdf",
                    sorted = FALSE, gene_text_size = 6)

# Top 5 markers per cluster
save_marker_heatmap(seurat_small, top5$gene, "findAllMarkers_top5_small.pdf",
                    sorted = TRUE, gene_text_size = 6.5)
save_marker_heatmap(seurat_small, top5$gene, "findAllMarkers_top5_notSorted_small.pdf",
                    sorted = FALSE, gene_text_size = 6.5)

# Top 3 markers per cluster
save_marker_heatmap(seurat_small, top3$gene, "findAllMarkers_top3_small.pdf",
                    sorted = TRUE, gene_text_size = 6.5)
save_marker_heatmap(seurat_small, top3$gene, "findAllMarkers_top3_notSorted_small.pdf",
                    sorted = FALSE, gene_text_size = 6.5)
```

#### 3. Make Volcano plots

```{r}
# Differential expression of every cluster against all other cells (MAST test
# on the SCT assay); no only.pos / min.pct / logfc.threshold arguments are
# passed.
volc_markers <- FindAllMarkers(seurat, test.use = "MAST", # only.pos = TRUE,
                               # min.pct = 0.25,
                               # logfc.threshold = 0.25
                               assay = "SCT"
                               # assay="RNA"
                               # assay=obj@assay$SCT@scale.data
                               )
#
saveRDS(volc_markers, file = paste0(output.path, "findAllMarkers_volc.rds"))
# volc_markers <- readRDS(paste0(output.path,"findAllMarkers_volc.rds"))

# One sheet per cluster (written before the mitochondrial genes are removed)
volc_markers.list <- volc_markers %>% split(., .$cluster)
writexl::write_xlsx(volc_markers.list, path = paste0(output.path, "volc_markers_list.xlsx"))


# Subset to remove mitochondrial genes (symbols starting with "MT-").
# A logical mask is used on purpose: x[-grep(...), ] drops EVERY row when grep
# finds no match, because -integer(0) selects nothing.
nrow(volc_markers)
is_mito <- grepl("^MT-", volc_markers$gene)
which(is_mito)
volc_markers$gene[is_mito]
volc_markers <- volc_markers[!is_mito, ]
grep("^MT-", volc_markers$gene) # should now be empty
nrow(volc_markers)

# Replace adjusted p-values of exactly 0 (and the matching raw p-values) by the
# machine minimum so that -log10() stays finite for plotting. Rows with a
# missing p_val_adj are left untouched instead of raising an error.
floor_zero_pvalues <- function(res) {
  is_zero <- !is.na(res$p_val_adj) & res$p_val_adj == 0
  res[is_zero, c("p_val", "p_val_adj")] <- .Machine$double.xmin
  res
}

# Volcano plot of one cluster's marker table.
#
# res        marker table of a single cluster (gene, avg_log2FC, p_val_adj)
# title      plot title
# select_lab genes to label; NULL (the default) passes an empty selection
# x_pad      padding added to the left / right of the observed log2FC range
make_volcano <- function(res, title, select_lab = NULL, x_pad = c(0.75, 0.75)) {
  EnhancedVolcano::EnhancedVolcano(res,
    lab = res$gene, # use this rather than rownames(res)
    selectLab = select_lab,
    x = 'avg_log2FC',
    y = 'p_val_adj',
    xlim = c(min(res[['avg_log2FC']], na.rm = TRUE) - x_pad[1],
             max(res[['avg_log2FC']], na.rm = TRUE) + x_pad[2]),
    ylim = c(0, max(-log10(res[['p_val_adj']]), na.rm = TRUE) + 1),
    title = title,
    subtitle = "",
    titleLabSize = 16,
    caption = paste0("total = ", nrow(res), " genes"),
    pCutoff = 0.05,
    FCcutoff = 0.5,
    xlab = bquote(~Log[2]~ 'fold change'),
    pointSize = 2,
    labSize = 4,
    labCol = 'black',
    # legendPosition = 'right',
    # col=c('grey10', 'royalblue4', 'purple4', 'red3'),
    colAlpha = 0.8,
    legendLabSize = 12,
    legendIconSize = 3,
    drawConnectors = TRUE,
    widthConnectors = 0.8, typeConnectors = "closed", arrowheads = FALSE,
    gridlines.major = FALSE, gridlines.minor = FALSE
  )
}

# Write a plot to a PDF; the device is closed even if printing fails.
save_volcano_pdf <- function(plot, file) {
  pdf(file)
  on.exit(dev.off(), add = TRUE)
  print(plot)
  invisible(file)
}

# One volcano plot per cluster: shown in the document and saved as PDF
for (i in unique(volc_markers$cluster)) {
  res <- floor_zero_pvalues(volc_markers[volc_markers$cluster == i, ])
  p <- make_volcano(res, title = paste0(i, " versus non-", i))
  print(p)
  save_volcano_pdf(p, paste0(output.path, "volcano_plot_", i, ".pdf"))
}

####
# CD4 Tph cluster again, this time labelling a curated set of genes
res <- floor_zero_pvalues(volc_markers[volc_markers$cluster == "CD4 Tph", ])

if (nrow(res) > 0) {
  p <- make_volcano(res,
    title = paste0("CD4 Tph versus non-CD4 Tph"),
    select_lab = c("CTLA4", "TOX", "TOX2", "MAF", "ITGB1", "TIGIT", "SLAMF6", "CXCR3", "ICOS", "IL21"),
    x_pad = c(0.5, 1)
  )
  print(p)
  save_volcano_pdf(p, paste0(output.path, "volcano_plot_Tph_curatedGenes.pdf"))
} else {
  # Without this guard min()/max() would be taken over an empty table
  warning("No markers found for cluster 'CD4 Tph'; curated volcano plot skipped", call. = FALSE)
}
```
#### 4. Plot GOI
```{r plot_goi}
# Shared pieces of the two GOI heatmaps: the annotation bars shown above the
# heatmap and the axis-text / margin theme.
goi_annotation_bars <- c("hasTCR", "cloneType", "TLS.present")
goi_heatmap_theme <- theme(
  axis.text.y.left = element_text(size = 6, face = "italic"),
  plot.margin = margin(
    t = 20, # Top margin
    r = 85, # Right margin
    b = 15, # Bottom margin
    l = 20  # Left margin
  )
  # legend.position="bottom"
)

# Both heatmaps go into the same PDF, one per page
pdf(paste0(output.path, "goi_heatmap.pdf"), width = 12, height = 8)

# Page 1: project TCRs and add annotation columns; cells within each cluster
# additionally sorted by TLS.present
DoMultiBarHeatmap(
  seurat,
  features = goi.allbutTLS,
  group.by = "active.ident",
  additional.group.by = goi_annotation_bars,
  additional.group.sort.by = c("TLS.present"),
  cols.use = cols.use,
  label = TRUE,
  size = 2,
  angle = 45,
  lines.width = 5,
  group.bar.height = 0.02,
  combine = TRUE
) +
  goi_heatmap_theme +
  NoLegend()

# Page 2: same heatmap without the additional sorting argument
DoMultiBarHeatmap(
  seurat,
  features = goi.allbutTLS,
  group.by = "active.ident",
  additional.group.by = goi_annotation_bars,
  # additional.group.sort.by=c("TLS.expanded"),
  cols.use = cols.use,
  label = TRUE,
  size = 2,
  angle = 45,
  lines.width = 5,
  group.bar.height = 0.02,
  combine = TRUE
) +
  goi_heatmap_theme +
  NoLegend()

dev.off()
```
#### 5. Feature plots, ridgePlots, vlnPlots, DotPlots
```{r other_seurat_plots}
#make list of FeaturePlots for each GOI
FeaturePlot.goi <- lapply(1:length(goi.all.list), function(i) {
  FeaturePlot(seurat, features = goi.all.list[[i]], ncol=3, combine=T)  +
    plot_annotation(title=names(goi.all.list)[i]) &
    theme(plot.title=element_text(size=12, face="bold")) & 
    NoLegend()
})

pdf(paste0(output.path, 
           "featureplots.pdf"), width = 12, height = 8)    
for (i in 1:length(FeaturePlot.goi)) {
  plot(FeaturePlot.goi[[i]]) 
  } 
dev.off()

#make list of ridgePlots for each GOI
ridgePlot.goi <- lapply(1:length(goi.all), function(i) {
  RidgePlot(seurat, features = goi.all[[i]])  
})
  
pdf(paste0(output.path, 
           "ridgeplots.pdf"), width = 12, height = 8)    
for (i in 1:length(ridgePlot.goi)) {
  plot(ridgePlot.goi[[i]]) 
  } 
dev.off()

# Build one violin-plot page per gene set in goi.all.list.
# seq_along() avoids iterating over c(1, 0) when the list is empty.
vlnPlot.goi <- lapply(seq_along(goi.all.list), function(i) {
  VlnPlot(seurat, features = goi.all.list[[i]])
})

# Write all violin-plot pages into a single multi-page PDF
pdf(paste0(output.path, "vlnPlots.pdf"), width = 12, height = 8)
for (i in seq_along(vlnPlot.goi)) {
  plot(vlnPlot.goi[[i]])
}
dev.off()

# Build one DotPlot (with rotated x-axis labels) per gene set in goi.all.list.
# seq_along() avoids iterating over c(1, 0) when the list is empty.
DotPlot.goi <- lapply(seq_along(goi.all.list), function(i) {
  DotPlot(seurat, features = goi.all.list[[i]]) +
    RotatedAxis()
})

# Write all dot plots into a single multi-page PDF
pdf(paste0(output.path, "dotplots.pdf"), width = 12, height = 8)
for (i in seq_along(DotPlot.goi)) {
  plot(DotPlot.goi[[i]])
}
dev.off()

########################
# Marker panel for the publication violin plot and dot plot.
# Commented-out genes are kept for reference but are not plotted.
goi.for.pub <- c(
  "CD3E", "CD4", "CD8A",
  "TCF7", # "IL7R",
  "CCR7", "SELL",
  "GZMK", "GZMB", "PDCD1", "CTLA4", "LAG3", "TIGIT", "HAVCR2",
  # "ENTPD1","CXCL13",
  "TOX", "ZNF683", "NKG7",
  "CXCR5", "CXCR3", "ICOS"
  # "FOXP3"
)

# Stacked, flipped violin plot of the panel, coloured per identity
p1 <- VlnPlot(
  seurat,
  cols = cluster_colors,
  features = goi.for.pub,
  stack = TRUE,
  flip = TRUE,
  fill.by = "ident",
  pt.size = 1
) & NoLegend()
# p1
pdf(paste0(output.path, "vlnPlots_goi_for_pub.pdf"), height = 10, width = 8)
p1
dev.off()

# Dot plot of the same panel with genes on the y axis (coordinates flipped and
# gene order reversed), cluster labels tilted 45 degrees
p2 <- DotPlot(seurat, features = goi.for.pub) +
  scale_x_discrete(limits = rev) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
# p2
pdf(paste0(output.path, "DotPlot_goi_for_pub.pdf"), height = 10, width = 8)
p2
dev.off()

# Marker panel for the publication feature plots.
# Commented-out genes are kept for reference but are not plotted.
goi.for.fp <- c(
  "CD3E", "CD4", "CD8A",
  "CCR7", "SELL", "GZMK", # "GZMB",
  "PDCD1", # "CTLA4", #"HAVCR2", #"ENTPD1",
  "CXCL13", "TOX", "ZNF683" # ,
  # "NKG7"
  # "FOXP3"
)

# Feature plots of goi.for.fp on a shared colour scale with collected guides
# and no axes; only the number of panel columns differs between the layouts.
build_pub_featureplot <- function(n_columns) {
  FeaturePlot(seurat, features = goi.for.fp, keep.scale = "all", ncol = n_columns) +
    plot_layout(guides = "collect") +
    theme(
      panel.spacing = unit(0, "lines")
    ) &
    # NoLegend()&
    NoAxes()
}

# Vertical layout (2 columns)
p3 <- build_pub_featureplot(2)
p3

# Horizontal layout (5 columns)
p4 <- build_pub_featureplot(5)
p4

pdf(paste0(output.path, "FeaturePlot_goi_for_pub.pdf"), height = 12, width = 8)
p3
dev.off()

pdf(paste0(output.path, "FeaturePlot_goi_for_pub_horizontal.pdf"), height = 8, width = 20)
p4
dev.off()

#     
# p4 <- RidgePlot(seurat, features = goi.for.pub[1:4],fill.by = "feature")
# p4
```
### B. Immune repertoire analysis

#### 1. Visualize Clonal expansion on UMAP
```{r clonalExp}
# Two-page PDF of the UMAP coloured by cloneType: all cells together, then
# split by orig.ident (4 panels per row).
pdf(paste0(output.path, "clonalExp.pdf"), width = 12, height = 8)

# Page 1: all cells
DimPlot(seurat, group.by = "cloneType", pt.size = 0.25) +
  scale_color_manual(values = cloneType_colors) +
  # scale_color_brewer(palette="PuRd", direction=-1) +
  # scale_color_manual(values = rev(colorRampPalette(brewer.pal(9, "Purples"))(6)[2:6]))+
  ggprism::theme_prism() +
  theme(
    legend.position = "right",
    legend.text = element_text(size = 10)
  ) +
  ggtitle("")

# Page 2: one panel per orig.ident
DimPlot(seurat, group.by = "cloneType", split.by = "orig.ident", ncol = 4) +
  scale_color_manual(values = cloneType_colors) +
  # scale_color_brewer(palette="PuRd", direction=-1) +
  # scale_color_manual(values = rev(colorRampPalette(brewer.pal(9, "Purples"))(6)[2:6]))+
  ggprism::theme_prism() +
  theme(
    legend.position = "right",
    legend.text = element_text(size = 10)
  ) +
  ggtitle("")

dev.off()
```
#### 2. Visualize TLS TCRs on UMAP
```{r map_adaptive_to_sc}
# map_adaptive(): highlight a set of clonotypes on the single-cell embedding.
#   x: vector of clonotype sequences, passed as `sequence` to
#      highlightClonotypes() with cloneCall = "aa".
# Returns a DimPlot grouped by the "highlight" metadata column (presumably
# added by highlightClonotypes() -- confirm), with the prism theme and no
# legend. The global `seurat` object is read but not modified; the highlighted
# copy is local to the function.
map_adaptive <- function(x) {
  highlighted <- highlightClonotypes(
    seurat,
    cloneCall = "aa",
    sequence = x
  )
  highlight_plot <- DimPlot(highlighted, group.by = "highlight")
  highlight_plot +
    ggprism::theme_prism() +
    NoLegend()
}
####################
####################
# Four-page PDF projecting TLS clonotypes onto the single-cell embedding.
pdf(paste0(output.path, "map_adaptive.pdf"), width = 12, height = 8)

# Page 1: all clones present in TLS (clone ct >= 1).
# The title pieces are spaced so it reads "All TCRB (count >= 1) in TLS ..."
# rather than "All TCRB(count >= 1)  in TLS ...".
map_adaptive(seurat.TLS$CTaa) +
  labs(title = paste0(
    "All ", ifelse(repertoire == "T", "TCRB", "IGH"),
    " (count >= 1) in TLS mapped onto sc ",
    "data (n = ", length(seurat.TLS$CTaa), ")"
  ))

# Page 2: clones expanded in TLS (clone ct > 1)
map_adaptive(seurat.TLS.excludeSingletons$CTaa) +
  labs(title = paste0(
    "Expanded ", ifelse(repertoire == "T", "TCRB", "IGH"),
    " (count > 1) in TLS mapped onto SC ",
    "data (n = ", length(seurat.TLS.excludeSingletons$CTaa), ")"
  ))

###########################
# Page 3: cells flagged TLS.present == 1, highlighted per orig.ident
DimPlot(
  seurat,
  cells.highlight = names(which(seurat$TLS.present == 1)),
  split.by = "orig.ident",
  ncol = 4,
  sizes.highlight = 0.05
) +
  ggprism::theme_prism() +
  ggtitle("All TLS TCRs (count >= 1) projected onto sc data") +
  NoLegend()

# Page 4: cells flagged TLS.expanded == 1, highlighted per orig.ident
DimPlot(
  seurat,
  cells.highlight = names(which(seurat$TLS.expanded == 1)),
  split.by = "orig.ident",
  ncol = 4,
  sizes.highlight = 0.05
) +
  ggprism::theme_prism() +
  ggtitle("Expanded TLS TCRs (count > 1) projected onto sc data") +
  NoLegend()
dev.off()
```

#### 3. Visualize mapped clonotypes with stacked barplot

```{r barplot_clonotypes, height = 7, width = 14}
#run stackedBP script, which creates 2 functions (stackedBP and prepforBarplot) which will be used for making stacked barplots of individual clonotypes
source("./scripts/stackedBP.R")

pdf(paste0(output.path, "stackedBP.pdf"), width = 12, height = 10)

########################
########################
# The 2 functions above are used to create 4 pairs of objects, named such that
# s = seurat repertoire, T = TLS repertoires.
#
# First pair: built from all single cells and all TLS dissections
# (inputs: seurat.TLS and seurat.TLS.bypatient)
# 1A. s.plus.T.forBarplot
# 1B. s.plus.T.forBarplot.bypatient
#
# Second pair: singletons removed from the single-cell repertoires
# (derived from 1A and 1B)
# 1C. s.plus.T.forBarplot.noSingletons
# 1D. s.plus.T.forBarplot.noSingletons.bypatient
#
#########################
# Third pair: singletons removed from TLS, i.e. only expanded TLS clones
# (inputs: seurat.TLS.excludeSingletons and
# seurat.TLS.excludeSingletons.bypatient)
# 2A. s.plus.T.excludeSingletons.forBarplot
# 2B. s.plus.T.excludeSingletons.forBarplot.bypatient
#
# Fourth pair: singletons removed from both the single-cell and the TLS
# repertoires (derived from 2A and 2B)
# 2C. s.plus.T.excludeSingletons.forBarplot.noSingletons
# 2D. s.plus.T.excludeSingletons.forBarplot.noSingletons.bypatient
########################
########################

#Create objects 1A, 1B, 1C, 1D
# create 1A using prepforBarplot function
s.plus.T.forBarplot <- prepforBarplot(seurat.TLS)

# plot 1A using stackedBP
p <- stackedBP(s.plus.T.forBarplot) +
  ggtitle(
    paste0(
      ifelse(repertoire == "T", "T cells", "B cells"),
      " with a matching ",
      ifelse(repertoire == "T", "TCRB", "IGH"),
      " in TLS ", " all samples",
      " (n=",
      sum(s.plus.T.forBarplot$count), ")"
    )
  )
# Assigning the plot to `p` draws nothing; print it explicitly so that 1A
# becomes a page of stackedBP.pdf like the other plots in this chunk.
print(p)

# create 1B: one prepforBarplot() table per patient
s.plus.T.forBarplot.bypatient <- lapply(seurat.TLS.bypatient, prepforBarplot)

# plot 1B: one stacked barplot per patient, titled with the patient name and
# the summed `count` column. seq_along() avoids iterating over c(1, 0) when
# there are no patients.
spt_list <- lapply(seq_along(s.plus.T.forBarplot.bypatient), function(i) {
  stackedBP(s.plus.T.forBarplot.bypatient[[i]]) +
    ggtitle(
      paste0(
        names(s.plus.T.forBarplot.bypatient[i]),
        " (n=",
        sum(s.plus.T.forBarplot.bypatient[[i]]$count), ")"
      )
    )
})

# Assemble the per-patient plots into one page with legends hidden
patchwork::wrap_plots(spt_list, guides = "collect") +
  plot_annotation(
    title = paste0(
      ifelse(repertoire == "T", "T cells", "B cells"),
      " with a matching ",
      ifelse(repertoire == "T", "TCRB", "IGH"),
      " in TLS "
    )
  ) & theme(legend.position = "none")

# 1C: keep only the rows of 1A whose countSum exceeds 1, then draw the stacked
# barplot titled with the summed `count` of the remaining rows
keep_expanded_sc <- s.plus.T.forBarplot$countSum > 1
s.plus.T.forBarplot.noSingletons <- s.plus.T.forBarplot[keep_expanded_sc, ]

title_1C <- paste0(
  ifelse(repertoire == "T", "T cells", "B cells"),
  " matching expanded ",
  ifelse(repertoire == "T", "TCRB", "IGH"),
  " in TLS of all samples",
  ", sc clone ct > 1", " (n=",
  sum(s.plus.T.forBarplot.noSingletons$count), ")"
)
stackedBP(s.plus.T.forBarplot.noSingletons) +
  ggtitle(title_1C)

#Create 1D by subsetting 1B for clones with count > 1, then plot with stackedBP function
# lapply() over the named list keeps the patient names, so no index loop or
# manual re-naming is needed.
s.plus.T.forBarplot.noSingletons.bypatient <- lapply(
  s.plus.T.forBarplot.bypatient,
  function(clones) clones[clones$countSum > 1, ]
)

# One stacked barplot per patient, titled with the patient name and the summed
# `count` column
spt_noSingletons_list <- lapply(seq_along(s.plus.T.forBarplot.noSingletons.bypatient), function(i) {
  stackedBP(s.plus.T.forBarplot.noSingletons.bypatient[[i]]) +
    ggtitle(
      paste0(
        names(s.plus.T.forBarplot.noSingletons.bypatient[i]),
        " (n=",
        sum(s.plus.T.forBarplot.noSingletons.bypatient[[i]]$count), ")"
      )
    )
})

# Assemble the per-patient plots into one page with legends hidden
patchwork::wrap_plots(spt_noSingletons_list, guides = "collect") +
  plot_annotation(
    title = paste0(
      ifelse(repertoire == "T", "T cells", "B cells"),
      " with a matching ",
      ifelse(repertoire == "T", "TCRB", "IGH"),
      " in TLS (sc clone ct > 1)"
    )
  ) & theme(legend.position = "none")

###########################
# Objects 2A-2D are built from the TLS repertoires with singletons excluded,
# i.e. only clonotypes with clone count > 1 in TLS that map onto the
# peripheral sc data.

# 2A: prepforBarplot() table for seurat.TLS.excludeSingletons
s.plus.T.excludeSingletons.forBarplot <- prepforBarplot(seurat.TLS.excludeSingletons)

# NOTE(review): n in this title is the number of unique TCRB_or_IGH values,
# whereas the sibling plots report sum(count) -- confirm this is intended.
title_2A <- paste0(
  ifelse(repertoire == "T", "T cells", "B cells"),
  " matching expanded ",
  ifelse(repertoire == "T", "TCRB", "IGH"),
  " (clones > 1) in TLS",
  " (n=", length(unique(s.plus.T.excludeSingletons.forBarplot$TCRB_or_IGH)), ")"
)
stackedBP(s.plus.T.excludeSingletons.forBarplot) +
  labs(title = title_2A)

#2B: one prepforBarplot() table per patient, TLS singletons excluded
s.plus.T.excludeSingletons.forBarplot.bypatient <- lapply(seurat.TLS.excludeSingletons.bypatient, prepforBarplot)

# One stacked barplot per patient, titled with the patient name and the summed
# `count` column. seq_along() avoids iterating over c(1, 0) when there are no
# patients.
spt_excludeSingletons_list <- lapply(seq_along(s.plus.T.excludeSingletons.forBarplot.bypatient), function(i) {
  stackedBP(s.plus.T.excludeSingletons.forBarplot.bypatient[[i]]) +
    ggtitle(
      paste0(
        names(s.plus.T.excludeSingletons.forBarplot.bypatient[i]),
        " (n=",
        sum(s.plus.T.excludeSingletons.forBarplot.bypatient[[i]]$count), ")"
      )
    )
})

# Assemble the per-patient plots into one page with legends hidden
patchwork::wrap_plots(spt_excludeSingletons_list, guides = "collect") +
  plot_annotation(
    title = paste0(
      ifelse(repertoire == "T", "T cells", "B cells"),
      " with a matching expanded ",
      ifelse(repertoire == "T", "TCRB", "IGH"),
      " (clones > 1) in TLS "
    )
  ) & theme(legend.position = "none")


# 2C: keep only the rows of 2A whose countSum exceeds 1, then draw the stacked
# barplot titled with the summed `count` of the remaining rows
keep_expanded_2A <- s.plus.T.excludeSingletons.forBarplot$countSum > 1
s.plus.T.excludeSingletons.forBarplot.noSingletons <-
  s.plus.T.excludeSingletons.forBarplot[keep_expanded_2A, ]

title_2C <- paste0(
  ifelse(repertoire == "T", "T cells", "B cells"),
  " matching expanded ",
  ifelse(repertoire == "T", "TCRB", "IGH"),
  " (clones > 1) in TLS",
  ", sc clone ct > 1", " (n=",
  sum(s.plus.T.excludeSingletons.forBarplot.noSingletons$count), ")"
)
stackedBP(s.plus.T.excludeSingletons.forBarplot.noSingletons) +
  ggtitle(title_2C)

#Create 2D by subsetting 2B for clones with count > 1, then plot with stackedBP function
# lapply() over the named list keeps the patient names.
s.plus.T.excludeSingletons.forBarplot.noSingletons.bypatient <- lapply(
  s.plus.T.excludeSingletons.forBarplot.bypatient,
  function(clones) clones[clones$countSum > 1, ]
)

# One stacked barplot per patient, titled with the patient name and the summed
# `count` column
spt_excludeSingletons_noSingletons_list <- lapply(seq_along(s.plus.T.excludeSingletons.forBarplot.noSingletons.bypatient), function(i) {
  stackedBP(s.plus.T.excludeSingletons.forBarplot.noSingletons.bypatient[[i]]) +
    ggtitle(
      paste0(
        names(s.plus.T.excludeSingletons.forBarplot.noSingletons.bypatient[i]),
        " (n=",
        sum(s.plus.T.excludeSingletons.forBarplot.noSingletons.bypatient[[i]]$count), ")"
      )
    )
})

# Assemble the per-patient plots into one page with legends hidden.
# The title no longer repeats " in TLS" twice.
patchwork::wrap_plots(spt_excludeSingletons_noSingletons_list, guides = "collect") +
  plot_annotation(
    title = paste0(
      ifelse(repertoire == "T", "T cells", "B cells"),
      " with a matching ",
      ifelse(repertoire == "T", "TCRB", "IGH"),
      " (clones > 1) in TLS",
      " (sc clone ct > 1)"
    )
  ) & theme(legend.position = "none")

# Close stackedBP.pdf
dev.off()
```

#### 4. Occupied repertoire

```{r occupiedRepertoire}
# occRepWrapper(): occupied-repertoire barplot split by TLS status.
#   x:             Seurat object with TLS.present and cloneType metadata.
#   my_proportion: passed to occupiedscRepertoire() as `proportion`.
# Inside the function, cloneType is prefixed with "TLS present_" /
# "TLS absent_" (by TLS.present == 1) and turned into a factor so the legend
# lists the TLS-present bins first. Only the local copy `x` is changed; the
# caller's object keeps its original cloneType.
# Returns the plot recoloured with blues (TLS present) and reds (TLS absent).
occRepWrapper <- function(x, my_proportion) {
  # Clone-size bin labels, largest to smallest
  size_bins <- c(
    "Hyperexpanded (100 < X <= 500)",
    "Large (20 < X <= 100)",
    "Medium (5 < X <= 20)",
    "Small (1 < X <= 5)",
    "Single (0 < X <= 1)"
  )
  tls_levels <- c(
    paste0("TLS present_", size_bins),
    paste0("TLS absent_", size_bins)
  )

  tls_status <- ifelse(x$TLS.present == 1, "TLS present", "TLS absent")
  x$cloneType <- paste0(tls_status, "_", x$cloneType)
  # Values that match no level (e.g. cells without a cloneType) become NA
  x@meta.data$cloneType <- factor(x@meta.data$cloneType, levels = tls_levels)

  # One colour per level, darkest shade for the largest bin
  TLS_clonetype_col <- c(
    brewer.pal(6, "Blues")[6:2],
    brewer.pal(6, "Reds")[6:2]
  )
  names(TLS_clonetype_col) <- levels(x@meta.data$cloneType)

  p1 <- occupiedscRepertoire(
    x,
    label = FALSE,
    x.axis = "ident",
    proportion = my_proportion
  ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    theme(legend.title = element_blank(), legend.position = "right")
  set_palette(p1, TLS_clonetype_col)
}

# Multi-page PDF of occupied-repertoire plots
pdf(paste0(output.path, "occRep.pdf"))

# Page 1: proportions per identity, spectral palette
occRep_output <- occupiedscRepertoire(
  seurat,
  label = FALSE, x.axis = "ident", proportion = TRUE
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  theme(legend.title = element_blank(), legend.position = "right")
set_palette(occRep_output, pals::brewer.spectral(5))

# Page 2: absolute counts per identity, spectral palette
occRep_output2 <- occupiedscRepertoire(
  seurat,
  label = FALSE, x.axis = "ident", proportion = FALSE
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  theme(legend.title = element_blank(), legend.position = "right")
set_palette(occRep_output2, pals::brewer.spectral(5))

# Pages 3-4: same plots split by TLS status (proportions, then counts)
occRepWrapper(seurat, TRUE)
occRepWrapper(seurat, FALSE)

# Final page: one TLS-split proportion plot per patient, assembled with
# patchwork. Only drawn when more than one patient is present.
# The subset() expression is kept exactly as written because it is evaluated
# non-standardly.
if (length(unique(seurat$Patient)) > 1) {
  occRep.by.patient <- lapply(1:length(unique(seurat$Patient)), function(i) {
    temp <- subset(x = seurat, subset = Patient == unique(seurat$Patient)[i])
    occRepWrapper(temp, TRUE) +
      ggtitle(unique(seurat$Patient)[i])
  })
  patchwork::wrap_plots(occRep.by.patient, guides = "collect") &
    theme(legend.position = "none")
}
dev.off()
```

### C. Fisher's tests
#### 1. Fisher's exact test for clonal expansion

```{r}
# Fisher's exact test for association between cluster membership and clonal
# expansion: each cluster vs all other cells, expanded (clone Frequency > 5)
# vs non-expanded (Frequency 1-5), restricted to cells with a TCR.

# fisher_or_by_cluster(): one Fisher's exact test per cluster.
#   cells: data frame with columns `active.ident` and `expansion`
#          ("Expanded" / "Non-Expanded").
# Returns a data frame (Cluster, p-value, OR, 95%_CI_Lower, 95%_CI_Upper)
# sorted by decreasing OR.
# Both table margins are factors with explicit level order, so the 2x2 table
# is always laid out as
#              Expanded  Non-Expanded
#   cluster        a          b
#   Non-cluster    c          d
# and OR > 1 always means "cells in this cluster are more often expanded".
# With plain character labels table() sorts the rows alphabetically, which
# swaps the rows (and inverts the OR) for every cluster whose name sorts after
# "Non-<name>", e.g. "Treg". Explicit levels also keep the table 2x2 when a
# level has no cells.
fisher_or_by_cluster <- function(cells) {
  column_names <- c("Cluster", "p-value", "OR", "95%_CI_Lower", "95%_CI_Upper")
  expansion <- factor(cells$expansion, levels = c("Expanded", "Non-Expanded"))

  clusters <- as.character(unique(cells$active.ident))
  clusters <- clusters[!is.na(clusters)]
  if (length(clusters) == 0) {
    # Nothing to test (e.g. an empty patient subset): return an empty table
    empty <- data.frame(
      character(), character(), numeric(), character(), character()
    )
    colnames(empty) <- column_names
    return(empty)
  }

  rows <- lapply(clusters, function(cl) {
    # %in% treats NA identities as "not in cluster"
    membership <- factor(
      cells$active.ident %in% cl,
      levels = c(TRUE, FALSE),
      labels = c(cl, paste0("Non-", cl))
    )
    test <- fisher.test(table(membership, expansion))
    c(
      cl,
      if (test$p.value < 0.001) "< 0.001" else round(test$p.value, 3),
      round(as.numeric(test$estimate), 2),
      round(test$conf.int, 2)
    )
  })

  result <- as.data.frame(matrix(unlist(rows), ncol = 5, byrow = TRUE))
  colnames(result) <- column_names
  result$OR <- as.numeric(as.character(result$OR))
  arrange(result, desc(OR))
}

fisher.df <- seurat@meta.data %>%
  select(Frequency, hasTCR, active.ident, cloneType) %>%
  #subset for only the cells with a TCR
  filter(hasTCR == "Yes") %>%
  # binary expansion column: expanded = frequency > 5, otherwise non-expanded
  mutate(expansion = case_when(
    Frequency > 5 ~ "Expanded",
    .default = "Non-Expanded"
  ))
fisher.df

OR_table <- fisher_or_by_cluster(fisher.df)
OR_table
write.csv(OR_table, paste0(output.path, "fisher_exact_test_clonal_expansion_greater_than_5_clones_by_cluster.csv"), row.names = FALSE)

#############
## same fisher's exact test, but by patient
#############
fisher.df.bypatient <- seurat@meta.data %>%
  select(Patient, Frequency, hasTCR, active.ident, cloneType) %>%
  #subset for only the cells with a TCR
  filter(hasTCR == "Yes") %>%
  # binary expansion column: expanded = frequency > 5, otherwise non-expanded
  mutate(expansion = case_when(
    Frequency > 5 ~ "Expanded",
    .default = "Non-Expanded"
  )) %>%
  split(., f = .$Patient)

# One sheet per patient; lapply() keeps the patient names as sheet names
lapply(fisher.df.bypatient, fisher_or_by_cluster) %>%
  writexl::write_xlsx(., paste0(output.path, "fisher_exact_test_clonal_expansion_greater_than_5_clones_by_cluster_byPatient.xlsx"))
```

#### 2. Fisher's exact test for cluster association with TLS

```{r}
# Fisher's exact test for association between cluster membership and presence
# of the cell's TCR in TLS: each cluster vs all other cells, restricted to
# cells with a TCR.
fisher.df <- seurat@meta.data %>%
  select(hasTCR, active.ident, TLS.present) %>%
  #subset for only the cells with a TCR
  filter(hasTCR == "Yes")
nrow(fisher.df)
fisher.df$TLS.present <- factor(fisher.df$TLS.present, levels = c(1, 0), labels = c("TLS present", "TLS absent"))
table(fisher.df$active.ident, fisher.df$TLS.present)
fisher.df

tls_clusters <- as.character(unique(fisher.df$active.ident))
tls_clusters <- tls_clusters[!is.na(tls_clusters)]

# One test per cluster. Cluster membership is a factor with explicit level
# order (cluster first, "Non-<cluster>" second), so the 2x2 table is always
#                cluster  Non-cluster
#   TLS present     a          b
#   TLS absent      c          d
# and OR > 1 always means "cells in this cluster more often carry a TCR found
# in TLS". With plain character labels table() sorts the columns
# alphabetically, which inverts the OR for every cluster whose name sorts
# after "Non-<name>", e.g. "Treg".
or_rows <- lapply(tls_clusters, function(cl) {
  # %in% treats NA identities as "not in cluster"
  membership <- factor(
    fisher.df$active.ident %in% cl,
    levels = c(TRUE, FALSE),
    labels = c(cl, paste0("Non-", cl))
  )
  dat <- table(fisher.df$TLS.present, membership)
  test <- fisher.test(dat)
  # cluster, p-value, odds ratio, CI lower, CI upper
  c(
    cl,
    if (test$p.value < 0.001) "< 0.001" else round(test$p.value, 3),
    round(as.numeric(test$estimate), 2),
    round(test$conf.int, 2)
  )
})

OR_table <- as.data.frame(matrix(unlist(or_rows), ncol = 5, byrow = TRUE))
colnames(OR_table) <- c("Cluster", "p-value", "OR", "95%_CI_Lower", "95%_CI_Upper")
OR_table$OR <- as.numeric(as.character(OR_table$OR))
OR_table <- arrange(OR_table, desc(OR))
OR_table
write.csv(OR_table, paste0(output.path, "fisher_exact_test_TLS_presence_vs_cluster.csv"), row.names = FALSE)

```

# IV. Export OT6 PBMC data for OT6 TIL analysis

```{r export OT6 PBMC data for OT6 only analysis, eval=F}
# Subset the object to patient OT6, report how many cells it holds, and save
# it as an RDS file under output.path
seurat.pbmc.OT6 <- subset(seurat, subset = Patient == "OT6")
nrow(seurat.pbmc.OT6@meta.data)
ot6_rds_path <- paste0(output.path, "seurat_pbmc_azimuth_l3_OT6.rds")
saveRDS(seurat.pbmc.OT6, file = ot6_rds_path)
```

# V. Plot TLS composition by cluster
## A. cluster colors
```{r}
  # Add a light-gray entry named "unmatched" to the cluster palette.
  # Guarded so that re-running this chunk does not append a duplicate entry.
  if (!"unmatched" %in% names(cluster_colors)) {
    cluster_colors=c(cluster_colors, "lightgray")
    names(cluster_colors)[length(cluster_colors)] <- "unmatched" #name last item, "lightgray", as unmatched
  }
  levels(cluster_colors) = cluster_colors
  cluster_colors
```

## B. plots
Do just HCC02, HCC08, HCC09, HCC13, HCC14, and OT1 here

```{r TLS_composition, eval=F}
#####################################################
source("scripts/TLS_composition_plot.R") #load TLS_composition function

### make figures for every patient that has both TLS and single-cell data
patients <- adaptiveTLS$meta$Patient.ID %>% unique

# TLS_composition requires that the factor has no levels absent from the
# vector, so keep only patients found in the single-cell metadata and rebuild
# the factor with just those levels.
# NOTE(review): levels(patients) is NULL unless Patient.ID is a factor --
# confirm its type.
sc_patients <- unique(seurat@meta.data$Patient)
patients <- patients[patients %in% sc_patients]
patients <- factor(patients, levels = levels(patients)[levels(patients) %in% sc_patients])

# seq_along() avoids iterating over c(1, 0) when no patient remains
for (i in seq_along(patients)) {
  TLS_composition(patients[i])
}


### Make summary figure for all patients
#### This repeats the first part of the TLS_composition function to build the
#### list `all`: one table per patient with the TLS TCR data collapsed by the
#### single-cell cluster each clonotype matches.

# Add the "unmatched" colour once, before the loop. Appending it inside the
# loop grows cluster_colors by one duplicate entry per patient.
if (!"unmatched" %in% names(cluster_colors)) {
  cluster_colors = c(cluster_colors, "lightgray")
  names(cluster_colors)[length(cluster_colors)] = "unmatched"
}

all = list()
for (i in patients) {
  # TLS samples of this patient.
  # NOTE(review): grep() is a regex/substring match, so a patient ID that is a
  # prefix of another ID would also pick up that patient's samples -- confirm
  # the IDs are unambiguous.
  meta_subset <- adaptiveTLS$meta[grep(i, adaptiveTLS$meta$Sample), ]
  TLS_sampNames <- meta_subset$Sample
  tls_tables <- adaptiveTLS$data[TLS_sampNames]

  # Stack the TLS repertoires and recompute Proportion from Clones within each
  # TLS. Per the original author's note, the supplied Proportion column does
  # not add up to 100% because non-productive sequences were excluded.
  TLSrep <- bind_rows(tls_tables, .id = "TLS") %>%
    select(TLS, Clones, Proportion, CDR3.nt, CDR3.aa) %>%
    group_by(TLS) %>%
    mutate(new_prop = Clones / sum(Clones)) %>%
    ungroup() %>%
    select(-Proportion) %>%
    rename(., Proportion = new_prop)

  #####################################################
  # Build single cell table for that same patient.
  # Some entries hold the character string "NA" rather than NA; convert them
  # so is.na() recognises them. across() replaces the deprecated
  # mutate_all(funs(...)).
  scRep <- seurat@meta.data %>%
    filter(Patient == i) %>%
    select(active.ident, TCRB_or_IGH, TCRB_or_IGH2) %>%
    mutate(across(everything(), function(col) replace(col, col == "NA", NA)))

  # Drop cells where BOTH TCRB_or_IGH and TCRB_or_IGH2 are NA.
  # NOTE(review): cells with NA only in TCRB_or_IGH are kept and end up in an
  # NA group below -- confirm whether `|` was intended.
  scRep.dropNA <- scRep[!with(scRep, is.na(TCRB_or_IGH) & is.na(TCRB_or_IGH2)), ]

  # Cells per clonotype and cluster, one column per cluster
  clone_by_cluster <- scRep.dropNA %>%
    group_by(TCRB_or_IGH, active.ident) %>%
    summarise(TCRB_or_IGH_n = n()) %>%
    pivot_wider(., names_from = active.ident, values_from = TCRB_or_IGH_n)
  # Number of clusters in which each clonotype occurs
  clone_by_cluster$subsets <- rowSums(!is.na(clone_by_cluster[, 2:ncol(clone_by_cluster)]))

  # top_r = cluster holding the most cells of the clonotype. across() replaces
  # the deprecated summarise_each(funs(sum)).
  scRep_final <- clone_by_cluster %>%
    group_by(TCRB_or_IGH) %>%
    summarise(across(everything(), sum)) %>%
    mutate(top_r = apply(.[, 2:(ncol(.) - 1)], 1, function(x) names(x)[which.max(x)]))

  ####################################################
  # Join TLS data and single cell data on the CDR3 amino-acid sequence
  df <- left_join(TLSrep, select(scRep_final, c(TCRB_or_IGH, top_r)),
                  by = c("CDR3.aa" = "TCRB_or_IGH"))
  df$top_r <- df$top_r %>% replace_na("unmatched")
  df$top_r <- factor(x = df$top_r, levels = c(as.character(levels.manual), "unmatched")) #set levels and add "unmatched" to levels
  df$expanded <- if_else(df$Clones == 1, "singleton", "expanded")
  df$known <- if_else(df$top_r == "unmatched", 0, 1)
  df <- merge(df, meta_subset[, c("Sample", "TLS_type")], by.x = "TLS", by.y = "Sample")
  df$TLS <- df$TLS %>% str_replace("-T_TCRB", "")

  ############### everything to here mirrors the TLS_composition function #################
  ### in future, consolidate the two and return df so it can be plotted outside the function ############

  # Drop unmatched clonotypes, recompute Proportion over all remaining clones
  # of the patient, then sum per matched cluster and order by decreasing share.
  data3 <- df[!df$top_r %in% ("unmatched"), ] %>%
    # group_by(TLS) %>%
    mutate(new_prop = Clones / sum(Clones)) %>% #recalculates proportion
    select(-Proportion) %>%
    rename(., Proportion = new_prop) %>%
    ungroup() %>%
    group_by(top_r) %>% ### ADD TLS_type here????
    summarise(Proportion_sum = sum(Proportion),
              Clones_sum = sum(Clones)) %>%
    ungroup() %>%
    arrange(., desc(Proportion_sum)) %>% #arrange in desired order
    mutate(ymax = cumsum(Proportion_sum),
           ymin = c(0, head(ymax, n = -1)))
  all[[i]] <- data3
}

# Combine the per-patient tables and keep patients in order of first appearance
all_df <- bind_rows(all, .id = "Patient")
all_df$Patient <- factor(all_df$Patient, levels = unique(all_df$Patient))

# Theme tweaks: horizontal x labels, no ticks, no grid or panel background,
# bold strip text
composition_theme <- theme(
  axis.text.x = element_text(angle = 0),
  axis.ticks.x = element_blank(),
  # axis.text.y=element_blank(),
  axis.ticks.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  strip.background = element_blank(),
  strip.text = element_text(face = "bold") # ,
  # legend.title = element_blank(),legend.text=element_text(face="bold")
)

# One filled bar per patient: Clones_sum shown as a share, coloured by top_r
p <- ggplot(all_df, aes(fill = top_r, y = Clones_sum, x = Patient)) +
  geom_bar(position = "fill", stat = "identity", color = "white") +
  scale_fill_manual(values = c(cluster_colors)) +
  labs(x = "Patient", y = "Proportion (%)") +
  # theme_void()+
  ggprism::theme_prism() +
  composition_theme +
  coord_cartesian(clip = "off") +
  labs(
    title = paste0("Phenotype of TCR\u03B2 in TLS inferred by\n  matching with peripheral single cell data"),
    # caption="*PBMC only, unmatched excluded"
  ) +
  # guides(fill=guide_legend(title=""))+
  scale_y_continuous(labels = scales::percent_format(suffix = ""))
p

# Save the same plot as a PDF
pdf(paste0(output.path, "TLS_composition_plot_all.pdf"))
p
dev.off()
```


# VI. What % of cells with a TCR were detected in TLS

```{r}
# Per cluster, count the TCR-bearing cells by TLS.present and express each
# count as a percentage of the cluster total. The result is ungrouped so later
# operations on the table are not silently applied per cluster.
summary_by_cluster <- seurat@meta.data[seurat@meta.data$hasTCR == "Yes", ] %>%
  group_by(active.ident, TLS.present) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(active.ident) %>%
  mutate(total = sum(n)) %>%
  mutate(pct = (n / total) * 100) %>%
  ungroup()
summary_by_cluster$pct <- signif(summary_by_cluster$pct, 2)
summary_by_cluster$n_pct <- paste0(summary_by_cluster$n, " \n (", summary_by_cluster$pct, "%)")

# `labels` is spelled out in full rather than relying on partial matching of
# `label`
summary_by_cluster$TLS.present <- factor(summary_by_cluster$TLS.present, levels = c(1, 0), labels = c("Yes", "No"))

summary_by_cluster

# Stacked bars: absolute number of single cells per cluster
pdf(paste0(output.path, "matched_TCRs_by_single_cell_cluster.pdf"))
ggplot(summary_by_cluster, aes(x = active.ident, y = n, fill = TLS.present # ,label=paste0(pct,"%")
)) +
  geom_bar(position = "stack", stat = "identity", color = "white") +
  # scale_fill_discrete(name = "Present In TLS")+
  ggsci::scale_fill_aaas(alpha = 0.9, name = "Present In TLS") +
  xlab("Cluster") + ylab("Single cells") +
  # scale_y_continuous(labels = scales::percent)+
  # geom_text(size = 3, position = position_stack(vjust = 0.5), color="white")+
  ggtitle("Peripheral single cell clusters associated with TLS") +
  ggprism::theme_prism() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        legend.title = element_text())

dev.off()

# Filled bars: share of each cluster's cells whose TCR is present in TLS
pdf(paste0(output.path, "matched_TCRs_by_single_cell_cluster_fill.pdf"))
ggplot(summary_by_cluster, aes(x = active.ident, y = n, fill = TLS.present # ,label=paste0(pct,"%")
)) +
  geom_bar(position = "fill", stat = "identity", color = "white") +
  ggsci::scale_fill_aaas(alpha = 0.9, name = "Present In TLS") +
  xlab("Cluster") + ylab("Proportion (%)") +
  scale_y_continuous(labels = scales::percent_format(suffix = "")) +
  # geom_text(size = 3, position = position_fill(vjust = 0.5), color="white")+
  ggtitle("Peripheral single cell clusters associated with TLS") +
  ggprism::theme_prism() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        legend.title = element_text())
dev.off()
```

# VII. Session Info
```{r sessioninfo}
# Show the session information in the knitted report, then save the same
# printed output to a text file in the working directory
sessionInfo()
session_lines <- capture.output(sessionInfo())
writeLines(session_lines, "sessionInfo_scRNA-TCR_PBMC.txt")
```