6.2 Functions
I’m likely going to wrap all of this into a package soon, but here we will start with a set of packages and functions used throughout the study. I need to go through and annotate with package was used with with function, and that will take a little time. So for the sake of making the code available as soon as possible, I’ll present the intermediate product here.
Packages The following are a list of packages that are used during the study. I will update this list with the minial set and begin to put together a package soon.
library(ape)
library(circlize)
library(cooccur)
library(corrplot)
library(cowplot)
library(data.table)
library(dplyr)
library(forcats)
library(gdata)
library(ggbeeswarm)
library(ggcorrplot)
library(ggnet)
library(ggplotify)
library(ggplot2)
library(ggrepel)
library(ggridges)
library(grid)
library(gridExtra)
library(ggthemes)
library(ggtree)
library(igraph)
library(knitr)
library(miscTools)
library(network)
library(pals)
library(parallel)
library(pheatmap)
library(phylobase)
library(purrr)
library(qgraph)
library(RColorBrewer)
library(rcompanion)
library(ReinforcementLearning)
library(reshape2)
library(sna)
library(tidygraph)
library(tidyr)
library(tidyverse)
library(TRONCO)
library(vegan)
library(widyr)Custom functions
options(stringsAsFactors=FALSE)
random_distribution_NGT_CI <- function(NGT_to_clone_list,clonal_abundance,sample_to_test,replicates,clone_cutoff){
mat_of_interest <- NGT_to_clone[[sample_to_test]][,1:(ncol(NGT_to_clone[[sample_to_test]])-1)]
bulk_VAF_order <-names(sort(colSums(mat_of_interest),decreasing=TRUE))
test<-replicate(n=replicates,apply(mat_of_interest,MARGIN=c(2),function(x){
z<-sample(x,size=length(x),replace=FALSE)
return(as.numeric(z))}),simplify = "array")
test2<-apply(test,3,function(q){
x<-data.frame(q[,bulk_VAF_order],
"Clone"=apply(q[,bulk_VAF_order],1, function(p){paste(p,sep="_",collapse="_")}))
y<-data.table(data.frame("Count"=as.matrix(table(x[,"Clone"])),
"Clone"=names(table(x[,"Clone"]))),
key="Count")
y$Clone<-factor(y$Clone,levels=rev(as.character(y$Clone)))
return(data.frame(y))})
if(class(test2)=="list"){y <- lapply(test2,data.frame) %>% reduce(full_join, by = "Clone")}
if(class(test2)=="array"){y <- apply(test2,3,data.frame) %>% reduce(full_join, by = "Clone")}
y[is.na(y)]<-0
rownames(y)<-y[,"Clone"]
y<-as.matrix(y[,-2])
mode(y)<-'numeric'
z<-t(apply(y,1,function(p){quantile(p,probs=c(0.025,0.975))}))
z_mat<-data.frame(z,"Clone"=rownames(z))
set<-setNames(data.frame(full_join(data.frame(clonal_abundance[[sample_to_test]]),z_mat,by="Clone")),c("Count","Clone","LCI","UCI"))%>%filter(LCI>=clone_cutoff&!is.na(Count))
return(set)
}
bootstrap_allele_dropout_NGT_mat <- function(NGT_to_clone_list,clonal_abundance,sample_to_test,replicates,clone_cutoff){
mat_of_interest <- NGT_to_clone[[sample_to_test]][,1:(ncol(NGT_to_clone[[sample_to_test]])-1)]
bulk_VAF_order <-names(sort(colSums(mat_of_interest),decreasing=TRUE))
test<-replicate(n=replicates,apply(mat_of_interest,MARGIN=c(1,2),function(x){
if(x==1){z<-x}
if(x==0){z<-sample(x=c(0,1),size=1,prob=c(0.9,0.1))}
if(x==2){z<-sample(x=c(2,1),size=1,prob=c(0.9,0.1))}
return(as.numeric(z))}),simplify = "array")
test2<-apply(test,3,function(q){
x<-data.frame(q[,bulk_VAF_order],
"Clone"=apply(q[,bulk_VAF_order],1, function(p){paste(p,sep="_",collapse="_")}))
y<-data.table(data.frame("Count"=as.matrix(table(x[,"Clone"])),
"Clone"=names(table(x[,"Clone"]))),
key="Count")
y$Clone<-factor(y$Clone,levels=rev(as.character(y$Clone)))
return(data.frame(y))})
if(class(test2)=="list"){y <- lapply(test2,data.frame) %>% reduce(full_join, by = "Clone")}
if(class(test2)=="array"){y <- apply(test2,3,data.frame) %>% reduce(full_join, by = "Clone")}
y[is.na(y)]<-0
rownames(y)<-y[,"Clone"]
y<-as.matrix(y[,-2])
mode(y)<-'numeric'
z<-t(apply(y,1,function(p){quantile(p,probs=c(0.025,0.975))}))
z_mat<-data.frame(z,"Clone"=rownames(z))
set<-setNames(data.frame(full_join(data.frame(clonal_abundance[[sample_to_test]]),z_mat,by="Clone")),c("Count","Clone","LCI","UCI"))%>%filter(LCI>=clone_cutoff&!is.na(Count))
return(set)
}
bootstrap_NGT_mat <- function(NGT_to_clone_list,clonal_abundance,sample_to_test,replicates,clone_cutoff){
test<-replicate(n=replicates,resample_fun(NGT_to_clone[[sample_to_test]]),simplify = "array")
if(class(test)=="list"){
y <- lapply(test,data.frame) %>% reduce(full_join, by = "Clone")
}
if(class(test)=="array"){
y <- apply(test,3,data.frame) %>% reduce(full_join, by = "Clone")
}
y[is.na(y)]<-0
rownames(y)<-y[,"Clone"]
y<-as.matrix(y[,-2])
mode(y)<-'numeric'
z<-t(apply(y,1,function(p){
quantile(p,probs=c(0.025,0.975))
}))
z_mat<-data.frame(z,"Clone"=rownames(z))
set<-setNames(data.frame(inner_join(data.frame(clonal_abundance[[sample_to_test]]),z_mat,by="Clone")),c("Count","Clone","LCI","UCI"))%>%filter(LCI>=clone_cutoff)
return(set)
}
resample_fun<-function(data){
x <- data[sample(x=1:nrow(data),replace=TRUE),]
return(as.matrix(data.table(data.frame("Count"= as.matrix(table(x[,"Clone"])),
"Clone"=names(table(x[,"Clone"]))),
key="Count")))
}
extract_NGT_files <- function(sample,variants){
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep=""))
return(NGT)
}
count_unknown_cells<- function(sample,variants){
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep=""))
cells_with_unknown<-NGT[apply(NGT[,variants],MARGIN=1,FUN=function(x){sum(x==3)>0}),"Cell"]
return(c(length(cells_with_unknown)/nrow(NGT)))
}
extract_DP_files <- function(sample,variants){
DP <- read.csv(paste("./",sample,"/","DP.csv",sep=""))
cells_with_unknown<-DP[apply(DP[,variants],MARGIN=1,FUN=function(x){sum(x==3)>0}),"Cell"]
matrix_of_interest<-DP[!DP$Cell%in%cells_with_unknown,variants]
return(matrix_of_interest)
}
extract_SNP_info <- function(sample){
AF <- read.csv(paste("./",sample,"/","AF.csv",sep=""))
if(any(grepl("SNP_INFO.csv",list.files(paste("./",sample,"/",sep=""))))){
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cdna"]!="",paste(x["gene"],x["cdna"],sep="_"),
ifelse(x["amplicon"]%in%c("MSK_RL_AMP54", "MSK_RL_AMP55", "MSK_RL_AMP56","MSK_RL_AMP57","MSK_RL_AMP58"),paste("FLT3_INS",x["POS"],x["ALT"],sep="_"),"")))
})
} else {
SNP_INFO <- read.csv(paste("./",sample,"/","Variants.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
colnames(SNP_INFO)[3] <- "protein"
SNP_INFO$cDNA <- as.character(SNP_INFO$cDNA)
SNP_INFO$protein <- as.character(SNP_INFO$protein)
SNP_INFO$Variant <- as.character(SNP_INFO$Variant)
SNP_INFO$Gene <- as.character(SNP_INFO$Gene)
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cDNA"]!="",paste(x["Gene"],x["cDNA"],sep="_"),
ifelse(grepl("^chr13",x["Variant"]),paste("FLT3_INS",x["Variant"],sep="_"),"")))
})
}
return(SNP_INFO)
}
# run the following just once to load the function
plot_variants_and_selection <- function(sample){
#Load in the data
AF <- read.csv(paste("./",sample,"/","AF.csv",sep="")) # - variant allele frequency
DP <- read.csv(paste("./",sample,"/","DP.csv",sep="")) # - read depth
GQ <- read.csv(paste("./",sample,"/","GQ.csv",sep="")) # - quality scores
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep="")) # - genotype call converted to categorical (0-reference, 1-heterozygous mutation, 2-homozygous mutation, 3-unknown)
if(any(grepl("SNP_INFO.csv",list.files(paste("./",sample,"/",sep=""))))){
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cdna"]!="",paste(x["gene"],x["cdna"],sep="_"),
ifelse(x["amplicon"]%in%c("MSK_RL_AMP54", "MSK_RL_AMP55", "MSK_RL_AMP56","MSK_RL_AMP57","MSK_RL_AMP58"),paste("FLT3_INS",x["POS"],x["ALT"],sep="_"),"")))
})# - variant metadata
} else {
SNP_INFO <- read.csv(paste("./",sample,"/","Variants.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
colnames(SNP_INFO)[3] <- "protein"
SNP_INFO$cDNA <- as.character(SNP_INFO$cDNA)
SNP_INFO$protein <- as.character(SNP_INFO$protein)
SNP_INFO$Variant <- as.character(SNP_INFO$Variant)
SNP_INFO$Gene <- as.character(SNP_INFO$Gene)
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cDNA"]!="",paste(x["Gene"],x["cDNA"],sep="_"),
ifelse(grepl("^chr13",x["Variant"]),paste("FLT3_INS",x["Variant"],sep="_"),"")))
})# - variant metadata
}
protein_changes_of_interest <- (SNP_INFO %>% filter(protein!="")%>%select(protein))[,1] # select column of variants
SNP_changes_of_interest <- rownames(SNP_INFO)[SNP_INFO[,"protein"]%in%protein_changes_of_interest]
SNP_protein_key <- data.frame("Mutant"=SNP_changes_of_interest,"Protein"=protein_changes_of_interest)
# Cell and clone selection and Average depth per cell per allele
distribution_of_unknowns_by_variant <- apply(NGT[,SNP_changes_of_interest],MARGIN=2,table)
NGT_subset<-NGT[,c("Sample","Cell",SNP_changes_of_interest)]
#NGT_interest$Clone <- apply(NGT_interest[,-c(1,2)],1,function(x){paste(x,sep="_",collapse="_")})
#clonal_abundance <- sort(table(NGT_interest$Clone))
DP_subset<- DP[,c("Sample","Cell",SNP_changes_of_interest)]
DP_NGT_melt<-inner_join(melt(DP_subset[,-1],id.var="Cell"),melt(NGT_subset[,-1],id.var="Cell"),by=c("Cell","variable"))
colnames(DP_NGT_melt) <- c("Cell","Mutant","Depth","Genotype")
DP_NGT_melt$Genotype <- ifelse(DP_NGT_melt$Genotype==3,"Unknown",
ifelse(DP_NGT_melt$Genotype==0,"WT",
ifelse(DP_NGT_melt$Genotype==1,"Heterozygous",
ifelse(DP_NGT_melt$Genotype==2,"Homozygous",DP_NGT_melt$Genotype))))
DP_NGT_melt$Genotype <- factor(DP_NGT_melt$Genotype ,levels=c("WT","Heterozygous","Homozygous","Unknown"))
summarized<-data.frame(DP_NGT_melt%>%group_by(Mutant,Genotype)%>%summarise("Depth"=mean(Depth)))
summarized_no_unknown <- summarized %>% filter(Genotype!="Unknown")
grouped <- group_by(summarized_no_unknown, Mutant)
output <- data.frame(summarise(grouped, mean=mean(Depth), sd=sd(Depth)))
output$Include <- rep("Include",length(rownames(output)))
output_final <- inner_join(SNP_protein_key,output)
depth_plot<- ggplot(summarized_no_unknown,aes(x=Mutant,y=Depth,color=Genotype))+geom_point()+coord_flip()+
scale_x_discrete(limits = rev(levels(summarized_no_unknown$Mutant)))+theme_bw()#+
write.csv(output_final,paste0("./",sample,"/",sample,"_VARIANT_SELECTION.csv"),row.names=FALSE)
ggsave(paste0("./",sample,"/",sample,"_VARIANT_SELECTION.pdf"),depth_plot,width=10,height=20)
}
run_mbio_analysis<-function(sample,diversity){
#Load in the data
if(any(grepl("VARIANT_SELECTION_LAM",list.files(paste("./",sample,"/",sep=""))))){
LAM_VARIANTS_MAT <- read.csv(paste("./",sample,"/",sample,"_VARIANT_SELECTION_LAM.csv",sep=""))
LAM_VARIANTS_SNP <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Mutant))[,1])
LAM_VARIANTS_Protein <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Protein))[,1])
} else {
print("No variant selection performed!")
return("next")
}
AF <- read.csv(paste("./",sample,"/","AF.csv",sep="")) # - variant allele frequency
DP <- read.csv(paste("./",sample,"/","DP.csv",sep="")) # - read depth
GQ <- read.csv(paste("./",sample,"/","GQ.csv",sep="")) # - quality scores
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep="")) # - genotype call converted to categorical (0-reference, 1-heterozygous mutation, 2-homozygous mutation, 3-unknown)
if(any(grepl("SNP_INFO.csv",list.files(paste("./",sample,"/",sep=""))))){
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cdna"]!="",paste(x["gene"],x["cdna"],sep="_"),
ifelse(x["amplicon"]%in%c("MSK_RL_AMP54", "MSK_RL_AMP55", "MSK_RL_AMP56","MSK_RL_AMP57","MSK_RL_AMP58"),paste("FLT3_INS",x["POS"],x["ALT"],sep="_"),"")))
})# - variant metadata
} else {
SNP_INFO <- read.csv(paste("./",sample,"/","Variants.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
colnames(SNP_INFO)[3] <- "protein"
SNP_INFO$cDNA <- as.character(SNP_INFO$cDNA)
SNP_INFO$protein <- as.character(SNP_INFO$protein)
SNP_INFO$Variant <- as.character(SNP_INFO$Variant)
SNP_INFO$Gene <- as.character(SNP_INFO$Gene)
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cDNA"]!="",paste(x["Gene"],x["cDNA"],sep="_"),
ifelse(grepl("^chr13",x["Variant"]),paste("FLT3_INS",x["Variant"],sep="_"),"")))
})# - variant metadata
}
SNP_changes_of_interest <- LAM_VARIANTS_SNP
protein_changes_of_interest <- LAM_VARIANTS_Protein
SNP_protein_key <- data.frame("Mutant"=SNP_changes_of_interest,"Protein"=protein_changes_of_interest)
# Cell and clone selection
distribution_of_unknowns_by_variant <- apply(NGT[,SNP_changes_of_interest],MARGIN=2,table)
#print(distribution_of_unknowns_by_variant)
cells_with_unknown<-NGT[apply(NGT[,SNP_changes_of_interest],MARGIN=1,FUN=function(x){sum(x==3)>0}),"Cell"]
matrix_of_interest<-NGT[!NGT$Cell%in%cells_with_unknown,SNP_changes_of_interest]
bulk_VAF_order<-SNP_INFO[colnames(matrix_of_interest)[order(colSums(matrix_of_interest),decreasing=TRUE)],"protein"]
bulk_VAF_order <- bulk_VAF_order[!duplicated(bulk_VAF_order)]
matrix_of_interest$Clone <- apply(matrix_of_interest,1,function(x){paste(x,sep="_",collapse="_")})
clonal_abundance <- sort(table(matrix_of_interest$Clone))
clones_to_include<-names(clonal_abundance)[clonal_abundance>length(rownames(matrix_of_interest))*0]
matrix_subset_clones <- matrix_of_interest[matrix_of_interest$Clone%in%clones_to_include,]
#Average depth per cell per allele
DP_subset<- DP[-cells_with_unknown,c("Sample","Cell",rownames(SNP_INFO)[SNP_INFO$protein%in%protein_changes_of_interest])]
NGT_subset <- NGT[NGT$Cell%in%DP_subset$Cell,colnames(DP_subset)]
DP_NGT_melt<-inner_join(melt(DP_subset[,-1],id.var="Cell"),melt(NGT[,-1],id.var="Cell"),by=c("Cell","variable"))
colnames(DP_NGT_melt) <- c("Cell","Mutant","Depth","Genotype")
DP_NGT_melt$Genotype <- ifelse(DP_NGT_melt$Genotype==3,"Unknown",
ifelse(DP_NGT_melt$Genotype==0,"WT",
ifelse(DP_NGT_melt$Genotype==1,"Heterozygous",
ifelse(DP_NGT_melt$Genotype==2,"Homozygous",DP_NGT_melt$Genotype))))
DP_NGT_melt$Genotype <- factor(DP_NGT_melt$Genotype ,levels=c("WT","Heterozygous","Homozygous","Unknown"))
summarized<-data.frame(DP_NGT_melt%>%group_by(Mutant,Genotype)%>%summarise("Depth"=mean(Depth)))
# Verify variants selected are unique and encode protein changes
!any(duplicated(SNP_INFO[colnames(matrix_subset_clones),"protein"])) #Should return TRUE
colnames(matrix_subset_clones)<- c(as.character(SNP_INFO[colnames(matrix_subset_clones)[1:length(SNP_changes_of_interest)],"protein"]),"Clone")
# Aggregate data to get clone counts and architecture.
dedup<-matrix_subset_clones[!duplicated(matrix_subset_clones),]
clonal_architecture <- melt(dedup)
colnames(clonal_architecture) <- c("Clone","Mutant","Genotype")
clonal_architecture$Genotype <- ifelse(clonal_architecture$Genotype==3,NA,
ifelse(clonal_architecture$Genotype==0,"WT",
ifelse(clonal_architecture$Genotype==1,"Heterozygous",
ifelse(clonal_architecture$Genotype==2,"Homozygous",clonal_architecture$Genotype))))
clonal_architecture$Genotype <- factor(clonal_architecture$Genotype ,levels=c("WT","Heterozygous","Homozygous"))
clonal_abundance <- data.frame("Count"=as.matrix(table(matrix_subset_clones$Clone)))
clonal_abundance$Clone <- rownames(clonal_abundance)
clonal_abundance <- data.table(clonal_abundance,key="Count")
clonal_abundance$Clone <- factor(clonal_abundance$Clone,levels=rev(c(clonal_abundance$Clone)))
clonal_architecture$Clone <- factor(as.character(clonal_architecture$Clone),levels=rev(as.character(clonal_abundance$Clone)))
clonal_architecture$Mutant <-factor(clonal_architecture$Mutant , levels=rev(as.character(bulk_VAF_order)))
# Generate clonal architecture heatmap
gg_heatmap <- ggplot(data = clonal_architecture, aes(x = Clone, y = factor(Mutant), fill = Genotype)) +
geom_tile() +
scale_fill_manual(values=c("WT"=brewer.pal(7,"Reds")[1],
"Heterozygous"=brewer.pal(7,"Reds")[3],
"Homozygous"=brewer.pal(7,"Reds")[6],
"Unknown"="grey50"),"Genotype") +
theme_classic() +
ylab("Mutation")+
theme(legend.position = "bottom", legend.direction = "horizontal",
axis.text.x = element_blank(),
axis.line=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
plot.margin=unit(c(0,1,1,1),"cm"))
# Generate clonal abundance barplot
gg_clonal_barplot <- ggplot(data = data.frame(clonal_abundance), aes(x = factor(Clone), y = Count)) +
geom_bar(stat = "identity", aes(fill = Count)) + theme_gray() +
theme_classic()+
ylim(0,max(clonal_abundance$Count)*1.3) +
ylab("Cell Count")+
geom_text(aes(label=Count), position=position_dodge(width=0.9), vjust=-0.25)+
scale_fill_distiller(name = "Value", palette = "Reds", direction = 1) +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line=element_blank(),
legend.position = "none",
plot.margin=unit(c(1,1,-0.2,1),"cm"))
# Generate genotype calls by allele
distribution_of_unknowns_by_variant <- t(apply(NGT[,SNP_changes_of_interest],MARGIN=2,function(x){table(factor(x,levels=c(0,1,2,3)))}))
rownames(distribution_of_unknowns_by_variant) <- SNP_INFO[rownames(distribution_of_unknowns_by_variant),"protein"]
colnames(distribution_of_unknowns_by_variant) <- c("WT","Heterozygous","Homozygous","Unknown")
allele_melted <-melt(distribution_of_unknowns_by_variant)
colnames(allele_melted) <- c("Mutant","Genotype","Cells")
allele_melted$Genotype <- factor(allele_melted$Genotype,levels=rev(c("WT","Heterozygous","Homozygous","Unknown")))
allele_melted$Mutant <-factor(allele_melted$Mutant , levels=as.character(bulk_VAF_order))
#Generate allele barplot
gg_alleles <- ggplot(data = data.frame(allele_melted), aes(x = factor(Mutant), y = Cells)) +
geom_bar(stat = "identity", aes(fill = Genotype),position="stack") + theme_gray() +
theme_minimal()+
ylab("Cell Count")+
scale_fill_manual(values=c("WT"=brewer.pal(7,"Reds")[1],
"Heterozygous"=brewer.pal(7,"Reds")[3],
"Homozygous"=brewer.pal(7,"Reds")[6],
"Unknown"="grey50"),"Genotype") +
theme(axis.text.x=element_text(angle=30,hjust=1,vjust=1))
#Generate depth by alelle by mutant
gg_depth <- ggplot(summarized,aes(x=Mutant,color=Genotype,y=Depth))+
geom_point(position=position_dodge(width=0.4))+
scale_color_manual(values=c("WT"=brewer.pal(7,"Reds")[1],
"Heterozygous"=brewer.pal(7,"Reds")[3],
"Homozygous"=brewer.pal(7,"Reds")[6],
"Unknown"="grey50"),"Genotype") +
theme(axis.text.x=element_text(angle=30,hjust=1,vjust=1),
plot.margin=unit(c(1,1,1,2),"cm"))
# Compute spacing of graphs
plots <- list(gg_depth,gg_clonal_barplot,gg_alleles,gg_heatmap)
grobs <- list()
widths <- list()
for (i in 1:length(plots)){
grobs[[i]] <- ggplotGrob(plots[[i]])
widths[[i]] <- grobs[[i]]$widths[2:5]
}
maxwidth <- do.call(grid::unit.pmax, widths)
for (i in 1:length(grobs)){
grobs[[i]]$widths[2:5] <- as.list(maxwidth)
}
# Generate plot
grob.title <- textGrob(sample, hjust = 0.5, vjust = 0.5, gp = gpar(fontsize = 20))
pdf(paste(output_folder,"Graphs/",sample,".pdf",sep=""), width = 16, height = 8) # Open a new pdf file
grid.arrange(grobs=grobs, ncol = 2, top = grob.title)
graphics.off()
if(diversity==TRUE){
clonal_abundance_mat <- clonal_abundance
clonal_abundance$Clone <- rownames(clonal_abundance)
clonal_abundance_mat$Clone <- rownames(clonal_abundance_mat)
clonal_abundance_count <- data.table(clonal_abundance,key="Count")
shannon <- diversity(clonal_abundance_mat[,"Count"])
write.csv2(shannon,paste(output_folder,"Diversity_score/",sample,".csv",sep=""))
}
}
stats_output<-function(sample){
#Load in the data
if(any(grepl("VARIANT_SELECTION_LAM",list.files(paste("./",sample,"/",sep=""))))){
LAM_VARIANTS_MAT <- read.csv(paste("./",sample,"/",sample,"_VARIANT_SELECTION_LAM.csv",sep=""))
LAM_VARIANTS_SNP <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Mutant))[,1])
LAM_VARIANTS_Protein <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Protein))[,1])
} else {
print("No variant selection performed!")
return("next")
}
AF <- read.csv(paste("./",sample,"/","AF.csv",sep="")) # - variant allele frequency
DP <- read.csv(paste("./",sample,"/","DP.csv",sep="")) # - read depth
GQ <- read.csv(paste("./",sample,"/","GQ.csv",sep="")) # - quality scores
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep="")) # - genotype call converted to categorical (0-reference, 1-heterozygous mutation, 2-homozygous mutation, 3-unknown)
if(any(grepl("SNP_INFO.csv",list.files(paste("./",sample,"/",sep=""))))){
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cdna"]!="",paste(x["gene"],x["cdna"],sep="_"),
ifelse(x["amplicon"]%in%c("MSK_RL_AMP54", "MSK_RL_AMP55", "MSK_RL_AMP56","MSK_RL_AMP57","MSK_RL_AMP58"),paste("FLT3_INS",x["POS"],x["ALT"],sep="_"),"")))
})# - variant metadata
} else {
SNP_INFO <- read.csv(paste("./",sample,"/","Variants.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
colnames(SNP_INFO)[3] <- "protein"
SNP_INFO$cDNA <- as.character(SNP_INFO$cDNA)
SNP_INFO$protein <- as.character(SNP_INFO$protein)
SNP_INFO$Variant <- as.character(SNP_INFO$Variant)
SNP_INFO$Gene <- as.character(SNP_INFO$Gene)
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cDNA"]!="",paste(x["Gene"],x["cDNA"],sep="_"),
ifelse(grepl("^chr13",x["Variant"]),paste("FLT3_INS",x["Variant"],sep="_"),"")))
})# - variant metadata
}
SNP_changes_of_interest <- LAM_VARIANTS_SNP
protein_changes_of_interest <- LAM_VARIANTS_Protein
SNP_protein_key <- data.frame("Mutant"=SNP_changes_of_interest,"Protein"=protein_changes_of_interest)
# Cell and clone selection
distribution_of_unknowns_by_variant <- apply(NGT[,SNP_changes_of_interest],MARGIN=2,table)
#print(distribution_of_unknowns_by_variant)
cells_with_unknown<-NGT[apply(NGT[,SNP_changes_of_interest],MARGIN=1,FUN=function(x){sum(x==3)>0}),"Cell"]
matrix_of_interest<-NGT[!NGT$Cell%in%cells_with_unknown,SNP_changes_of_interest]
bulk_VAF_order<-SNP_INFO[colnames(matrix_of_interest)[order(colSums(matrix_of_interest),decreasing=TRUE)],"protein"]
bulk_VAF_order <- bulk_VAF_order[!duplicated(bulk_VAF_order)]
matrix_of_interest$Clone <- apply(matrix_of_interest,1,function(x){paste(x,sep="_",collapse="_")})
# Aggregate data to get clone counts and architecture.
dedup<-matrix_of_interest[!duplicated(matrix_of_interest),]
clonal_abundance <- data.frame("Count"=as.matrix(table(matrix_of_interest$Clone)))
clonal_abundance$Clone <- factor(rownames(clonal_abundance),levels=rev(c(rownames(clonal_abundance))))
clonal_abundance$Count <- as.numeric(clonal_abundance$Count)
clonal_abundance$Dominance <- apply(clonal_abundance,1,function(x){
(as.numeric(x["Count"])/sum(clonal_abundance["Count"]) ) / (max(as.numeric(clonal_abundance[,"Count"]))/sum(clonal_abundance["Count"]))
})
clonal_abundance$Mutation_count <- apply(clonal_abundance,1,function(x){
sum(as.numeric(strsplit( as.character(x[2]),split="_")[[1]]))
})
shannon <- diversity(clonal_abundance[,"Count"])
print(shannon)
matrix_output <- dedup%>%select(-Clone)
matrix_output_t <- t(matrix_output)
rownames(matrix_output_t) <- 0:(dim(matrix_output_t)[[1]]-1)
write.table(shannon,paste0(output_folder,"Diversity_score/",sample,".txt"),sep="\t",row.names=FALSE)
write.table(matrix_output_t,paste0(output_folder,"Genotype_matrix/",sample,".txt"),sep=" ",row.names=TRUE,quote=FALSE,col.names = FALSE)
write.table(clonal_abundance,paste0(output_folder,"Summary_mat/",sample,".txt"),sep="\t",row.names=FALSE)
}
permute_data <- function(sample,nrun){
#Load in the data
if(any(grepl("VARIANT_SELECTION_LAM",list.files(paste("./",sample,"/",sep=""))))){
LAM_VARIANTS_MAT <- read.csv(paste("./",sample,"/",sample,"_VARIANT_SELECTION_LAM.csv",sep=""))
LAM_VARIANTS_SNP <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Mutant))[,1])
LAM_VARIANTS_Protein <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Protein))[,1])
} else {
print("No variant selection performed!")
return("next")
}
AF <- read.csv(paste("./",sample,"/","AF.csv",sep="")) # - variant allele frequency
DP <- read.csv(paste("./",sample,"/","DP.csv",sep="")) # - read depth
GQ <- read.csv(paste("./",sample,"/","GQ.csv",sep="")) # - quality scores
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep="")) # - genotype call converted to categorical (0-reference, 1-heterozygous mutation, 2-homozygous mutation, 3-unknown)
if(any(grepl("SNP_INFO.csv",list.files(paste("./",sample,"/",sep=""))))){
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cdna"]!="",paste(x["gene"],x["cdna"],sep="_"),
ifelse(x["amplicon"]%in%c("MSK_RL_AMP54", "MSK_RL_AMP55", "MSK_RL_AMP56","MSK_RL_AMP57","MSK_RL_AMP58"),paste("FLT3_INS",x["POS"],x["ALT"],sep="_"),"")))
})# - variant metadata
} else {
SNP_INFO <- read.csv(paste("./",sample,"/","Variants.csv",sep=""))
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
colnames(SNP_INFO)[3] <- "protein"
SNP_INFO$cDNA <- as.character(SNP_INFO$cDNA)
SNP_INFO$protein <- as.character(SNP_INFO$protein)
SNP_INFO$Variant <- as.character(SNP_INFO$Variant)
SNP_INFO$Gene <- as.character(SNP_INFO$Gene)
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cDNA"]!="",paste(x["Gene"],x["cDNA"],sep="_"),
ifelse(grepl("^chr13",x["Variant"]),paste("FLT3_INS",x["Variant"],sep="_"),"")))
})# - variant metadata
}
SNP_changes_of_interest <- LAM_VARIANTS_SNP
protein_changes_of_interest <- LAM_VARIANTS_Protein
SNP_protein_key <- data.frame("Mutant"=SNP_changes_of_interest,"Protein"=protein_changes_of_interest)
# Cell and clone selection
distribution_of_unknowns_by_variant <- apply(NGT[,SNP_changes_of_interest],MARGIN=2,table)
#print(distribution_of_unknowns_by_variant)
cells_with_unknown<-NGT[apply(NGT[,SNP_changes_of_interest],MARGIN=1,FUN=function(x){sum(x==3)>0}),"Cell"]
matrix_of_interest<-NGT[!NGT$Cell%in%cells_with_unknown,SNP_changes_of_interest]
bulk_VAF_order<-SNP_INFO[colnames(matrix_of_interest)[order(colSums(matrix_of_interest),decreasing=TRUE)],"protein"]
bulk_VAF_order <- bulk_VAF_order[!duplicated(bulk_VAF_order)]
matrix_of_interest$Clone <- apply(matrix_of_interest,1,function(x){paste(x,sep="_",collapse="_")})
# Aggregate data to get clone counts and architecture.
dedup<-matrix_of_interest[!duplicated(matrix_of_interest),]
clonal_abundance <- data.frame("Count"=as.matrix(table(matrix_of_interest$Clone)))
clonal_abundance$Clone <- factor(rownames(clonal_abundance),levels=rev(c(rownames(clonal_abundance))))
clonal_abundance$Count <- as.numeric(clonal_abundance$Count)
clonal_architecture <- melt(dedup)
colnames(clonal_architecture) <- c("Clone","Mutant","Genotype")
clonal_architecture$Genotype <- ifelse(clonal_architecture$Genotype==3,NA,
ifelse(clonal_architecture$Genotype==0,"WT",
ifelse(clonal_architecture$Genotype==1,"Heterozygous",
ifelse(clonal_architecture$Genotype==2,"Homozygous",clonal_architecture$Genotype))))
clonal_architecture$Genotype <- factor(clonal_architecture$Genotype ,levels=c("WT","Heterozygous","Homozygous"))
results_holder <- unlist(list(1:nrun))
randomization_list<-lapply(results_holder,function(x){
apply(matrix_of_interest[,1:length(colnames(matrix_of_interest))-1],2,function(x){
y <- sample(x,length(x),replace=FALSE)
})})
#randomization_list<-plyr::alply(randomization_array,3)
# names(randomization_list) <- NULL
empirical_distribution<-suppressMessages(lapply(randomization_list, function(randomization){
randomization$Clone <- apply(randomization,1,function(x){paste(x,sep="_",collapse="_")})
loop_dedup<-matrix_of_interest[!duplicated(randomization),]
loop_clonal_architecture <- melt(loop_dedup)
colnames(loop_clonal_architecture) <- c("Clone","Mutant","Genotype")
loop_clonal_architecture$Genotype <- ifelse(loop_clonal_architecture$Genotype==3,NA,
ifelse(loop_clonal_architecture$Genotype==0,"WT",
ifelse(loop_clonal_architecture$Genotype==1,"Heterozygous",
ifelse(loop_clonal_architecture$Genotype==2,"Homozygous",clonal_architecture$Genotype))))
loop_clonal_architecture$Genotype <- factor(loop_clonal_architecture$Genotype ,levels=c("WT","Heterozygous","Homozygous"))
loop_clonal_abundance <- data.frame("Count"=as.matrix(table(randomization$Clone)))
loop_clonal_abundance$Clone <- rownames(loop_clonal_abundance)
return(data.frame(loop_clonal_abundance))
}))
empirical_distribution_mat<-empirical_distribution%>%purrr::reduce(dplyr::full_join,by="Clone")
colnames(empirical_distribution_mat)<-1:length(colnames(empirical_distribution_mat))
rownames(empirical_distribution_mat) <- empirical_distribution_mat[,2]
empirical_distribution_mat <- empirical_distribution_mat[,-2]
empirical_distribution_mat[is.na(empirical_distribution_mat)]<-0
clonal_abundance$Clone <- as.character(clonal_abundance$Clone)
clonal_abundance$Count <- as.numeric(clonal_abundance$Count)
clonal_abundance$pvalue <-apply(clonal_abundance,1,function(x){
1- sum(as.numeric(x[["Count"]])>empirical_distribution_mat[x[["Clone"]],])/nrun
})
#print(dedup)
colnames(clonal_abundance)[2] <- paste(colnames(dedup)[1:length(SNP_changes_of_interest)],sep="_",collapse="__")
clonal_abundance <- clonal_abundance[order(clonal_abundance[,"Count"],decreasing = TRUE),]
clonal_abundance$Dominance <- apply(clonal_abundance,1,function(x){
(as.numeric(x["Count"])/sum(clonal_abundance["Count"]) ) / (max(as.numeric(clonal_abundance[,"Count"]))/sum(clonal_abundance["Count"]))
})
clonal_abundance$Poisson <- apply(clonal_abundance,1,function(x){
poisson.test(as.numeric(x["Count"]),mean(as.numeric(clonal_abundance$Count)),alternative="greater")$p.value
})
clonal_abundance$Mutation_count <- apply(clonal_abundance,1,function(x){
sum(as.numeric(strsplit( as.character(x[2]),split="_")[[1]]))
})
write.table(clonal_abundance,paste0(output_folder,"Summary_mat/",sample,".txt"),sep="\t",row.names=FALSE)
}
replot_clonal_selection <- function(sample){
if(any(grepl("VARIANT_SELECTION_LAM",list.files(paste("./",sample,"/",sep=""))))){
LAM_VARIANTS_MAT <- read.csv(paste("./",sample,"/",sample,"_VARIANT_SELECTION_LAM.csv",sep=""))
LAM_VARIANTS_SNP <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Mutant))[,1])
LAM_VARIANTS_Protein <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Protein))[,1])
} else {
print("No variant selection performed!")
return("next")
}
DP <- read.csv(paste("./",sample,"/","DP.csv",sep="")) # - read depth
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep="")) # - genotype call converted to categorical (0-reference, 1-heterozygous mutation, 2-homozygous mutation, 3-unknown)
if(any(grepl("SNP_INFO.csv",list.files(paste("./",sample,"/",sep=""))))){
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep=""))
rownames(SNP_INFO) <-colnames(NGT)[-c(1:2)] # - renames SNP info to have the variants as rownames
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cdna"]!="",paste(x["gene"],x["cdna"],sep="_"),
ifelse(x["amplicon"]%in%c("MSK_RL_AMP54", "MSK_RL_AMP55", "MSK_RL_AMP56","MSK_RL_AMP57","MSK_RL_AMP58"),paste("FLT3_INS",x["POS"],x["ALT"],sep="_"),"")))
})# - variant metadata
} else {
SNP_INFO <- read.csv(paste("./",sample,"/","Variants.csv",sep=""))
rownames(SNP_INFO) <-colnames(NGT)[-c(1:2)] # - renames SNP info to have the variants as rownames
colnames(SNP_INFO)[3] <- "protein"
SNP_INFO$cDNA <- as.character(SNP_INFO$cDNA)
SNP_INFO$protein <- as.character(SNP_INFO$protein)
SNP_INFO$Variant <- as.character(SNP_INFO$Variant)
SNP_INFO$Gene <- as.character(SNP_INFO$Gene)
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cDNA"]!="",paste(x["Gene"],x["cDNA"],sep="_"),
ifelse(grepl("^chr13",x["Variant"]),paste("FLT3_INS",x["Variant"],sep="_"),"")))
})# - variant metadata
}
SNP_changes_of_interest <- LAM_VARIANTS_SNP
protein_changes_of_interest <- LAM_VARIANTS_Protein
SNP_protein_key <- data.frame("Mutant"=SNP_changes_of_interest,"Protein"=protein_changes_of_interest)
rownames(SNP_protein_key) <- SNP_protein_key$Mutant
# Cell and clone selection
distribution_of_unknowns_by_variant <- apply(NGT[,SNP_changes_of_interest],MARGIN=2,table)
#print(distribution_of_unknowns_by_variant)
cells_with_unknown<-NGT[apply(NGT[,SNP_changes_of_interest],MARGIN=1,FUN=function(x){sum(x==3)>0}),"Cell"]
matrix_of_interest<-NGT[!NGT$Cell%in%cells_with_unknown,SNP_changes_of_interest]
bulk_VAF_order<-SNP_INFO[colnames(matrix_of_interest)[order(colSums(matrix_of_interest),decreasing=TRUE)],"protein"]
bulk_VAF_order <- bulk_VAF_order[!duplicated(bulk_VAF_order)]
matrix_of_interest$Clone <- apply(matrix_of_interest,1,function(x){paste(x,sep="_",collapse="_")})
dedup<-matrix_of_interest[!duplicated(matrix_of_interest),]
colnames(dedup) <- SNP_protein_key[colnames(dedup),"Protein"]
clonal_architecture <- melt(dedup)
colnames(clonal_architecture) <- c("Clone","Mutant","Genotype")
clonal_architecture$Genotype <- ifelse(clonal_architecture$Genotype==3,NA,
ifelse(clonal_architecture$Genotype==0,"WT",
ifelse(clonal_architecture$Genotype==1,"Heterozygous",
ifelse(clonal_architecture$Genotype==2,"Homozygous",clonal_architecture$Genotype))))
clonal_architecture$Genotype <- factor(clonal_architecture$Genotype ,levels=c("WT","Heterozygous","Homozygous"))
clonal_abundance <- data.frame("Count"=as.matrix(table(matrix_of_interest$Clone)))
clonal_abundance$Clone <- rownames(clonal_abundance)
clonal_abundance <- data.table(clonal_abundance,key="Count")
clonal_abundance$Clone <- factor(clonal_abundance$Clone,levels=rev(c(clonal_abundance$Clone)))
clonal_architecture$Clone <- factor(as.character(clonal_architecture$Clone),levels=rev(as.character(clonal_abundance$Clone)))
clonal_architecture$Mutant <-factor(clonal_architecture$Mutant , levels=rev(as.character(bulk_VAF_order)))
clones_import <- read.delim(paste0("/Volumes/LevineLab/Levine Lab/Bobby/Collaborations/MissionBio/08_08_19/Summary_mat/",sample,".txt"),sep="\t")
colnames(clones_import)[2] <- "Clone"
select_clones_full <- clones_import %>% filter(pvalue<0.1|Poisson<0.05)%>%select(Clone)
select_clones_dominant <- clones_import %>% filter(pvalue<0.1|Poisson<0.05)%>%filter(Dominance>0.05)%>%select(Clone)
if(length(select_clones_dominant[,1])>3){
clonal_architecture_subset <- clonal_architecture%>%filter(Clone%in%select_clones_dominant[,1])
clonal_abundance_subset <- clonal_abundance%>%filter(Clone%in%select_clones_dominant[,1])
name_var <- "VAF Subset"
} else if(length(select_clones_full[,1])>20){
print("Complex clonal picture, skipping")
return(next)
} else if(length(select_clones_full[,1])<2){
print("Simple clonal picture, plotting all clones")
clonal_architecture_subset <- clonal_architecture
clonal_abundance_subset <- clonal_abundance
name_var <- "All Clones"
} else {
clonal_architecture_subset <- clonal_architecture%>%filter(Clone%in%select_clones_full[,1])
clonal_abundance_subset <- clonal_abundance%>%filter(Clone%in%select_clones_full[,1])
name_var <- "Full"
}
# Generate clonal architecture heatmap
gg_heatmap <- ggplot(data = clonal_architecture_subset, aes(x = Clone, y = factor(Mutant), fill = Genotype)) +
geom_tile() +
scale_fill_manual(values=c("WT"=brewer.pal(7,"Reds")[1],
"Heterozygous"=brewer.pal(7,"Reds")[3],
"Homozygous"=brewer.pal(7,"Reds")[6],
"Unknown"="grey50"),"Genotype") +
theme_classic() +
ylab("Mutation")+
theme(legend.position = "bottom", legend.direction = "horizontal",
axis.text.x = element_blank(),
axis.line=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
plot.margin=unit(c(0,1,1,1),"cm"))
# Generate clonal abundance barplot
gg_clonal_barplot <- ggplot(data = data.frame(clonal_abundance_subset), aes(x = factor(Clone), y = Count)) +
geom_bar(stat = "identity", aes(fill = Count)) + theme_gray() +
theme_classic()+
ylim(0,max(clonal_abundance$Count)*1.3) +
ylab("Cell Count")+
geom_text(aes(label=Count), position=position_dodge(width=0.9), vjust=-0.25)+
scale_fill_distiller(name = "Value", palette = "Reds", direction = 1) +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line=element_blank(),
legend.position = "none",
plot.margin=unit(c(1,1,-0.2,1),"cm"))
# Compute spacing of graphs
plots <- list(gg_clonal_barplot,gg_heatmap)
grobs <- list()
widths <- list()
for (i in 1:length(plots)){
grobs[[i]] <- ggplotGrob(plots[[i]])
widths[[i]] <- grobs[[i]]$widths[2:5]
}
maxwidth <- do.call(grid::unit.pmax, widths)
for (i in 1:length(grobs)){
grobs[[i]]$widths[2:5] <- as.list(maxwidth)
}
# Generate plot
grob.title <- textGrob(paste(sample,"-",name_var), hjust = 0.5, vjust = 0.5, gp = gpar(fontsize = 20))
pdf(paste(output_folder,"Graphs_significant/",sample,".pdf",sep=""), width = 16, height = 8) # Open a new pdf file
grid.arrange(grobs=grobs, ncol = 1, top = grob.title)
graphics.off()
}
plot_clonal_heatmap_and_barplot <-function(sample,output_folder,clonal_abundance,clonal_architecture,clone_cell_count,shrink)
{
#clonal_abundance_subset <-data.frame( clonal_abundance[[sample]])%>%filter(Count>=5)
#clonal_architecture_subset <- data.frame(clonal_architecture[[sample]])%>%
# filter(Clone%in%as.character(clonal_abundance_subset$Clone))
clonal_abundance_subset<-final_sample_summary[[sample]]$Clones#clonal_abundance
clonal_abundance_subset$Clone<-factor(final_sample_summary[[sample]]$Clones[,"Clone"],levels=rev(c(final_sample_summary[[sample]]$Clones[,"Clone"])))#clonal_abundance
clonal_architecture_subset<-final_sample_summary[[sample]]$Architecture%>%
filter(Clone%in%as.character(clonal_abundance_subset$Clone))
clonal_architecture_subset$Clone <- factor(clonal_architecture_subset$Clone, levels=levels(clonal_abundance_subset$Clone))
gg_heatmap <- ggplot(data = clonal_architecture_subset, aes(x = Clone, y = factor(Mutant,levels=rev(levels(factor(Mutant)))), fill = Genotype)) +
geom_tile() +# scale_y_discrete(limits = rev(levels(Mutant)))+
scale_fill_manual(values=c("WT"=brewer.pal(7,"Reds")[1],
"Heterozygous"=brewer.pal(7,"Reds")[3],
"Homozygous"=brewer.pal(7,"Reds")[6],
"Unknown"="grey50"),"Genotype") +
theme_classic(base_size=6) +
ylab("Mutation")+
theme(legend.position = "bottom", legend.direction = "horizontal",
axis.text.x = element_blank(),
axis.line=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
plot.margin=unit(c(0,1,1,1),"cm"))
# Generate clonal abundance barplot
gg_clonal_barplot <- ggplot(data = data.frame(clonal_abundance_subset), aes(x = Clone, y = Count)) +
geom_bar(stat = "identity", aes(fill = Count)) + theme_gray() +
theme_classic(base_size=6)+
ylim(0,max(clonal_abundance_subset$Count)*1.3) +
ylab("Cell Count")+
geom_text(aes(label=Count), position=position_dodge(width=0.9), vjust=-0.25,size=1)+
scale_fill_distiller(name = "Value", palette = "Reds", direction = 1) +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line=element_blank(),
legend.position = "none",
plot.margin=unit(c(1,1,-0.2,1),"cm"))
grob.title <- textGrob(paste(sample,"-"), hjust= 0.5, vjust = 0.5, gp = gpar(fontsize = 12))
final_plot<-plot_grid(grob.title,gg_clonal_barplot,gg_heatmap,ncol=1,align="hv",axis="l",rel_heights = c(0.1,1,0.75))
save_plot(paste(output_folder,sample,".pdf",sep=""),final_plot, ncol=1) # Open a new pdf file
}
generate_and_plot_cooccurence <- function(mut_mat){
cooccur_mat <- cooccur(mat=t(mut_mat),type="spp_site",only_effects = FALSE,eff_matrix=TRUE,thresh=FALSE,eff_standard=FALSE,spp_names=TRUE)$results
cooccur_mat$score<- ifelse(cooccur_mat[,"p_lt"]<=0.05,-1,ifelse(cooccur_mat[,"p_gt"]<=0.05,1,0))
cooccur_mat_subset <- rbind(cooccur_mat[,c("sp1_name","sp2_name","score")],c(setdiff(cooccur_mat$sp2_name,cooccur_mat$sp1_name),setdiff(cooccur_mat$sp1_name,cooccur_mat$sp2_name),0))
cooccur_data_mat <- dcast(cooccur_mat_subset, sp1_name ~ sp2_name, value.var="score")
rownames(cooccur_data_mat) <- cooccur_data_mat[,1]
cooccur_data_mat2<-cooccur_data_mat[,-1]
cooccur_data_mat2[is.na(cooccur_data_mat2)]<-0
cooccur_data_mat2[lower.tri(cooccur_data_mat2)]<-NA
cooccur_data_mat2$Gene <- rownames(cooccur_data_mat2)
cooccur_data_mat_melt <- na.omit(melt(cooccur_data_mat2, 'Gene', variable_name='Gene2'))
cooccur_data_mat_melt$variable <- factor(cooccur_data_mat_melt$variable, levels=rev(levels(cooccur_data_mat_melt$variable)))
# Triangle heatmap to compare cohorts
grob_corrplot<-ggplot(cooccur_data_mat_melt, aes(Gene, variable))+
geom_tile(aes(fill = factor(value)), color='grey90') +
scale_fill_manual(values=c("-1"="firebrick3","0"="white","1"="steelblue2"),"Correlation",
labels=c("Mutually Exclusive","Not Significant","Mutually Inclusive"))+
theme_classic(base_size=10)+xlab("")+ylab("")+
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=1),
axis.line = element_blank(),
legend.position = c(0.8,1),
legend.justification = c(1, 1),
legend.direction = "vertical")+
theme(legend.key.size = unit(0.5,"line"))
return(list("plot"=grob_corrplot,
"data"=cooccur_mat))
}
diversity_from_bulk_VAF <- function(sample){
if(any(grepl("VARIANT_SELECTION_LAM",list.files(paste("./",sample,"/",sep=""))))){
LAM_VARIANTS_MAT <- read.csv(paste("./",sample,"/",sample,"_VARIANT_SELECTION_LAM.csv",sep=""))
LAM_VARIANTS_SNP <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Mutant))[,1])
LAM_VARIANTS_Protein <- as.character((LAM_VARIANTS_MAT %>% filter(Include=="Include")%>%select(Protein))[,1])
} else {
print("No variant selection performed!")
return("next")
}
#AF <- read.csv(paste("./",sample,"/","AF.csv",sep="")) # - variant allele frequency
#DP <- read.csv(paste("./",sample,"/","DP.csv",sep="")) # - read depth
#GQ <- read.csv(paste("./",sample,"/","GQ.csv",sep="")) # - quality scores
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep="")) # - genotype call converted to categorical (0-reference, 1-heterozygous mutation, 2-homozygous mutation, 3-unknown)
if(any(grepl("SNP_INFO.csv",list.files(paste("./",sample,"/",sep=""))))){
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep=""))
rownames(SNP_INFO) <-colnames(NGT)[-c(1:2)] # - renames SNP info to have the variants as rownames
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cdna"]!="",paste(x["gene"],x["cdna"],sep="_"),
ifelse(x["amplicon"]%in%c("MSK_RL_AMP54", "MSK_RL_AMP55", "MSK_RL_AMP56","MSK_RL_AMP57","MSK_RL_AMP58"),paste("FLT3_INS",x["POS"],x["ALT"],sep="_"),"")))
})# - variant metadata
} else {
SNP_INFO <- read.csv(paste("./",sample,"/","Variants.csv",sep=""))
rownames(SNP_INFO) <-colnames(NGT)[-c(1:2)] # - renames SNP info to have the variants as rownames
colnames(SNP_INFO)[3] <- "protein"
SNP_INFO$cDNA <- as.character(SNP_INFO$cDNA)
SNP_INFO$protein <- as.character(SNP_INFO$protein)
SNP_INFO$Variant <- as.character(SNP_INFO$Variant)
SNP_INFO$Gene <- as.character(SNP_INFO$Gene)
SNP_INFO[,c("protein")] <- apply(SNP_INFO,1,function(x){
ifelse(x["protein"]!="",x["protein"],
ifelse(x["cDNA"]!="",paste(x["Gene"],x["cDNA"],sep="_"),
ifelse(grepl("^chr13",x["Variant"]),paste("FLT3_INS",x["Variant"],sep="_"),"")))
})# - variant metadata
}
SNP_changes_of_interest <- LAM_VARIANTS_SNP
protein_changes_of_interest <- LAM_VARIANTS_Protein
SNP_protein_key <- data.frame("Mutant"=SNP_changes_of_interest,"Protein"=protein_changes_of_interest)
# Cell and clone selection
distribution_of_unknowns_by_variant <- apply(NGT[,SNP_changes_of_interest],MARGIN=2,table)
#print(distribution_of_unknowns_by_variant)
cells_with_unknown<-NGT[apply(NGT[,SNP_changes_of_interest],MARGIN=1,FUN=function(x){sum(x==3)>0}),"Cell"]
matrix_of_interest<-NGT[!NGT$Cell%in%cells_with_unknown,SNP_changes_of_interest]
bulk_VAF_order<-SNP_INFO[colnames(matrix_of_interest)[order(colSums(matrix_of_interest),decreasing=TRUE)],"protein"]
bulk_VAF_order <- bulk_VAF_order[!duplicated(bulk_VAF_order)]
matrix_of_interest$Clone <- apply(matrix_of_interest,1,function(x){paste(x,sep="_",collapse="_")})
output<-diversity(colSums(matrix_of_interest[colnames(matrix_of_interest)!="Clone"])/length(rownames(matrix_of_interest)))
write.table(output,paste0(output_folder,"Diversity_score_faux_VAF/",sample,".txt"),sep="\t")
}
## Old Functions
plot_variants_AF_and_DP <- function(sample){
#Load in the data
AF <- read.csv(paste("./",sample,"/","AF.csv",sep="")) # - variant allele frequency
DP <- read.csv(paste("./",sample,"/","DP.csv",sep="")) # - read depth
GQ <- read.csv(paste("./",sample,"/","GQ.csv",sep="")) # - quality scores
NGT <- read.csv(paste("./",sample,"/","NGT.csv",sep="")) # - genotype call converted to categorical (0-reference, 1-heterozygous mutation, 2-homozygous mutation, 3-unknown)
SNP_INFO <- read.csv(paste("./",sample,"/","SNP_INFO.csv",sep="")) # - variant metadata
rownames(SNP_INFO) <-colnames(AF)[-c(1:2)] # - renames SNP info to have the variants as rownames
AF_NGT_Melt<-inner_join(AF %>% unite("Sample_cell",c("Sample","Cell")) %>%melt("Sample_cell"),
NGT %>% unite("Sample_cell",c("Sample","Cell")) %>%melt("Sample_cell"),
by=c("Sample_cell","variable"))
colnames(AF_NGT_Melt) <- c("Cell","Variant","AF","NGT")
gg_AF_NGT<- ggplot(AF_NGT_Melt,aes(x=as.factor(NGT),y=AF,color=NGT))+ geom_quasirandom()+facet_wrap(~Variant,ncol=3)
DP_NGT_Melt<-inner_join(DP %>% unite("Sample_cell",c("Sample","Cell")) %>%melt("Sample_cell"),
NGT %>% unite("Sample_cell",c("Sample","Cell")) %>%melt("Sample_cell"),
by=c("Sample_cell","variable"))
colnames(DP_NGT_Melt) <- c("Cell","Variant","DP","NGT")
gg_DP_NGT<- ggplot(DP_NGT_Melt,aes(x=as.factor(NGT),y=DP,fill=NGT))+geom_boxplot()+facet_wrap(~Variant,ncol=3,scale="free_y")
ggsave(paste0("./",sample,"/",sample,"_DP_NGT.pdf"),gg_DP_NGT,width=10,height=40)
ggsave(paste0("./",sample,"/",sample,"_AF_NGT.pdf"),gg_AF_NGT,width=10,height=40)
}
plot_COMPASS_data <-function(sample,output_folder,melted_protein_mat,clonal_abundance,clonal_architecture,clone_cell_count,shrink)
{
melted_protein_mat <-melted_protein_mat[[sample]]
clonal_abundance_subset <-data.frame( clonal_abundance[[sample]])#%>%filter(Count>=5)
clonal_architecture_subset <- data.frame(clonal_architecture[[sample]])#%>%
# filter(Clone%in%as.character(clonal_abundance_subset$Clone))
genes_to_display <-unique(as.character(clonal_architecture_subset[clonal_architecture_subset$Genotype%in%c("Heterozygous","Homozygous"),"Mutant"]))
if(shrink==TRUE){
clonal_architecture_subset <-clonal_architecture_subset%>%filter(Mutant%in%genes_to_display)
}
clonal_architecture_subset$Clone <- factor(clonal_architecture_subset$Clone, levels=levels(clonal_abundance_subset$Clone))
melted_protein_mat_subset<-melted_protein_mat%>%filter(Clone%in%as.character(clonal_architecture_subset$Clone))
melted_protein_mat_subset$Clone <-factor(melted_protein_mat_subset$Clone, levels=levels(clonal_abundance_subset$Clone))
gg_protein_heatmap<-ggplot(melted_protein_mat_subset, aes(y=variable, x=(Clone))) + geom_tile(aes(fill = value),colour = "white") +
scale_fill_distiller(palette = "PRGn")+
theme_minimal(base_size=6) +
theme( axis.text.x = element_blank(), axis.title.x = element_blank(),
legend.position = "right",legend.direction = "vertical",
axis.ticks.x = element_blank(),
plot.margin=unit(c(0,0,0,0),"cm"))
gg_heatmap <- ggplot(data = clonal_architecture_subset, aes(x = Clone, y = factor(Mutant,levels=rev(levels(factor(Mutant)))), fill = Genotype)) +
geom_tile() +# scale_y_discrete(limits = rev(levels(Mutant)))+
scale_fill_manual(values=c("WT"=brewer.pal(7,"Reds")[1],
"Heterozygous"=brewer.pal(7,"Reds")[3],
"Homozygous"=brewer.pal(7,"Reds")[6],
"Unknown"="grey50"),"Genotype") +
theme_classic(base_size=6) +
ylab("Mutation")+
theme(legend.position = "right", legend.direction = "vertical",
axis.text.x = element_blank(),
axis.line=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
plot.margin=unit(c(0,0,0,0),"cm"))
clone_colors <- alphabet()[1:length(unique(clonal_abundance_subset$Clone))]
names(clone_colors) <-levels(clonal_abundance_subset$Clone)
# Generate clonal abundance barplot
gg_clonal_barplot <- ggplot(data = data.frame(clonal_abundance_subset), aes(x = Clone, y = Count)) +
geom_bar(stat = "identity", aes(fill = Clone)) + theme_gray() +
theme_classic(base_size=6)+
ylim(0,max(clonal_abundance_subset$Count)*1.3) +
ylab("Cell Count")+
geom_text(aes(label=Count), position=position_dodge(width=0.9), vjust=-0.25,size=1)+
scale_fill_manual(values=clone_colors)+
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line=element_blank(),
legend.position = "none",
plot.margin=unit(c(0,0,0,0),"cm"))
grob.title <- textGrob(paste(sample,"-"), hjust= 0.5, vjust = 0.5, gp = gpar(fontsize = 12))
final_plot<-plot_grid(gg_clonal_barplot,addSmallLegend(gg_heatmap),addSmallLegend(gg_protein_heatmap),ncol=1,align="hv",axis="lrtb",rel_heights = c(1,(length(genes_to_display)*0.15),1))
save_plot(paste(output_folder,sample,".pdf",sep=""),final_plot, ncol=1) # Open a new pdf file
}
addSmallLegend <- function(myPlot, pointSize = 3, textSize = 8, spaceLegend = 0.5) {
myPlot +
guides(shape = guide_legend(override.aes = list(size = pointSize)),
color = guide_legend(override.aes = list(size = pointSize))) +
theme(legend.title = element_text(size = textSize),
legend.text = element_text(size = textSize),
legend.key.size = unit(spaceLegend, "lines"))
}
create_reward_matrix_deprecated<-function(Known_mat,weights){
set.seed(68864)
names(weights) <- apply(Known_mat,2,function(x){paste(x,sep="_",collapse="_")})
num_type <- 2
num_mutations <- nrow(Known_mat);
num_clones <- ncol(Known_mat)
num_states <- num_type^num_mutations
states<-data.frame(expand.grid(rep(list(0:num_type), num_mutations)))
state_interactions<-expand.grid(apply(states[,1:num_mutations],1,function(x){paste(x,collapse="_",sep="_")}),
apply(states[,1:num_mutations],1,function(x){paste(x,collapse="_",sep="_")}))
states$Reward_states<-ifelse(apply(states,1,function(x){
any(apply(Known_mat,2,function(y){ all(x==t(y))}))
}),2,NA)
states$Clone <- apply(states[,1:num_mutations],1,function(x){paste(x,collapse="_",sep="_")})
state_interactions$possible<-ifelse(apply(state_interactions,1,function(x){
A<-as.numeric(do.call(cbind,strsplit(as.character(x[1]),split="_")))
B<-as.numeric(do.call(cbind,strsplit(as.character(x[2]),split="_")))
sum(abs(A-B))<=1
}),2,NA)
dat<-acast(Var1~Var2,value.var="possible",data=data.frame(state_interactions,
"value"=-1))
diag(dat)<-0
lowerTriangle(dat)<-NA
test_dat<-do.call(cbind,lapply(colnames(dat),function(Clone){
if(Clone%in%names(weights)){
output<-ifelse(dat[,Clone]==2,weights[Clone],dat[,Clone])
} else{
output<-dat[,Clone]
}
return(output)
}))
colnames(test_dat) <-rownames(test_dat)
graph<-graph.adjacency(test_dat,mode="directed",weighted=TRUE)
graph_mat <- get.data.frame(graph)%>% drop_na()
subgraphs<-make_ego_graph(graph_from_data_frame(graph_mat,directed=TRUE), order=1,nodes=names(weights), mode="all")
subgraph_bind<-do.call(rbind,lapply(subgraphs,get.data.frame)) %>% distinct(to,from,weight, .keep_all = TRUE)
subgraph_subset<-subgraph_bind%>%filter(!to%in%setdiff(setdiff(subgraph_bind$to,subgraph_bind$from),names(weights)))
return(subgraph_subset)
}
create_reward_matrix<-function(Known_mat,weights){
set.seed(68864)
names(weights) <- apply(Known_mat,2,function(x){paste(x,sep="_",collapse="_")})
num_type <- 2
num_mutations <- nrow(Known_mat);
mutant_names<-rownames(Known_mat)
num_clones <- ncol(Known_mat)
num_states <- num_type^num_mutations
possible_mut_list<- unlist(apply(Known_mat,1,function(x){list(0:max(unique(x))) }),recursive = FALSE)
states<-data.frame(expand.grid(possible_mut_list))
state_interactions<-data.frame(expand.grid(apply(states[,1:num_mutations],1,function(x){paste(x,collapse="_",sep="_")}),
apply(states[,1:num_mutations],1,function(x){paste(x,collapse="_",sep="_")})))
state_interactions$possible<-ifelse(apply(state_interactions,1,function(x){
A<-as.numeric(do.call(cbind,strsplit(as.character(x[1]),split="_")))
B<-as.numeric(do.call(cbind,strsplit(as.character(x[2]),split="_")))
sum(abs(A-B))<=1
}),0,NA)
state_interactions$action<-apply(state_interactions,1,function(x){
A<-as.numeric(do.call(cbind,strsplit(as.character(x[1]),split="_")))
B<-as.numeric(do.call(cbind,strsplit(as.character(x[2]),split="_")))
if(!is.na(x["possible"])){
if(sum(abs(B-A))==0){
return("stay")
} else{
return(mutant_names[which((B-A)==1)])
}
}
})
dat<-setNames(state_interactions%>%filter(action%in%c(mutant_names,"stay")),
c("State","NextState","Reward","Action"))[,c(1,4,2,3)]
dat$Reward <- as.numeric(apply(dat,1,function(x){
ifelse(x$NextState%in%names(weights),weights[x$NextState],x$Reward)
}))
dat$Reward <- as.numeric(apply(dat,1,function(x){
ifelse(x$Action%in%"stay",0,x$Reward)
}))
dat$State <- as.character(dat$State)
dat$NextState <- as.character(dat$NextState)
dat$Action <- as.character(dat$Action)
control <- list(alpha = 0.8, gamma = 0.9)
model <- ReinforcementLearning(data = dat, s = "State", a = "Action", r = "Reward", s_new = "NextState", iter = 1,control=control)
x<- model$Q
rownames(x) <- substring(rownames(x),2)
Q_mat <- setNames(melt(x),c("State","Action","Q"))
set<-inner_join(dat,Q_mat,by=c("State","Action"))
set$Valid <- TRUE
return(set)
}
plot_clonal_heatmap_and_barplot_with_SD <-function(sample,output_folder,clonal_abundance,clonal_architecture,clone_cell_count,shrink)
{
clonal_abundance_subset <-data.frame( clonal_abundance[[sample]])#%>%filter(Count>=5)
clones_to_use <- intersect(as.character(clonal_abundance_subset$Clone),as.character(clonal_architecture[[sample]]$Clone))
clonal_architecture_subset <- data.frame(clonal_architecture[[sample]])%>%
filter(data.frame(clonal_architecture[[sample]])$Clone%in%clones_to_use)
clonal_abundance_subset <- data.frame(clonal_abundance_subset)%>%
filter(Clone%in%clones_to_use)
genes_to_display <-unique(as.character(clonal_architecture_subset[clonal_architecture_subset$Genotype%in%c("Heterozygous","Homozygous"),"Mutant"]))
if(shrink==TRUE){
clonal_architecture_subset <-clonal_architecture_subset%>%filter(Mutant%in%genes_to_display)
}
clonal_architecture_subset$Clone <- factor(clonal_architecture_subset$Clone, levels=rev(clonal_abundance_subset$Clone))
clonal_abundance_subset$Clone <- factor(clonal_abundance_subset$Clone, levels=levels(clonal_architecture_subset$Clone))
gg_heatmap <- ggplot(data = clonal_architecture_subset, aes(x = Clone, y = factor(Mutant,levels=rev(levels(factor(Mutant)))), fill = Genotype)) +
geom_tile() +# scale_y_discrete(limits = rev(levels(Mutant)))+
scale_fill_manual(values=c("WT"=brewer.pal(7,"Reds")[1],
"Heterozygous"=brewer.pal(7,"Reds")[3],
"Homozygous"=brewer.pal(7,"Reds")[6],
"Unknown"="grey50"),"Genotype") +
theme_classic(base_size=6) +
ylab("Mutation")+
theme(legend.position = "bottom", legend.direction = "horizontal",
axis.text.x = element_blank(),
axis.line=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
plot.margin=unit(c(0,1,1,1),"cm"))
# Generate clonal abundance barplot
gg_clonal_barplot <- ggplot(data = data.frame(clonal_abundance_subset), aes(x = Clone, y = Count)) +
geom_bar(stat = "identity", aes(fill = Count)) + theme_gray() +
theme_classic(base_size=6)+
ylim(0,max(clonal_abundance_subset$Count)*1.3) +
ylab("Cell Count")+
geom_errorbar(aes(ymin = X2.5., ymax = X97.5.), width = 0.2)+
geom_text(aes(label=Count), position=position_dodge(width=0.9), vjust=-0.25,size=1)+
scale_fill_distiller(name = "Value", palette = "Reds", direction = 1) +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line=element_blank(),
legend.position = "none",
plot.margin=unit(c(1,1,-0.2,1),"cm"))
grob.title <- textGrob(paste(sample,"-"), hjust= 0.5, vjust = 0.5, gp = gpar(fontsize = 12))
final_plot<-plot_grid(grob.title,gg_clonal_barplot,gg_heatmap,ncol=1,align="hv",axis="l",rel_heights = c(0.1,1,0.75))
save_plot(paste(output_folder,sample,".pdf",sep=""),final_plot, ncol=1) # Open a new pdf file
}
query_initiating_mutations<-function(graph_results){
start_index<-paste(rep(0,length(strsplit(graph_results$State[1],split="_")[[1]])),sep="_",collapse="_")
possible_starting_actions<-graph_results%>%filter(State==start_index&Action!="stay")%>%pull(Action)
final_results<-list()
initating_action_count<-0
for(initating_action in possible_starting_actions){
print(initating_action)
set <- graph_results
initating_action_count<-initating_action_count+1
storage_results<- list()
branches<-0
state_to_kill <- set%>%filter(State==start_index&Action==initating_action)%>%pull(NextState)
start_killed <- sum(set%>%filter(State==state_to_kill)%>%pull(Valid))
while(start_killed>0){
#print(branches)
# print(start_killed)
branches <- branches +1
number_of_mutations<-0
state_log<- list()
optimal_reward<-list()
action_log<-list()
current_state<- start_index
indicator<-TRUE
nextState<-0
while(current_state!=nextState) {
# print(number_of_mutations)
number_of_mutations <- number_of_mutations+1
if(number_of_mutations==1){
state_log[[number_of_mutations]] <- start_index
}
current_state <- state_log[[number_of_mutations]]
nextState_indicator<- FALSE
while(nextState_indicator==FALSE){
if(number_of_mutations==1){
max_potential_action_index<- set%>%
filter(State==current_state&Action==initating_action)
} else {
max_potential_action_index <- set%>%
filter(State==current_state&Valid==TRUE)%>%
filter(Q==max(Q))%>%sample_n(1)
}
if(nrow(max_potential_action_index)==0){
break
}
max_potential_action <- max_potential_action_index%>%pull(NextState)
next_valid_action <- any(set%>%filter(State==max_potential_action&Action!="stay")%>%pull(Valid))
if(next_valid_action==TRUE){
nextState <-max_potential_action
current_action <- max_potential_action_index%>%pull(Action)
nextState_indicator==TRUE
break
} else{
set[set$State%in%max_potential_action_index["State"]&
set$Action%in%max_potential_action_index["Action"],"Valid"] <- FALSE
}
}
if(nrow(set%>%filter(State==current_state&Action==current_action))==0){
optimal_reward[[number_of_mutations]] <-NA
} else {
optimal_reward[[number_of_mutations]] <- set%>%
filter(State==current_state&Action==current_action)%>%
pull(Reward)
}
state_log[[number_of_mutations+1]]<- nextState
action_log[[number_of_mutations]] <- current_action
if(current_action==nextState){
indicator==FALSE
state_log[[number_of_mutations+1]]<-NULL
break
}
}
optimal_reward[[number_of_mutations+1]] <- NA
action_log[[number_of_mutations+1]] <- NA
storage_results[[branches]] <-data.frame("states"=do.call(rbind,state_log),#[1:(length(state_log)-1)]),
"actions"=do.call(rbind,action_log),
"reward"=do.call(rbind,optimal_reward),
"nextState"=do.call(rbind,c(state_log[2:length(state_log)],NA)) )
storage_results[[branches]] <- storage_results[[branches]]%>%
filter(states!=nextState)
storage_results[[branches]]$cumulative_reward <- cumsum(storage_results[[branches]]$reward)
#storage_results[[branches]] <-storage_results[[branches]][1:which.max(storage_results[[branches]]$cumulative_reward), ]
set[set$State%in%current_state&set$Action%in%current_action,"Valid"] <- FALSE
start_killed <- sum(set%>%filter(State==state_to_kill)%>%pull(Valid))
}
final_results[[initating_action_count]]<-storage_results[!duplicated(storage_results)]
}
names(final_results)<-possible_starting_actions
return(final_results)
}