2.2 HDF5 and Loom input
This is a very quick example of how data can be loaded into R using loom and h5 files without the tapestri package. For the H5 files from tapestri pipeline v2, I found the rhdf5 package rather intuitive, even though the loomR package is largely based on hdf5r. It is unlikely that the code below is the most efficient way to load and extract these files, so part of this might not be helpful if you are already familiar with the hdf5 format. What may be helpful is the logic and order of events we take for filtering variants and cells of interet.
See sample data on github and google drive
library(rhdf5)
library(dplyr)
library(tidyr)
setwd("/Users/bowmanr/Projects/scDNA/scDNA_myeloid/")
<-("./data/Sample17020.dna_protein.h5") file
First the extraction of protein data, which is very straightforward. This can be later subset on which cells are preserved after genotype filtering. See latter sections for normalization.
<-h5read(file=file,name="/assays/protein_read_counts/layers/read_counts")
protein_matrownames(protein_mat) <- h5read(file=file,name="/assays/protein_read_counts/ca/id")
colnames(protein_mat)<- h5read(file=file,name="/assays/protein_read_counts/ra/barcode")
print(rownames(protein_mat))
## [1] "CD10" "CD117" "CD11b" "CD11c"
## [5] "CD123" "CD13" "CD138" "CD14"
## [9] "CD141" "CD16" "CD163" "CD19"
## [13] "CD1c" "CD2" "CD22" "CD25"
## [17] "CD3" "CD30" "CD303" "CD304"
## [21] "CD33" "CD34" "CD38" "CD4"
## [25] "CD44" "CD45" "CD45RA" "CD45RO"
## [29] "CD49d" "CD5" "CD56" "CD62L"
## [33] "CD62P" "CD64" "CD69" "CD7"
## [37] "CD71" "CD8" "CD83" "CD90"
## [41] "FcεRIα" "HLA-DR" "IgG1" "IgG2a"
## [45] "IgG2b"
print((protein_mat)[1:5,1:5])
## AACAACCTAACGAATCGC AACAACCTACCTCTGCAT AACAACCTAGGTGATAGG
## CD10 145 168 45
## CD117 54 179 50
## CD11b 68 578 94
## CD11c 23 308 5
## CD123 120 241 40
## AACAACCTAGTCACAGAG AACAACTGGCAGATCATT
## CD10 31 320
## CD117 46 1017
## CD11b 53 422
## CD11c 6 145
## CD123 48 602
Now we will move to extracting variant data. For the sake of processing time, we will first impose a cutoff that a given variant must be present in at least 1% of cells. This is a cutoff we used in our manuscript. This is a subjective cutoff, and will likely be a varying feature for many studies.
<- 0.01
VAF_cutoff <-h5read(file=file,name="/assays/dna_variants/layers/NGT")
NGT==3]<-NA
NGT[NGT<-which((rowSums(NGT,na.rm=TRUE)/(ncol(NGT)*2))>VAF_cutoff)
VAF_select<-h5read(file=file,name="/assays/dna_variants/layers/AF",index=list(VAF_select,NULL))
AF<-h5read(file=file,name="/assays/dna_variants/layers/DP",index=list(VAF_select,NULL))
DP<-h5read(file=file,name="/assays/dna_variants/layers/GQ",index=list(VAF_select,NULL))
GQ<-h5read(file=file,name="/assays/dna_variants/layers/NGT",index=list(VAF_select,NULL))
NGTlim==3]<-NA
NGTlim[NGTlim<-h5read(file=file,name="/assays/dna_variants/ca/id",index=list(VAF_select))
variants<-h5read(file=file,name="/assays/dna_variants/ra/barcode")
cell_barcodes colnames(NGTlim) <-cell_barcodes
rownames(NGTlim) <- variants
print(rownames(NGTlim)[1:5])
## [1] "chr1:43815097:T/C" "chr1:115256538:G/A" "chr1:115256539:T/C"
## [4] "chr1:115256539:T/G" "chr1:115256540:A/G"
print(NGTlim[1:5,1:5])
## AACAACCTAACGAATCGC AACAACCTACCTCTGCAT AACAACCTAGGTGATAGG
## chr1:43815097:T/C 1 NA 0
## chr1:115256538:G/A 0 0 0
## chr1:115256539:T/C NA NA NA
## chr1:115256539:T/G NA NA NA
## chr1:115256540:A/G 0 0 0
## AACAACCTAGTCACAGAG AACAACTGGCAGATCATT
## chr1:43815097:T/C 0 1
## chr1:115256538:G/A 0 1
## chr1:115256539:T/C 0 1
## chr1:115256539:T/G 0 0
## chr1:115256540:A/G 0 0
dim(NGTlim)
## [1] 105 6315
Now alternatively with loom files from tapestri pipeline v1, where you will also want the VCF header file to extract cell barcodes. This is not strictly necessary for DNA only analysis, but critical for integration with protein. I imagine most users will be working with v2 data, but I leave it here for you incase you need it, or are replicating data from our initial manuscript. It is worth noting that the matrices from the loom files are transposed compared to the H5, hence the t() to orient them for consistent downstream processing.
library(loomR)
setwd("/Users/bowmanr/Projects/scDNA/scDNA_myeloid/")
<- connect(filename = "./data/MSK91_IGO_09868_L_7_S7_R_191122041131.cells.loom", mode = "r")
lfile <-t(lfile$matrix[,])
NGT==3]<-NA
NGT[NGT<-which((rowSums(NGT,na.rm=TRUE)/(ncol(NGT)*2))>VAF_cutoff)
VAF_select<- 0.01
VAF_cutoff
<- t(lfile$layers$DP[,VAF_select])
DP <- t(lfile$layers$GQ[,VAF_select])
GQ <- t(lfile$layers$AD[,VAF_select])/DP *100
AF <-t(lfile$matrix[,VAF_select])
NGTlim==3]<-NA
NGTlim[NGTlim<-lfile$row.attrs$id[VAF_select]
variants
#for loom files from Tapestri v1
<-names(read.delim("./data/MSK91.vcf_header.txt",sep="\t"))
cell_barcodes colnames(NGTlim) <-cell_barcodes
rownames(NGTlim) <- variants
The v2 pipeline also outputs loom files, but given the easy packaging of protein and DNA data in the single H5 file, I can’t find a good reason to use the loom. Nevertheless, small differences in formatting are observed.
#for loom files from Tapestri v2 you do not need the VCF header file.
<-lfile$col.attrs$barcode[] cell_barcodes
Now let’s filter the data using the default cutoffs from Tapestri Insights and what we used in our manuscript.
=10 #read depth
DP_cut=20 #allele frequency cutoff
AF_cut=30 #geotype quality cutoff
GQ_cut=50 #variant must be genotyped in greater than 50% of cells
variant_presence_cutoff=50 # cell must possess genotype information for at least 50% of the variants of interest
cell_genotype_cutoff
#bind together long form AF, DP, GQ and NGT data
<-data.frame(setNames(
filtered_long# produce long form allele frequency data
data.frame(AF,
"variants"=variants),
c(all_of(cell_barcodes),"variants")) %>%
pivot_longer(cols=!c(variants),
names_to="Cell",
values_to="AF"),
#produce long form allele depth data
data.frame(DP)%>%
pivot_longer(cols=everything(),
names_to="Cell",
values_to="DP")%>%dplyr::select(DP),
#produce long form genotype quality data
data.frame(GQ)%>%
pivot_longer(cols=everything(),
names_to="Cell",
values_to="GQ")%>%dplyr::select(GQ),
#produce long form genotype call data
data.frame(NGTlim)%>%
pivot_longer(cols=everything(), names_to="Cell",
values_to="NGT")%>%dplyr::select(NGT)) %>%
#filter DP and GQ
filter(DP>DP_cut&
>GQ_cut)%>%
GQ#filter AF for each genotype call
mutate(pass=case_when(
==1&(AF>AF_cut)&(AF<(100-AF_cut)) ~ "include",
NGT==1&((AF<=AF_cut)|(AF>=(100-AF_cut))) ~ "exclude",
NGT==2&AF>=(100-AF_cut) ~ "include",
NGT==2&AF<(100-AF_cut) ~ "exclude",
NGT==0&AF<=AF_cut ~ "include",
NGT==0&AF>AF_cut ~ "exclude",
NGTTRUE ~"other"
%>%
))filter(pass=="include")
# here we check to make sure a variant is not NA (genotype 3) in >50% of cells
# we can also filter out variants that are likely SNPs and only show up as WT, het or homozygous This assumes SNPs would never undergo allele dropout, which is pretty unlikely, so this filter is not realistic. I leave it here incase it is helpful to someone.
<-filtered_long%>%
final_variants group_by(variants)%>%
summarize(diversity=sum(c(0,1,2)%in%NGT),
gt.mv=(length(NGT)/length(all_of(cell_barcodes)))*100)%>%
filter(#diversity>1&
>variant_presence_cutoff)%>%
gt.mvpull(variants)
# here we filter for cells that now contain genotype information for atleast 50% of our curated set of variants.
<- filtered_long%>%
final_cellsfilter(variants%in%final_variants)%>%
group_by(Cell)%>%
summarize(gt.mc=(length(NGT)/length(all_of(final_variants)))*100)%>%
filter(gt.mc>=cell_genotype_cutoff)%>%
pull(Cell)
# lastly we reconsstruct a new NGT matrix of cell-genotype pairs that passsed the above filters.
<- filtered_long%>%
final_NGTfilter(Cell%in%final_cells&variants%in%final_variants)%>%
pivot_wider(id_cols=Cell,names_from=variants,values_from=NGT)
Below is some code on how we can annotate the variants. I manually curated a transcript ID table for our genes of interest so that when the protein changes were reported they were consistent with the common hotspot mutations in which we are interested. The lines below introduce the required packages and how to create a limted TxDB object. would suggest just running this once and saving an rds object. It’s a little time intensive.
library(VariantAnnotation)
library(GenomicRanges)
library(magrittr)
require(RMariaDB)
library(plyranges)
library(BSgenome.Hsapiens.UCSC.hg19)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
<-read.csv("./data/annotation_key.csv")
annotation_key <- makeTxDbFromUCSC(genome="hg19",
hg19refseq_txdb transcript_ids=annotation_key$ccds_id,
tablename="ccdsGene")
%<>%inner_join(select(hg19refseq_txdb,
annotation_keykeys=annotation_key$ccds_id,
columns=c("TXID","TXNAME"),
keytype = "TXNAME"),
by=c("ccds_id"="TXNAME"))%>%
mutate(TXID=as.character(TXID))
For H5 files from v2 pipeline.
<-read.csv("./data/banned_list.csv")
banned <-data.frame(do.call(cbind,
SNV_math5read(file=file,name="/assays/dna_variants/ca/",
index=list(VAF_select)))) %>%
filter(id%in%final_variants&
!id%in%banned[,1])%>%
mutate(ALT=ifelse(ALT=="*","N",ALT))%>%
mutate(CHROM=paste0("chr",CHROM))
For loom files from v1 pipeline
<- connect(filename = "./data/MSK91_IGO_09868_L_7_S7_R_191122041131.cells.loom", mode = "r")
lfile <-data.frame(do.call(cbind,
SNV_matlist("ALT"=lfile$row.attrs$ALT[],
"CHROM"=lfile$row.attrs$CHROM[],
"POS"=lfile$row.attrs$POS[],
"QUAL"=lfile$row.attrs$QUAL[],
"REF"=lfile$row.attrs$REF[],
"amplicon"=lfile$row.attrs$amplicon[],
"id"=lfile$row.attrs$id[]))) %>%
filter(id%in%final_variants&
!id%in%banned[,1])%>%
mutate(ALT=ifelse(ALT=="*","N",ALT))%>%
mutate(CHROM=paste0("chr",CHROM))
Now we will map the variants of interest to genes, and if they land in a coding exon, predict the amino acid change. For INDELs this gets messy, still trying to figure out how too name them nicely (and open to feedback!). The printed results below are for the H5 input described above.
#necessary for meaningful GRangess
$REF<-as(SNV_mat$REF, "DNAStringSet")
SNV_mat$ALT<-as(SNV_mat$ALT, "DNAStringSet")
SNV_mat
<-makeGRangesFromDataFrame(SNV_mat,
variant_gRangeseqnames.field = "CHROM",
start.field="POS",
end.field="POS",
keep.extra.columns=TRUE)
#necessary for downstream joining of
$QUERYID<-1:length(variant_gRange)
variant_gRange
#identify and isolate non coding variants
<- locateVariants(variant_gRange,
non_coding_variants
hg19refseq_txdb,AllVariants())%>%
data.frame()%>%
filter(as.character(LOCATION)!="coding")%>%
inner_join(variant_gRange,by="QUERYID",copy=TRUE)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 5 out-of-bound ranges located on sequences 17,
## 24, 26, and 30. Note that ranges located on a sequence whose length is
## unknown (NA) or on a circular sequence are not considered out-of-bound
## (use seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim these
## ranges. See ?`trim,GenomicRanges-method` for more information.
#identify and isolate coding variants
<- predictCoding(variant_gRange,
coding_variants
hg19refseq_txdb, seqSource=Hsapiens,
varAllele=variant_gRange$ALT)%>%
data.frame()
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 5 out-of-bound ranges located on sequences 17,
## 24, 26, and 30. Note that ranges located on a sequence whose length is
## unknown (NA) or on a circular sequence are not considered out-of-bound
## (use seqlengths() and isCircular() to get the lengths and circularity
## flags of the underlying sequences). You can use trim() to trim these
## ranges. See ?`trim,GenomicRanges-method` for more information.
#Bind it all together into one big table.
<- bind_rows(non_coding_variants,coding_variants) %>%
out inner_join(annotation_key)%>%
mutate(AA=ifelse(!is.na(REFAA),
paste0(gene_name,".",REFAA,PROTEINLOC,VARAA),
paste0(gene_name,".intronic")))%>%
::select(id,AA)
dplyr
#append Bulk VAF for reference in future cutoffs and allele selection
<-data.frame(out,
final_mutation_info"Bulk_VAF"=colSums(final_NGT[,out$id], na.rm=TRUE)/
nrow(final_NGT)*2)*100)
(
print(head(final_mutation_info))
## id AA Bulk_VAF
## 1 chr2:25463121:C/T DNMT3A.intronic 0.1979414
## 2 chr2:25463483:G/A DNMT3A.intronic 45.1702296
## 3 chr2:25464334:A/C DNMT3A.intronic 1.3855899
## 4 chr2:198266943:C/T SF3B1.intronic 97.9018211
## 5 chr2:209113048:G/GA IDH1.intronic 5.0910530
## 6 chr3:128202879:C/T GATA2.intronic 0.1821061