Chapter 21 TCR clonotype analysis

#Annotation file: Load in an annotation file indicating all the samples, batch effects etc 

TRUST4path="../data/TRUST4_TCR/"
TRUST4files=dir(TRUST4path,"*.tsv")
#FileNames=sapply(strsplit(TRUST4files, "_report.tsv"), function(x) x[1])
matchidx=match(strsplit(TRUST4files, ".tsv"), infoTableFinal$TumorIDnew)

#RepNames=paste(infoTableFinal$Rat_ID, tempAnnot$Location, tempAnnot$Fraction, sep="_")[matchidx]
# load in all the files and save the list to file
RatTrust=lapply(TRUST4files, function(x) read.delim(paste(TRUST4path, x, sep=""), sep="\t"))
names(RatTrust)=infoTableFinal$SampleID[matchidx]
names(RatTrust)=gsub("Ep", "CD45", names(RatTrust))

TRUST4path="../data/TRUST4_TCR/matchedNormal/"
TRUST4files=dir(TRUST4path,"*.tsv")
#FileNames=sapply(strsplit(TRUST4files, "_report.tsv"), function(x) x[1])
FileNames=strsplit(TRUST4files, "*.tsv")
RatTrustNormal=lapply(TRUST4files, function(x) read.delim(paste(TRUST4path, x, sep=""), sep="\t"))
names(RatTrustNormal)=FileNames

RatTrust=c(RatTrust, RatTrustNormal)

save(RatTrust, file=sprintf("outputs/Rat_TRUST4_TCR_%s.RData", Sys.Date()))

We will refine the above plot to contain only the CD45 population and assess clonotypes:

with at least 2 reads
a CDR3aa sequence (not out of frame)
appears in TRAC, TRBC TRDC or TRGC

NSamples=sapply(RatTrust, nrow)
NSamplesCt2=sapply(RatTrust, function(x) length(which(x$X.count>=2)))
NSamplesT=sapply(RatTrust, function(x) length(which(x$C%in%c("TRAC", "TRBC", "TRDC", "TRGC"))))
NSamplesCDR3comp=sapply(RatTrust, function(x) length(which(x$CDR3aa!="out_of_frame")))
NSamplesCDR3compCt2=sapply(RatTrust, function(x) length(which(x$X.count>=2 &x$C%in%c("TRAC", "TRBC", "TRDC", "TRGC")&x$CDR3aa!="out_of_frame" )))

TableOut=cbind(NSamples, NSamplesCt2,NSamplesT, NSamplesCDR3comp, NSamplesCDR3compCt2)

RatTrustB=lapply(RatTrust, function(x) x[which(x$X.count>=2 &x$C%in%c("TRAC", "TRBC", "TRDC", "TRGC") & x$CDR3aa!="out_of_frame" ), ])
for (i in 1:length(RatTrustB)){
  RatTrustB[[i]]$frequency=RatTrustB[[i]]$X.count/sum(RatTrustB[[i]]$X.count)
}

We will assess TCR diversity using the following metrics:

** Shannon index **

The shannon index is computed by:

filtering through reads of interest
recalculate the fraction such that the new list sums to 1
compute entropy as the sum of log(freq_x)*freq_x amongst all populations x
compute the maximum entropy expected for that case -log(1/N), where N is the number of populations present
to determine confidence intervals, bootstrap the population (500 times) and compute the expected entropy

** Gini index **

The Gini index can be considered as an inverse of the Shannon index

** Top Clonotypes **

We will see the proportion of the BCR repertoire which is computed using the top 10 frequent clones. This will give an idea of whether there is a clonal expansion

# compute values here
Div1=sapply(RatTrustB, function(x) -sum(x$frequency*log(x$frequency), na.rm=T))
EDiv=sapply(NSamplesCDR3compCt2, function(x) -log(1/x))
NormDiv=Div1/EDiv

## Shannon index
# bootstrap rslt
BSrslt=sapply(RatTrustB, function(x) tryCatch(BootstrapShannonIdx(x[ ,1], 1000), error=function(e) c(NA, NA)))
# divide the CI by the maximum possible diversity

BSCI=t(BSrslt)/EDiv

matchidx=match(rownames(BSCI), infoTableFinal$SampleID)

df=data.frame(sample=rownames(BSCI), Val=NormDiv, Lower=BSCI[ , 1], Upper=BSCI[ ,2 ], Type=infoTableFinal$Fraction[matchidx], Batch=infoTableFinal$Batch[matchidx], Growth=infoTableFinal$Growth[matchidx])
## Gini index

Gini=sapply(RatTrustB, function(x) gini(x$frequency))

PGini=sapply(RatTrustB, function(x) tryCatch(PermuteGini(x$X.count, 1000), error=function(e) c(NA, NA)))

df$Calc="shannon"

df2=data.frame(sample=df$sample, Val=Gini, Lower=PGini[1, ], Upper=PGini[2, ],  Calc="Gini", Type=df$Type, Batch=df$Batch,
               Growth=df$Growth)

df3=data.frame(sample=rownames(TableOut), Val=TableOut[ ,5], Lower=NA, Upper=NA,  Calc="NClon", Type=df$Type, Batch=df$Batch, Growth=df$Growth)

dfAll=rbind(df, df2, df3)

## Top Clonotypes

TopClones=sapply(RatTrustB, function(x) sum(x$frequency[1:5], na.rm = T))
TopClones[which(NSamplesCDR3compCt2==0)]=0
df2$Val=TopClones
df2$Calc="Top10"
df2$Lower=TopClones
df2$Upper=TopClones

dfAll=rbind(dfAll, df2)

dfAll$Batch=as.character(dfAll$Batch)

dfAll$Batch[grep("^N_", rownames(dfAll))]="match"
dfAll$Type[grep("^N_", rownames(dfAll))]="CD45"

Summary plots

ggplot(dfAll[dfAll$Type=="CD45" & dfAll$Calc=="shannon", ], aes(x=sample, y=Val, fill=Type))+geom_bar(stat="identity")+facet_grid(~Batch, scales="free_x", space="free")+geom_errorbar( aes(ymin=Lower, ymax=Upper), width=.2)+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ggtitle("Shannon idx of TCR diversity")


ggplot(dfAll[dfAll$Type=="CD45" & dfAll$Calc=="Gini", ], aes(x=sample, y=Val, fill=Type))+geom_bar(stat="identity")+facet_grid(~Batch, scales="free_x", space="free")+geom_errorbar( aes(ymin=Lower, ymax=Upper), width=.2)+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ggtitle("Shannon idx of TCR diversity")


ggplot(dfAll[dfAll$Type=="CD45" & dfAll$Calc=="Top10", ], aes(x=sample, y=Val, fill=Type))+geom_bar(stat="identity")+facet_grid(~Batch, scales="free_x", space="free")+geom_errorbar( aes(ymin=Lower, ymax=Upper), width=.2)+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ggtitle("Shannon idx of TCR diversity")


ggplot(dfAll[dfAll$Type=="CD45" & dfAll$Calc=="NClon", ], aes(x=sample, y=Val, fill=Type))+geom_bar(stat="identity")+facet_grid(~Batch, scales="free_x", space="free")+geom_errorbar( aes(ymin=Lower, ymax=Upper), width=.2)+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ggtitle("Shannon idx of TCR diversity")

### Characterisation cohort

CharTemp=dfAll[which((dfAll$Batch==1 | dfAll$Batch=="match" ) & dfAll$Calc!="NClon"), ]

a1=sapply(unique(factor(CharTemp$Calc)), function(x) wilcox.test(CharTemp$Val[CharTemp$Calc==x & CharTemp$Batch==1], CharTemp$Val[CharTemp$Calc==x & CharTemp$Batch=="match"])$p.value)
p=ggplot(CharTemp, aes(x=Batch, y=Val))+geom_boxplot()+facet_grid(~Calc)+ggtitle(sprintf("wilcox pval: Gini %s, Shannon %s, Top10 = %s", round(a1[2],2), round(a1[1],2), round(a1[3],2)))+theme_bw()+geom_jitter(col="grey")
print(p)


CharTemp=dfAll[which((dfAll$Batch==1 | dfAll$Batch=="match" ) & dfAll$Calc=="NClon"), ]
## append with the samples with no TCR samples detected
CharTemp=rbind(CharTemp, data.frame(sample=c("NMU12_LA",    "NMU13_LAU", "NMU13_LLU"), Val=rep(1,3), Lower=NA, Upper=NA, Type="CD45", Batch=1, Growth=NA, Calc="NClon"))
a1=wilcox.test(CharTemp$Val[CharTemp$Calc=="NClon" & CharTemp$Batch==1], CharTemp$Val[CharTemp$Calc=="NClon" & CharTemp$Batch=="match"])$p.value
ggplot(CharTemp, aes(x=Batch, y=Val))+geom_boxplot()+facet_grid(~Calc)+ggtitle(sprintf("wilcox pval: Gini %s, Shannon %s, Top10 = %s", round(a1[2],2), round(a1[1],2), round(a1[3],2)))+theme_bw()+geom_jitter(col="grey")

21.0.1 Progression cohort

CharTemp=dfAll[which((dfAll$Batch==2 | dfAll$Batch==3 ) & dfAll$Calc!="NClon"), ]

a1=sapply(unique(factor(CharTemp$Calc)), function(x) wilcox.test(CharTemp$Val[CharTemp$Calc==x]~CharTemp$Growth[CharTemp$Calc==x])$p.value)
p=ggplot(CharTemp, aes(x=Growth, y=Val, col=Growth))+geom_boxplot()+facet_grid(~Calc)+ggtitle(sprintf("wilcox pval: Gini %s, Shannon %s, Top10 = %s", round(a1[2],2), round(a1[1],2), round(a1[3],2)))+geom_point()+theme_bw()+scale_colour_manual(values=c(ColSizeb, "black"))

print(p)


a1=wilcox.test(CharTemp$Val~CharTemp$Growth)$p.value
CharTemp=dfAll[which((dfAll$Batch==2 | dfAll$Batch==3 ) & dfAll$Calc=="NClon"), ]
ggplot(CharTemp, aes(x=Growth, y=Val, col=Growth))+geom_boxplot()+facet_grid(~Calc)+ggtitle(sprintf("wilcox pval: NClon = %s", round(a1[1],2)))+geom_point()+theme_bw()+scale_colour_manual(values=c(ColSizeb, "black"))