Chapter 8 DESeq analysis: Progression/Immunotherapy cohort

This document sets up DESeq runs to compare:

  • DN vs Ep samples
  • growing vs stable samples (all 3 fractions)
  • treatments (all 3 fractions)
  • spatial patterns (cd8 or epithelial fractions)

The outputs of these analyses will be used for Gene Set Enrichment Analysis, using MSigDB databases (c2, c5, hallmark), alongside pathways from Metacore (Process Networks, Pathway Maps)

8.1 DN vs Ep

In section 6.2, we have noticed that some DN samples had expression of epithelial markers. Here, we perform a differential gene expression analysis to find genes which are different between these two fractions.

Below is a summary of the number of differential genes, using p value cut off of 0.05 and log2 fold change of 1.5 and base expression of 100+.

epidx=as.character(infoTableFinal$SampleID[which(infoTableFinal$Cohort=="Progression" & infoTableFinal$Fraction!="CD45")])
DNEpdds=DESeqDataSetFromMatrix(allstarFinal[ ,epidx], infoTableFinal[epidx, ], design=~Fraction) ## 
DNEpdds=DESeq(DNEpdds)
resDNep=results(DNEpdds, contrast=c("Fraction", "DN", "Ep"))
summary(resDNep)
## 
## out of 16929 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3331, 20%
## LFC < 0 (down)     : 1972, 12%
## outliers [1]       : 0, 0%
## low counts [2]     : 1970, 12%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

vsdDNep=vst(DNEpdds)

print('significant differential genes')
## [1] "significant differential genes"

resDNeprslt2=resDNep[which(resDNep$padj<0.05 & abs(resDNep$log2FoldChange)>1.5 &
                             resDNep$baseMean>100), ]
resDNeprslt2$CellMarker=ifelse(rownames(resDNeprslt2)%in%unlist(GeneListRat), 1, 0)
resDNeprslt2=resDNeprslt2[order(resDNeprslt2$CellMarker, abs(resDNeprslt2$log2FoldChange), decreasing = T), ]


DT::datatable(as.data.frame(resDNeprslt2), rownames=F, class='cell-border stripe',
          extensions="Buttons", options=list(dom="Bfrtip", buttons=c('csv', 'excel')))

HighExprGenes=rownames(resDNeprslt2)[which(resDNeprslt2$baseMean>100 & resDNeprslt2$log2FoldChange<0) ]

colSide=factor(infoTableFinal$Fraction[ which(infoTableFinal$Fraction!="CD45" & infoTableFinal$Cohort=="Progression")])
colSide2=factor(infoTableFinal$Growth[ which(infoTableFinal$Fraction!="CD45"& infoTableFinal$Cohort=="Progression")])
colSide3=factor(infoTableFinal$Treatment[ which(infoTableFinal$Fraction!="CD45"& infoTableFinal$Cohort=="Progression")])
colSide4=cut(infoTableFinal$CD8Frac[ which(infoTableFinal$Fraction!="CD45"& infoTableFinal$Cohort=="Progression")], c(-1, 0.025, 0.05, 0.1, 1), brewer.pal(4, "Greens"))
colSide5=cut(infoTableFinal$IFEpCAM[which(infoTableFinal$Fraction!="CD45"& infoTableFinal$Cohort=="Progression")], c(-1, 0.05, 0.1, 0.15, 1), brewer.pal(4, "Greens"))

TableCols=rbind(palette()[colSide], palette()[colSide2], palette()[colSide3], as.character(colSide4), as.character(colSide5))
rownames(TableCols)=c("Fraction", "growth", "treatment", "cd8 fraction", "cd8 int. fraction")
colnames(TableCols)=colnames(assay(vsdDNep))

#pdf(sprintf("rslt/DESeq/difference_between_DN_ep_samples_%s.pdf", Sys.Date()), height=7, width=8)

t1=assay(vsdDNep)[rownames(vsdDNep)%in%unlist(GeneListRat), ]

heatmap.2(t1, trace="none", col=RdBu[11:1], ColSideColors = palette()[colSide], scale="row", main="cell type specific markers")


t2=assay(vsdDNep)[which(rownames(vsdDNep)%in%RatCosmic & rownames(vsdDNep)%in%rownames(resDNeprslt2)), ]
heatmap.2(t2, trace="none", col=RdBu[11:1], ColSideColors = palette()[colSide], scale="row", main="DEG cosmic genes")


t2=assay(vsdDNep)[which( rownames(vsdDNep)%in%HighExprGenes), ]
a1=heatmap.2(t2, trace="none", col=RdBu[11:1], ColSideColors = palette()[colSide], scale="row", main="Enriched in Ep genes")

#heatmap.2(t2[ ,-grep("CD45", colnames(t2))], trace="none", col=RdBu[11:1], ColSideColors = palette()[colSide2], scale="row", main="Enriched in Ep genes")

TableCols2=TableCols[ ,a1$colInd]

colOutput=melt(TableCols2)
colOutput$Var2=factor(colOutput$Var2, levels=unique(colOutput$Var2))

#write.csv(resDNeprslt2, file=sprintf("outputs/DESeq/difference_between_DN_Ep(ref)_samples_%s.csv", Sys.Date()))

8.2 No. samples in comparisons

Separate out the tables into CD45, DN and Ep fractions. Below, we check the number of samples in each subgroup, based on

  1. Treatment
  2. growth rate
  3. fraction
infoTableFinal$Treatment=factor(infoTableFinal$Treatment)#[which(is.na(infoTableFinal$Treatment))]="PDL1"
infoTableFinal$SpatialManual=factor(infoTableFinal$SpatialManual)
infoTableFinal$treatA=factor(ifelse(infoTableFinal$Treatment=="Vehicle", "control", "imm"))
infoTableFinal$MHcut=factor(ifelse(infoTableFinal$MHEpCAM>=median(df.Spatial$MH.EpCAM, na.rm = T), "inf", "res"))
infoTableFinal$CD8FracCut=factor(ifelse(infoTableFinal$CD8Frac>=median(df.Spatial$CD8frac, na.rm = T), "high", "low"))
infoTableFinal$Growth=factor(infoTableFinal$Growth)

# print('number of samples for each treatment and growth rate')
# table(infoTableFinal$Fraction, infoTableFinal$Treatment, infoTableFinal$Growth)

a1=table(infoTableFinal$Fraction, infoTableFinal$Treatment, infoTableFinal$Growth)
par(mfrow=c(3,2))
ContTable(t(a1[ , , 1]), "growing", F, ylabL="fraction", xlabL="treatment")
ContTable(t(a1[ , , 2]), "stable", F, ylabL="fraction", xlabL="treatment")

a1=table(infoTableFinal$Fraction, infoTableFinal$Treatment, infoTableFinal$MHcut)
ContTable(t(a1[ , , 1]), "infiltrating (MH)", F, ylabL="fraction", xlabL="treatment")
ContTable(t(a1[ , , 2]), "restricted (MH)", F, ylabL="fraction", xlabL="treatment")

a1=table(infoTableFinal$Fraction, infoTableFinal$Treatment, infoTableFinal$CD8FracCut)
ContTable(t(a1[ , , 1]), "CD8 low", F, ylabL="fraction", xlabL="treatment")
ContTable(t(a1[ , , 2]), "CD8 high", F, ylabL="fraction", xlabL="treatment")

Note that based on the above tables, there some comparisons which are slightly imbalanced:

  • Note that PDL1/LY treatment with high cd8 content results in very few epithelial cells for analysis
  • There are few “stable” samples for comparisons
  • LY samples are overwhelmingly “infiltrating”

8.3 Set-up cell-type specific the comparisons

Setup the following comparisons for each cell type (Separate out the tables into CD45, DN and Ep fractions)

\[ expression ~ growing + treatment + batch (exclude 1) \]

Compare:

  1. Treatment (EpddsTreat)
  2. Any immunotherapy treatment (Epdds)
  3. Growth alone: (EpddsGrowth) - change this to numeric
  4. MH index: (EpddsStrMH)
  5. CD8 content (EpddsCD8)

We remove genes which have 0 counts in more than half the samples, and genes which have a row sum less than \(10^{(log10(mean(rowsums))-log10(sd(rowsums)))}\)

# find epithelial samples
epidx=as.character(infoTableFinal$SampleID[which(infoTableFinal$Cohort=="Progression" & infoTableFinal$Fraction=="Ep" & !is.na(infoTableFinal$Growth))])

# 2: ep
Epdds=DESeqDataSetFromMatrix(allstarFinal[ ,epidx], infoTableFinal[epidx, ], design=~treatA+factor(Batch))
a1x=rowSums(counts(Epdds))
a1b=apply(counts(Epdds), 1, function(c) sum(c!=0))
sd1vals=mean(log10(a1x+1))-sd(log10(a1x+1))
keep=which(rowSums(counts(Epdds))>10^sd1vals)
keep2=which(apply(counts(Epdds), 1, function(c) sum(c!=0))> (ncol(Epdds)/2))
Epdds=Epdds[intersect(keep, keep2), ]
Epdds=DESeq(Epdds)
# 1: ep
EpddsTreat=Epdds
design(EpddsTreat)=~Treatment+factor(Batch)
EpddsTreat=DESeq(EpddsTreat)
# 3. ep
EpddsGrowth=Epdds
design(EpddsGrowth)=~Growth+factor(Batch)  ## error is here!
EpddsGrowth=DESeq(EpddsGrowth)
# 4. ep
EpddsStrMH=Epdds[ ,which(!is.na(Epdds$MHcut))]
design(EpddsStrMH)=~MHcut+factor(Batch)
EpddsStrMH=DESeq(EpddsStrMH)
# 5.ep
Epddscd8=Epdds[ ,which(!is.na(Epdds$CD8FracCut))]
design(Epddscd8)=~CD8FracCut+factor(Batch)
Epddscd8=DESeq(Epddscd8)

#R1=results(EpddsTreatG, contrast = list(c("Treatment_LY_vs_Vehicle", "Growthgrowing")))
## CD45 samples

cdidx=as.character(infoTableFinal$SampleID[which(infoTableFinal$Cohort=="Progression" & infoTableFinal$Fraction=="CD45" & !is.na(infoTableFinal$Growth))])
CDdds=DESeqDataSetFromMatrix(allstarFinal[ ,cdidx], infoTableFinal[cdidx, ], design=~treatA+factor(Batch)) ## change class
a1x=rowSums(counts(CDdds))
a1b=apply(counts(CDdds), 1, function(c) sum(c!=0))
sd1vals=mean(log10(a1x+1))-sd(log10(a1x+1))
keep=which(rowSums(counts(CDdds))>10^sd1vals)
keep2=which(apply(counts(CDdds), 1, function(c) sum(c!=0))> (ncol(CDdds)/2))
CDdds=CDdds[intersect(keep, keep2), ]
#1. CD45
CDdds=DESeq(CDdds)
#2, CD45
CDddsTreat=CDdds
design(CDddsTreat)=~Treatment+factor(Batch)
CDddsTreat=DESeq(CDddsTreat)

# 3. Cd45
CDddsGrowth=CDdds
design(CDddsGrowth)=~Growth+factor(Batch)
CDddsGrowth=DESeq(CDddsGrowth)
#4
CDddsStrMH=CDdds[ ,which(!is.na(CDdds$MHcut))]
design(CDddsStrMH)=~MHcut+factor(Batch)
CDddsStrMH=DESeq(CDddsStrMH)

CDddscd8=CDdds[ ,which(!is.na(CDdds$CD8FracCut))]
design(CDddscd8)=~CD8FracCut+factor(Batch)
CDddscd8=DESeq(CDddscd8)

## DN samples

DNidx=as.character(infoTableFinal$SampleID[which(infoTableFinal$Cohort=="Progression" & infoTableFinal$Fraction=="DN" & !is.na(infoTableFinal$Growth))])
DNdds=DESeqDataSetFromMatrix(allstarFinal[ ,DNidx], infoTableFinal[DNidx, ], design=~treatA+factor(Batch)) ## change class
a1x=rowSums(counts(DNdds))
a1b=apply(counts(DNdds), 1, function(c) sum(c!=0))
# par(mfrow=c(1,2))
# hist(log10(a1x+1), main="log10 total counts")
# hist((a1b+1), main="Non-zero entries")
sd1vals=median(log10(a1x+1))-sd(log10(a1x+1))
keep=which(rowSums(counts(DNdds))>10^sd1vals)
keep2=which(apply(counts(DNdds), 1, function(c) sum(c!=0))> (ncol(DNdds)/2))
keep=which(rowSums(counts(DNdds))>(ncol(DNdds)))
keep2=which(apply(counts(DNdds), 1, function(c) sum(c!=0))> (ncol(DNdds)/2))
DNdds=DNdds[intersect(keep, keep2), ]
DNdds=DESeq(DNdds)

DNddsTreat=DNdds
design(DNddsTreat)=~Treatment+factor(Batch)
DNddsTreat=DESeq(DNddsTreat)

DNddsGrowth=DNdds
design(DNddsGrowth)=~(Growth)+factor(Batch)
DNddsGrowth=DESeq(DNddsGrowth)

DNddsStrMH=DNdds[ ,which(!is.na(DNdds$MHcut))]
design(DNddsStrMH)=~MHcut+factor(Batch)
DNddsStrMH=DESeq(DNddsStrMH)

DNddscd8=DNdds[ ,which(!is.na(DNdds$CD8FracCut))]
design(DNddscd8)=~CD8FracCut+factor(Batch)
DNddscd8=DESeq(DNddscd8)

save(Epdds, EpddsTreat,CDdds, CDddsTreat, DNdds, DNddsTreat,
    EpddsGrowth, CDddsGrowth, DNddsGrowth, 
      DNddscd8,  DNddsStrMH,
     CDddscd8,  CDddsStrMH,
      Epddscd8,  EpddsStrMH,
     file=sprintf("outputs/subfraction_analysis_%s.RData", Sys.Date()))

# Alist=results(EpddsSpatMan, c("SpatialManual", "Infiltrating", "restricted"))
# Clist=results(CDddsSpatMan, c("SpatialManual", "Infiltrating", "restricted"))
# Dlist=results(DNddsSpatMan, c("SpatialManual", "Infiltrating", "restricted"))

#load("rslt/DESeq/subfraction_analysis_2020-09-29.RData")
#write.csv(a2, sprintf("rslt/DESeq/Spatial_Manual_comparison_%s.csv", Sys.Date()))

These comparisons are saved in the temporary outputfile outputs/subfraction_analysis_2022-02-13.RData. In each comparison, there are 13134 genes compared.

8.4 PCA plots

Below, we look at PCA plots with information on treatment, growth, spatial patterns overlaid.

These are separated based on cell type

8.4.1 EpCAM

We see that treatment nor growth separates out these samples. Below is a pca plot of the result, we can also print out the tables for each comparison

Epdds$cd8MH=paste(Epdds$CD8FracCut, Epdds$MHcut)

vstEp=vst(Epdds, blind=F)
#vstEp$Treatment2=ifelse(vstEp$Treatment=="Vehicle", "no", "yes")
vsdLimmaEp=vstEp
assay(vsdLimmaEp)<- limma::removeBatchEffect(assay(vsdLimmaEp),vsdLimmaEp$Batch)

plotPCA(vsdLimmaEp, c("Growth"))+scale_color_manual(values=c(ColSizeb, "#31A354"))+theme_bw()+ggtitle("Growth Ep")

a2=plotPCA(vsdLimmaEp, c("Treatment"))+theme_bw()+ggtitle("Treatment Ep")+scale_color_manual(values=c(ColMerge[ ,1], "#5D5D5D"))
a2


DT::datatable(a2$data, rownames=F, class='cell-border stripe',
          extensions="Buttons", options=list(dom="Bfrtip", buttons=c('csv', 'excel')))

#write.csv(a2$data, file="nature-tables/3i_epcam.csv")

8.4.2 CD45

Note there is a strong batch effect: need to apply limma remove Batch Effect prior to creating PCA plots

#CDdds$cd8MH=paste(CDdds$CD8FracCut, CDdds$MHcut)

vstCD=vst(CDdds, blind=F)
plotPCA(vstCD, "Batch")

vsdLimmaCD=vstCD

assay(vsdLimmaCD)<- limma::removeBatchEffect(assay(vsdLimmaCD),vstCD$Batch)
# library(cluster)

ax1=plotPCA(vsdLimmaCD, c("Batch"), returnData=T)
plotPCA(vsdLimmaCD, c("Growth"))+scale_color_manual(values=c(ColSizeb, "#31A354"))+theme_bw()+ggtitle("Growth CD45")

a2=plotPCA(vsdLimmaCD, c("Treatment"))+theme_bw()+ggtitle("Treatment CD45")+scale_color_manual(values=c(ColMerge[,1], "#5D5D5D"))
a2

DT::datatable(a2$data, rownames=F, class='cell-border stripe',
          extensions="Buttons", options=list(dom="Bfrtip", buttons=c('csv', 'excel')))
#write.csv(a2$data, file="nature-tables/3i_cd45.csv")

8.4.3 DN

DNdds$cd8MH=paste(DNdds$CD8FracCut, DNdds$MHcut)

vstDN=vst(DNdds, blind=T)
vstDN$Treatment2=ifelse(vstDN$Treatment=="Vehicle", "no", "imm")

vsdLimmaDN=vstDN

assay(vsdLimmaDN)<- limma::removeBatchEffect(assay(vsdLimmaDN),vstDN$Batch)

ax1=plotPCA(vsdLimmaDN, c("Batch"), returnData=T)
plotPCA(vsdLimmaDN, c("Growth"))+scale_color_manual(values=c("orange", "#31A354"))+theme_bw()+ggtitle("Growth DN")

a2=plotPCA(vsdLimmaDN, c("Treatment"))+theme_bw()+ggtitle("Treatment DN")+scale_color_manual(values=c(ColMerge[,1], "#5D5D5D"))

DT::datatable(a2$data, rownames=F, class='cell-border stripe',
          extensions="Buttons", options=list(dom="Bfrtip", buttons=c('csv', 'excel')))


#write.csv(a2$data, file="nature-tables/3i_dn.csv")

8.5 Pearson correlation plots of samples

Below are pearson correlation plots of the samples in each comparison, colored by growth status

Xa1=cor(assay(vsdLimmaEp))
heatmap.2(Xa1, col=RdBu[11:1], ColSideColors = ColSizeb[vstEp$Growth], trace="none", main="Ep")


DT::datatable(Xa1, rownames=F, class='cell-border stripe',
          extensions="Buttons", options=list(dom="Bfrtip", buttons=c('csv', 'excel')))
#write.csv(Xa1, file="nature-tables/Ext3j.csv")


Xa1=cor(assay(vsdLimmaCD))
heatmap.2(Xa1, col=RdBu[11:1], ColSideColors = ColSizeb[vstCD$Growth], trace="none", main="CD45")

DT::datatable(Xa1, rownames=F, class='cell-border stripe',
          extensions="Buttons", options=list(dom="Bfrtip", buttons=c('csv', 'excel')))

#write.csv(Xa1, file="nature-tables/FIgS2-cd45-pca-cor-plot.csv")


Xa1=cor(assay(vsdLimmaDN))
heatmap.2(Xa1, col=RdBu[11:1], ColSideColors = ColSizeb[vstDN$Growth], trace="none", main="DN")


#dev.off()
DT::datatable(Xa1, rownames=F, class='cell-border stripe',
          extensions="Buttons", options=list(dom="Bfrtip", buttons=c('csv', 'excel')))

#write.csv(Xa1, file="nature-tables/DNsamples_progression_similarity.csv")