Chapter 1 Prerequisites
1.1 Packages and Software
The following packages are required to conduct the analyses described below.
In house scripts are deposited in the rscript folder.
library(AnnotationHub)
library(beeswarm)
library(biomaRt)
library(Biostrings)
library(colorspace)
library(DESeq2)
library(dplyr)
library(DT)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(forestplot)
library(GenVisR)
library(GenomicFeatures)
library(ggplot2)
library(ggrepel)
library(gplots)
library(GSEABase)
library(GSVA)
library(heatmap.plus)
library(HTSanalyzeR2)
library(kableExtra)
library(knitr)
library(limma)
library(matrixStats)
library(pamr)
library(reshape2)
library(rmarkdown)
library(RColorBrewer)
library(scales)
library(spatstat)
library(survival)
library(tcR)
library(vcfR)
library(xlsx)
library(writexl)
DiffCols=hue_pal()(8)
palette(brewer.pal(9, "Set1"))
RdBu=brewer.pal(11, "RdBu")
SetCols=brewer.pal(12, "Set3")
source("../rscript/cnFreq_fn.R") #modified version of GenVisR
source("../rscript/merge_contig.R")
source("../rscript/gseaCode.R")
source("../rscript/ContingencyTable.R")
source("../rscript/PvalueHeatMap.R")
source("../rscript/BootstrapShannonIdx.R")
source("../rscript/CreateRnor87db.R")
source("../rscript/FindRatAAHomolog.R")
source("../rscript/FindTriNucleotideContext.R")
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
ColMerge=matrix(c("#FFC82F", "#FFEDBC", "#73FDFE","#D2FFFF", "#FF41FF", "#FECAFF", "#5D5D5D", "#BEBEBE"), ncol=2, byrow = T)
rownames(ColMerge)=c("LY","PDL1", "PDL1+LY","Vehicle")
ColSize=c("#008E00", "#FF9300")
names(ColSize)=c("small", "big")
ColSizeb=ColSize[2:1]
ColSizec=c(ColSize[2], "purple", ColSize[1])
hclust.ave <- function(x) hclust(x, method="average")
Hsedb<-EnsDb.Hsapiens.v86
1.2 External software
The following external software was utilised:
Software | Function |
---|---|
bwa | Alignment of WGS data to reference |
GATK4 | Mutation calling, done by NYGC. Mutation calling from RNA (Haplotype caller) |
strelka | Mutation calling, done by NYGC |
BICseq | CNV calling |
GEM3 | create mappability files for CNV calling |
STAR | Alignment of RNAseq data |
RSEM | Calculate RSEM, TPM, FPKM from RNAseq data |
TRUST4 | assignment of T and B cell clonotypes from RNA-seq data |
Oncotator | Annotation of genetic variants |
QuPath | Tool for cell segmentation and extraction of features from IF images |
samtools, bcftools | querying and dispalying information from bam files, extracting allelic depth at specific genomic locations |
CIBERSORT | inferring immune composition from RNA |
lumpy | structural variants |
PAM50 | code from parker et al 2009 to infer PAM50 subtypes |
1.3 Annotations
1.3.1 Genomic properties
Information on chromosome sizes, cytobands and centromere locations were obtained from the UCSC genome browser.
The following annotation data for the rn6 genome is required:
Data Type | Download link |
---|---|
ref. genome | http://hgdownload.soe.ucsc.edu/goldenPath/rn6/bigZips/rn6.fa.gz |
refSeq annot | http://hgdownload.soe.ucsc.edu/goldenPath/rn6/bigZips/genes/rn6.refGene.gtf.gz |
gff3 file | for TRUST4 (ftp://ftp.ensembl.org/pub/release-100/gff3/rattus_norvegicus/Rattus_norvegicus.Rnor_6.0.100.gff3.gz) |
rn6cytoBand | ucsc server of all cytoband locations |
rn6chrInfo | ucsc server of all chromosome sizes |
biomart | conversion between gene symbol, ensbl and entrez was faciliated using biomart package |
Below is the summary of chromosome sizes and centromere locations:
chrInfo=read.delim("../anntotations/chromInfo_rat.txt", header=T)
cytoInfo=read.delim("../anntotations/cytoBand_Rat2.txt", header=F, stringsAsFactors = F)
colnames(cytoInfo)=c("chrom", "chromStart", "chromEnd", "name", "gieStain")
GRcytoInfo=GRanges(seqnames=cytoInfo$chrom, ranges=IRanges(start = cytoInfo$chromStart, end=cytoInfo$chromEnd), cytoband=cytoInfo$name)
head(GRcytoInfo)
## GRanges object with 6 ranges and 1 metadata column:
## seqnames ranges strand | cytoband
## <Rle> <IRanges> <Rle> | <character>
## [1] chr1 0-10704345 * | p13
## [2] chr1 10704345-25618263 * | p12
## [3] chr1 25618263-40652454 * | p11
## [4] chr1 40652454-51356799 * | q11
## [5] chr1 51356799-73607402 * | q12
## [6] chr1 73607402-95016091 * | q21
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
Below is an example of the gene annotation files
txRn6=makeTxDbFromGFF("../anntotations/rn6_refGene.gtf", format="gtf")
txRn6=genes(txRn6)
txRn6=sort(txRn6)
txRn6$gene_width=width(txRn6)
save(txRn6, file="../anntotations/txRn6_refGene.RData")
load("../anntotations/txRn6_refGene.RData")
head(txRn6)
## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | gene_id gene_width
## <Rle> <IRanges> <Rle> | <character> <integer>
## Vom2r3 chr1 396700-409676 + | Vom2r3 12977
## Lrp11 chr1 1702696-1731210 + | Lrp11 28515
## Nup43 chr1 1771721-1781554 + | Nup43 9834
## Lats1 chr1 1784078-1817310 + | Lats1 33233
## Katna1 chr1 1826170-1867786 + | Katna1 41617
## Ppil4 chr1 1897350-1930311 + | Ppil4 32962
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
1.3.2 Gene name homologs between organisms
Biomart was used to convert between rat, mouse and human gene symbols and ensembl ids. Below is an example of the human gene names mapped to the rat homolog
## save here:
## human to rat mapping of genes
## rat ENSEMBL vs symbol conversion
library(biomaRt)
#library(refGenome)
TS=read.delim("~/Documents/metacore-hack/hg38_allsymbols.txt")
TS=as.character(TS[ ,1])
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
rat = useMart("ensembl", dataset = "rnorvegicus_gene_ensembl")
SymHum2Rat = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = TS , mart = human, attributesL = c("rgd_symbol", "ensembl_gene_id"), martL = rat, uniqueRows=T)
Rat2Mouse= getLDS(attributes = c("rgd_symbol"), filters = "rgd_symbol", values = rownames(allrsemFinal) , mart = rat, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
Mouse2Hum = getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", values = rownames(allrsemFinal) , mart = mouse, attributesL = c("hgnc_symbol"), martL = human, uniqueRows=T)
save(SymHum2Rat, Rat2Hum,Mouse2Hum,Rat2Mouse, file="../anntotations/Rat_biomart_gene_annotations.RData")
1.3.3 Gene signatures and data-bases
Gene sets/signatures were obtained from the following sources:
Source | Description |
---|---|
IEDB | database of immune epitopes |
MsigDB | c2, c5, hallmark set of curated pathway gene sets |
Metacore | Process Networks and Pathway Maps data bases |
COSMIC | database of concensus oncogenes |
ImmPort | List of immune related genes |
InnateDB | List of genes associated with innate immune system |
Rosenthal 2019 | genes associated with MHC-I presentation |
Thorsson 2018 | Immune gene signatures curated from studies by Wolf, Calabro, Teschendorff, Beck, Chang |
Pardoll, Wykes | Immune checkpoint genes |
Gil del Alcazar 2017 | Supplementary table 5: list of activation, dysfunction gene signatures |
Bailey 2018 | List of 10 most comon tumor pathways |
Chang 2018 | Common mutation locations in cancer |
# These gene signatures are saved in the annotations folder and loaded below:
#########################
## cell specific markers
#######################
GeneList=read.csv("../anntotations/cell_type_markers.csv")
cn=colnames(GeneList)
GeneList=lapply(1:ncol(GeneList), function(x) setdiff(unique(GeneList[ ,x]), ""))
names(GeneList)=cn
## map all the names to rat names
GeneListRat=lapply(GeneList, function(x) SymHum2Rat$RGD.symbol[SymHum2Rat$HGNC.symbol%in%x])
######################
## cancer genes
######################
## cosmic cancer genes
AllCosmic=read.csv("../anntotations/Census_COSMIC_Feb2020.csv")
RatCosmic=SymHum2Rat$RGD.symbol[match(AllCosmic$Gene.Symbol, SymHum2Rat$HGNC.symbol)]
tx=AllCosmic$Gene.Symbol[which(is.na(RatCosmic)|RatCosmic=="")]
tx=tolower(tx)
tx=firstup(tx)
RatCosmic[which(is.na(RatCosmic)|RatCosmic=="")]=tx
RatBreastCosmic=RatCosmic[grep("breast", AllCosmic$Tumour.Types.Somatic.)]
## Bailey List of 10 most common tumor pathways
PathwayList=read.csv("../anntotations/cancer_pathways_annot_Bailey_cell2018_modified.csv")
cn=colnames(PathwayList)
PathwayListA=lapply(seq(1, length(cn), by=2), function(x) setdiff(unique(PathwayList[ ,x]), ""))
names(PathwayListA)=cn[seq(1, length(cn), by=2)]
PathwaySign=lapply(seq(2, length(cn), by=2), function(x) PathwayList[ which(PathwayList[ ,x]!=""),x])
names(PathwaySign)=cn[seq(1, length(cn), by=2)]
PathwayListRata=lapply(PathwayListA, function(x) SymHum2Rat$RGD.symbol[match(x, SymHum2Rat$HGNC.symbol)])
PathwayListRatb=lapply(PathwayListA, function(x) Rat2Hum$RGD.symbol[match(x, Rat2Hum$HGNC.symbol)])
PathwayListRat=lapply(1:length(PathwayListRata), function(x) ifelse(is.na(PathwayListRata[[x]]),
PathwayListRatb[[x]], PathwayListRata[[x]]))
names(PathwayListRat)=names(PathwayListA)
AllCancerPathwayGenes=na.omit(unlist(PathwayListRat))
############################
## List of immunesignatures
############################
## read in all the files
Exp2=read.csv("../anntotations/Supplementary Table 5.csv")
Exp2List=lapply(1:ncol(Exp2), function(x) setdiff(unique(Exp2[ ,x]), ""))
names(Exp2List)=colnames(Exp2)
List2=read.csv("../anntotations/Thorsson_signatures.csv")
List2b=lapply(1:ncol(List2), function(x) setdiff(unique(List2[ ,x]), ""))
names(List2b)=colnames(List2)
Exp2List=c(Exp2List, List2b)
ImmSuppAPC=read.delim("../anntotations/immune_Suppression.csv", header=T, stringsAsFactors = F, sep=",")
ImmSuppAPC=lapply(1:ncol(ImmSuppAPC), function(x) setdiff(unique(ImmSuppAPC[ ,x]), ""))
names(ImmSuppAPC)=c("Inh", "Act", "Both")
Exp2List=c(Exp2List, ImmSuppAPC)
MHCPres=read.delim("../anntotations/MHCloss.csv", header=F, stringsAsFactors = F, sep=",")
MHCPres=as.character(MHCPres[ ,1])
MHCPres2Rat=unique(SymHum2Rat$RGD.symbol[SymHum2Rat$HGNC.symbol%in%MHCPres])
## change the names to Rat-specific
Exp2RatImm=lapply(Exp2List, function(x) unique(SymHum2Rat$RGD.symbol[SymHum2Rat$HGNC.symbol%in%x]))
## All immune genes
AllImmGenes1=read.csv("../anntotations/ImmPort_Set.csv")
AllImmGenes2=read.csv("../anntotations/innatedb_curated_genes.csv")
AllImmGenes=unique(c(as.character(AllImmGenes1$Symbol), as.character(AllImmGenes2$Gene.Symbol[which(AllImmGenes2$Species==9606)])))
RatAllImm=na.omit(unique(c(SymHum2Rat$RGD.symbol[match(AllImmGenes, SymHum2Rat$HGNC.symbol)], as.character(AllImmGenes2$Gene.Symbol[which(AllImmGenes2$Species!=9606)]))))
## CIBERSORT specific rat genes
#lm22rat=read.delim("../anntotations/LM22_to_rnorvegicus_1.txt", sep="\t")
# Oncoprint
Oncoprint=read.csv("../anntotations/Onco_Mamma.csv")
Oncoprint=lapply(1:ncol(Oncoprint), function(x) setdiff(unique(Oncoprint[ ,x]), ""))
names(Oncoprint)=c("Oncotype", "Mammaprint")
1.3.3.1 Human gene homologs
Below, lists of common mutations in cancer are loaded and the “homolog” in rat is determined using an in-house script. The steps involved are:
- determine the amino acid context in human (find 5 a.a. prior and after)
- find the region with most amino acid homology in rat (2 or less differences)
- check whether the amino acid of interest is present in rat
An example of the output is shown
######################
## Annotation for Mutation Locations
######################
BaileyList=read.csv("../anntotations/list_mutations_bailey.csv", stringsAsFactors = F)
BList=FindRatAAHomolog(BaileyList$Gene, substr(BaileyList$Mutation, 3, 3), substr(BaileyList$Mutation, 4, nchar(as.character(BaileyList$Mutation))-1), substr(BaileyList$Mutation, nchar(as.character(BaileyList$Mutation)), nchar(as.character(BaileyList$Mutation))))
ChangList=read.delim("../anntotations/hotspots_chang_et_al_2017_cancer_discovery.txt", sep="\t", stringsAsFactors = F)
ChangList=ChangList[which(ChangList$Type=="single residue"), ]
ChangList$AA1=substr(ChangList$Residue, 1, 1)
ChangList$Loc=substr(ChangList$Residue,2, nchar(ChangList$Residue))
AA2list=sapply(ChangList$Variants, function(x) strsplit(x, ":[0-9]+[\\|]*"))
AAun=unlist(AA2list)
C2List=tibble(ChangList[ ,c("Gene", "AA1", "Loc")])
C2List=C2List %>% slice(rep(1:n(), times=sapply(AA2list, length)))
C2List$AA2=AAun
C2List=C2List[-which(C2List$AA2=="sp"| C2List$AA1=="*"), ]
#C2ListB=C2List[-which(is.na(C2List$Loc)), ]
ChangList2=FindRatAAHomolog(C2List$Gene, C2List$AA1, C2List$Loc, C2List$AA2)
head(ChangList2)
## Gene AAno AA1 Variant RatGene Sequence HumProt RatProt
## 1 NRAS 61 Q R Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 2 NRAS 61 Q K Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 3 NRAS 61 Q L Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 4 NRAS 61 Q H Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 5 NRAS 61 Q P Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 6 NRAS 61 Q * Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## RatAAno RatSequence
## 1 61 LDTAGQEEYSA
## 2 61 LDTAGQEEYSA
## 3 61 LDTAGQEEYSA
## 4 61 LDTAGQEEYSA
## 5 61 LDTAGQEEYSA
## 6 61 LDTAGQEEYSA
1.3.3.2 GSEA compendiums
For pathway analysis, the c2 (pathway), Hallmark and c5 (Gene Ontology). In addition, metacore pathways (pathway maps and process networks) were obtained and loaded below. This gives a list of 7 different data-sets to interrogate.
Below is example code to load the hallmark and c2 compendiums.
PathInc2=getGmt(con="../anntotations/c2.cp.v7.0.symbols.gmt", geneIdType=SymbolIdentifier(),
collectionType=BroadCollection(category="c2"))
c2entrez=mapIdentifiers(PathInc2, EntrezIdentifier('org.Hs.eg.db'))
c2ListHs=geneIds(c2entrez)
PathInH=getGmt(con="../anntotations/h.all.v7.1.symbols.gmt", geneIdType=SymbolIdentifier(),
collectionType=BroadCollection(category="h"))
cHentrez=mapIdentifiers(PathInH, EntrezIdentifier('org.Hs.eg.db'))
cHListHs=geneIds(cHentrez)
#####################################################
# also load in the process networks and pathway maps
#################
load("../anntotations/ListofGeneSets2.RData")