Chapter 1 Prerequisites

1.1 Packages and Software

The following packages are required to conduct the analyses described below.

In house scripts are deposited in the rscript folder.

library(AnnotationHub)
library(beeswarm)
library(biomaRt)
library(Biostrings)
library(colorspace)
library(DESeq2)
library(dplyr)
library(DT)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(forestplot)
library(GenVisR)
library(GenomicFeatures)
library(ggplot2)
library(ggrepel)
library(gplots)
library(GSEABase)
library(GSVA)
library(heatmap.plus)
library(HTSanalyzeR2)
library(kableExtra)
library(knitr)
library(limma)
library(matrixStats)
library(pamr)
library(reshape2)
library(rmarkdown)
library(RColorBrewer)
library(scales)
library(spatstat)
library(survival)
library(tcR)
library(vcfR)
library(xlsx)
library(writexl)

DiffCols=hue_pal()(8)
palette(brewer.pal(9, "Set1"))
RdBu=brewer.pal(11, "RdBu")
SetCols=brewer.pal(12, "Set3")


source("../rscript/cnFreq_fn.R") #modified version of GenVisR
source("../rscript/merge_contig.R")
source("../rscript/gseaCode.R")
source("../rscript/ContingencyTable.R")
source("../rscript/PvalueHeatMap.R")
source("../rscript/BootstrapShannonIdx.R")
source("../rscript/CreateRnor87db.R")
source("../rscript/FindRatAAHomolog.R")
source("../rscript/FindTriNucleotideContext.R")

firstup <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}

ColMerge=matrix(c("#FFC82F", "#FFEDBC", "#73FDFE","#D2FFFF", "#FF41FF", "#FECAFF", "#5D5D5D", "#BEBEBE"), ncol=2, byrow = T)
rownames(ColMerge)=c("LY","PDL1", "PDL1+LY","Vehicle")
ColSize=c("#008E00", "#FF9300")
names(ColSize)=c("small", "big")
ColSizeb=ColSize[2:1]
ColSizec=c(ColSize[2], "purple", ColSize[1])
hclust.ave <- function(x) hclust(x, method="average")

Hsedb<-EnsDb.Hsapiens.v86

1.2 External software

The following external software was utilised:

Software Function
bwa Alignment of WGS data to reference
GATK4 Mutation calling, done by NYGC. Mutation calling from RNA (Haplotype caller)
strelka Mutation calling, done by NYGC
BICseq CNV calling
GEM3 create mappability files for CNV calling
STAR Alignment of RNAseq data
RSEM Calculate RSEM, TPM, FPKM from RNAseq data
TRUST4 assignment of T and B cell clonotypes from RNA-seq data
Oncotator Annotation of genetic variants
QuPath Tool for cell segmentation and extraction of features from IF images
samtools, bcftools querying and dispalying information from bam files, extracting allelic depth at specific genomic locations
CIBERSORT inferring immune composition from RNA
lumpy structural variants
PAM50 code from parker et al 2009 to infer PAM50 subtypes

1.3 Annotations

1.3.1 Genomic properties

Information on chromosome sizes, cytobands and centromere locations were obtained from the UCSC genome browser.

The following annotation data for the rn6 genome is required:

Data Type Download link
ref. genome http://hgdownload.soe.ucsc.edu/goldenPath/rn6/bigZips/rn6.fa.gz
refSeq annot http://hgdownload.soe.ucsc.edu/goldenPath/rn6/bigZips/genes/rn6.refGene.gtf.gz
gff3 file for TRUST4 (ftp://ftp.ensembl.org/pub/release-100/gff3/rattus_norvegicus/Rattus_norvegicus.Rnor_6.0.100.gff3.gz)
rn6cytoBand ucsc server of all cytoband locations
rn6chrInfo ucsc server of all chromosome sizes
biomart conversion between gene symbol, ensbl and entrez was faciliated using biomart package

Below is the summary of chromosome sizes and centromere locations:

chrInfo=read.delim("../anntotations/chromInfo_rat.txt", header=T)
cytoInfo=read.delim("../anntotations/cytoBand_Rat2.txt", header=F, stringsAsFactors = F)
colnames(cytoInfo)=c("chrom", "chromStart", "chromEnd", "name", "gieStain")
GRcytoInfo=GRanges(seqnames=cytoInfo$chrom, ranges=IRanges(start = cytoInfo$chromStart, end=cytoInfo$chromEnd), cytoband=cytoInfo$name)
head(GRcytoInfo)
## GRanges object with 6 ranges and 1 metadata column:
##       seqnames            ranges strand |    cytoband
##          <Rle>         <IRanges>  <Rle> | <character>
##   [1]     chr1        0-10704345      * |         p13
##   [2]     chr1 10704345-25618263      * |         p12
##   [3]     chr1 25618263-40652454      * |         p11
##   [4]     chr1 40652454-51356799      * |         q11
##   [5]     chr1 51356799-73607402      * |         q12
##   [6]     chr1 73607402-95016091      * |         q21
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

Below is an example of the gene annotation files

txRn6=makeTxDbFromGFF("../anntotations/rn6_refGene.gtf", format="gtf")
txRn6=genes(txRn6)
txRn6=sort(txRn6)
txRn6$gene_width=width(txRn6)
save(txRn6, file="../anntotations/txRn6_refGene.RData")
load("../anntotations/txRn6_refGene.RData")
head(txRn6)
## GRanges object with 6 ranges and 2 metadata columns:
##          seqnames          ranges strand |     gene_id gene_width
##             <Rle>       <IRanges>  <Rle> | <character>  <integer>
##   Vom2r3     chr1   396700-409676      + |      Vom2r3      12977
##    Lrp11     chr1 1702696-1731210      + |       Lrp11      28515
##    Nup43     chr1 1771721-1781554      + |       Nup43       9834
##    Lats1     chr1 1784078-1817310      + |       Lats1      33233
##   Katna1     chr1 1826170-1867786      + |      Katna1      41617
##    Ppil4     chr1 1897350-1930311      + |       Ppil4      32962
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

1.3.2 Gene name homologs between organisms

Biomart was used to convert between rat, mouse and human gene symbols and ensembl ids. Below is an example of the human gene names mapped to the rat homolog

## save here:
## human to rat mapping of genes
## rat ENSEMBL vs symbol conversion

library(biomaRt)
#library(refGenome)
TS=read.delim("~/Documents/metacore-hack/hg38_allsymbols.txt")
TS=as.character(TS[ ,1])
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
rat = useMart("ensembl", dataset = "rnorvegicus_gene_ensembl")

SymHum2Rat = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = TS , mart = human, attributesL = c("rgd_symbol", "ensembl_gene_id"), martL = rat, uniqueRows=T)

Rat2Mouse= getLDS(attributes = c("rgd_symbol"), filters = "rgd_symbol", values = rownames(allrsemFinal) , mart = rat, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)

Mouse2Hum = getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", values = rownames(allrsemFinal) , mart = mouse, attributesL = c("hgnc_symbol"), martL = human, uniqueRows=T)

save(SymHum2Rat, Rat2Hum,Mouse2Hum,Rat2Mouse,  file="../anntotations/Rat_biomart_gene_annotations.RData")
load("../anntotations/Rat_biomart_gene_annotations.RData")
head(SymHum2Rat)
##   HGNC.symbol RGD.symbol
## 1        TAB1       Tab1
## 2        PHF1       Phf1
## 3       RNF39      Rnf39
## 4      IGSF10     Igsf10
## 5     TMEM130    Tmem130
## 6       EFNB1      Efnb1

1.3.3 Gene signatures and data-bases

Gene sets/signatures were obtained from the following sources:

Source Description
IEDB database of immune epitopes
MsigDB c2, c5, hallmark set of curated pathway gene sets
Metacore Process Networks and Pathway Maps data bases
COSMIC database of concensus oncogenes
ImmPort List of immune related genes
InnateDB List of genes associated with innate immune system
Rosenthal 2019 genes associated with MHC-I presentation
Thorsson 2018 Immune gene signatures curated from studies by Wolf, Calabro, Teschendorff, Beck, Chang
Pardoll, Wykes Immune checkpoint genes
Gil del Alcazar 2017 Supplementary table 5: list of activation, dysfunction gene signatures
Bailey 2018 List of 10 most comon tumor pathways
Chang 2018 Common mutation locations in cancer
# These gene signatures are saved in the annotations folder and loaded below:
#########################
## cell specific markers
#######################
GeneList=read.csv("../anntotations/cell_type_markers.csv")
cn=colnames(GeneList)
GeneList=lapply(1:ncol(GeneList), function(x) setdiff(unique(GeneList[ ,x]), ""))
names(GeneList)=cn
## map all the names to rat names
GeneListRat=lapply(GeneList, function(x) SymHum2Rat$RGD.symbol[SymHum2Rat$HGNC.symbol%in%x])

######################
## cancer genes
######################

## cosmic cancer genes
AllCosmic=read.csv("../anntotations/Census_COSMIC_Feb2020.csv")
RatCosmic=SymHum2Rat$RGD.symbol[match(AllCosmic$Gene.Symbol, SymHum2Rat$HGNC.symbol)]
tx=AllCosmic$Gene.Symbol[which(is.na(RatCosmic)|RatCosmic=="")]
tx=tolower(tx)
tx=firstup(tx)
RatCosmic[which(is.na(RatCosmic)|RatCosmic=="")]=tx
RatBreastCosmic=RatCosmic[grep("breast", AllCosmic$Tumour.Types.Somatic.)]

## Bailey List of 10 most common tumor pathways
PathwayList=read.csv("../anntotations/cancer_pathways_annot_Bailey_cell2018_modified.csv")
cn=colnames(PathwayList)
PathwayListA=lapply(seq(1, length(cn), by=2), function(x) setdiff(unique(PathwayList[ ,x]), ""))
names(PathwayListA)=cn[seq(1, length(cn), by=2)]
PathwaySign=lapply(seq(2, length(cn), by=2), function(x) PathwayList[ which(PathwayList[ ,x]!=""),x])
names(PathwaySign)=cn[seq(1, length(cn), by=2)]

PathwayListRata=lapply(PathwayListA, function(x) SymHum2Rat$RGD.symbol[match(x, SymHum2Rat$HGNC.symbol)])
PathwayListRatb=lapply(PathwayListA, function(x) Rat2Hum$RGD.symbol[match(x, Rat2Hum$HGNC.symbol)])
PathwayListRat=lapply(1:length(PathwayListRata), function(x) ifelse(is.na(PathwayListRata[[x]]),
                        PathwayListRatb[[x]], PathwayListRata[[x]]))
names(PathwayListRat)=names(PathwayListA)

AllCancerPathwayGenes=na.omit(unlist(PathwayListRat))

############################
## List of immunesignatures
############################
## read in all the files
Exp2=read.csv("../anntotations/Supplementary Table 5.csv")
Exp2List=lapply(1:ncol(Exp2), function(x) setdiff(unique(Exp2[ ,x]), ""))
names(Exp2List)=colnames(Exp2)

List2=read.csv("../anntotations/Thorsson_signatures.csv")
List2b=lapply(1:ncol(List2), function(x) setdiff(unique(List2[ ,x]), ""))
names(List2b)=colnames(List2)
Exp2List=c(Exp2List, List2b)

ImmSuppAPC=read.delim("../anntotations/immune_Suppression.csv", header=T, stringsAsFactors = F, sep=",")
ImmSuppAPC=lapply(1:ncol(ImmSuppAPC), function(x) setdiff(unique(ImmSuppAPC[ ,x]), ""))
names(ImmSuppAPC)=c("Inh", "Act", "Both")
Exp2List=c(Exp2List, ImmSuppAPC)

MHCPres=read.delim("../anntotations/MHCloss.csv", header=F, stringsAsFactors = F, sep=",")
MHCPres=as.character(MHCPres[ ,1])
MHCPres2Rat=unique(SymHum2Rat$RGD.symbol[SymHum2Rat$HGNC.symbol%in%MHCPres])

## change the names to Rat-specific 
Exp2RatImm=lapply(Exp2List, function(x) unique(SymHum2Rat$RGD.symbol[SymHum2Rat$HGNC.symbol%in%x]))

## All immune genes
AllImmGenes1=read.csv("../anntotations/ImmPort_Set.csv")
AllImmGenes2=read.csv("../anntotations/innatedb_curated_genes.csv")
AllImmGenes=unique(c(as.character(AllImmGenes1$Symbol), as.character(AllImmGenes2$Gene.Symbol[which(AllImmGenes2$Species==9606)])))
RatAllImm=na.omit(unique(c(SymHum2Rat$RGD.symbol[match(AllImmGenes, SymHum2Rat$HGNC.symbol)], as.character(AllImmGenes2$Gene.Symbol[which(AllImmGenes2$Species!=9606)]))))

## CIBERSORT specific rat genes
#lm22rat=read.delim("../anntotations/LM22_to_rnorvegicus_1.txt", sep="\t")

# Oncoprint
Oncoprint=read.csv("../anntotations/Onco_Mamma.csv")
Oncoprint=lapply(1:ncol(Oncoprint), function(x) setdiff(unique(Oncoprint[ ,x]), ""))
names(Oncoprint)=c("Oncotype", "Mammaprint")

1.3.3.1 Human gene homologs

Below, lists of common mutations in cancer are loaded and the “homolog” in rat is determined using an in-house script. The steps involved are:

  • determine the amino acid context in human (find 5 a.a. prior and after)
  • find the region with most amino acid homology in rat (2 or less differences)
  • check whether the amino acid of interest is present in rat

An example of the output is shown

######################
## Annotation for Mutation Locations
######################
BaileyList=read.csv("../anntotations/list_mutations_bailey.csv", stringsAsFactors = F)
BList=FindRatAAHomolog(BaileyList$Gene, substr(BaileyList$Mutation, 3, 3),  substr(BaileyList$Mutation, 4, nchar(as.character(BaileyList$Mutation))-1), substr(BaileyList$Mutation, nchar(as.character(BaileyList$Mutation)), nchar(as.character(BaileyList$Mutation))))

ChangList=read.delim("../anntotations/hotspots_chang_et_al_2017_cancer_discovery.txt", sep="\t", stringsAsFactors = F)
ChangList=ChangList[which(ChangList$Type=="single residue"), ]
ChangList$AA1=substr(ChangList$Residue, 1, 1)
ChangList$Loc=substr(ChangList$Residue,2, nchar(ChangList$Residue))
AA2list=sapply(ChangList$Variants, function(x) strsplit(x, ":[0-9]+[\\|]*"))
AAun=unlist(AA2list)
C2List=tibble(ChangList[ ,c("Gene", "AA1", "Loc")])
C2List=C2List %>% slice(rep(1:n(), times=sapply(AA2list, length)))
C2List$AA2=AAun
C2List=C2List[-which(C2List$AA2=="sp"| C2List$AA1=="*"), ]
#C2ListB=C2List[-which(is.na(C2List$Loc)), ]

ChangList2=FindRatAAHomolog(C2List$Gene, C2List$AA1,  C2List$Loc, C2List$AA2)

head(ChangList2)
##   Gene AAno AA1 Variant RatGene    Sequence         HumProt            RatProt
## 1 NRAS   61   Q       R    Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 2 NRAS   61   Q       K    Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 3 NRAS   61   Q       L    Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 4 NRAS   61   Q       H    Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 5 NRAS   61   Q       P    Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
## 6 NRAS   61   Q       *    Nras LDTAGQEEYSA ENSP00000358548 ENSRNOP00000036381
##   RatAAno RatSequence
## 1      61 LDTAGQEEYSA
## 2      61 LDTAGQEEYSA
## 3      61 LDTAGQEEYSA
## 4      61 LDTAGQEEYSA
## 5      61 LDTAGQEEYSA
## 6      61 LDTAGQEEYSA

1.3.3.2 GSEA compendiums

For pathway analysis, the c2 (pathway), Hallmark and c5 (Gene Ontology). In addition, metacore pathways (pathway maps and process networks) were obtained and loaded below. This gives a list of 7 different data-sets to interrogate.

Below is example code to load the hallmark and c2 compendiums.

PathInc2=getGmt(con="../anntotations/c2.cp.v7.0.symbols.gmt", geneIdType=SymbolIdentifier(),
                collectionType=BroadCollection(category="c2"))

c2entrez=mapIdentifiers(PathInc2, EntrezIdentifier('org.Hs.eg.db'))
c2ListHs=geneIds(c2entrez)

PathInH=getGmt(con="../anntotations/h.all.v7.1.symbols.gmt", geneIdType=SymbolIdentifier(),
                collectionType=BroadCollection(category="h"))

cHentrez=mapIdentifiers(PathInH, EntrezIdentifier('org.Hs.eg.db'))
cHListHs=geneIds(cHentrez)

#####################################################
# also load in the process networks and pathway maps 
#################

load("../anntotations/ListofGeneSets2.RData")