## ----style, eval=TRUE, echo=FALSE, results='asis'------------------------
options(max.print=1000)
library(knitr)
opts_chunk$set(cache=TRUE, tidy=FALSE, fig.path='figure/exercise-', fig.align='center', fig.show='hold', fig.lp='',fig.width=5,fig.height=4, dev=c('pdf','png'))
options(replace.assign=TRUE,width=60)


## ----makeTxDbFromBiomart,eval=FALSE,message=FALSE------------------------
## ## Requires internet access and takes several minutes.
## library(GenomicFeatures)
## txdb <- makeTxDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
##                             dataset="hsapiens_gene_ensembl",host="feb2014.archive.ensembl.org")
## ## Type ?makeTxDbFromBiomart to get more information on this function
## ex_by_gene <- exonsBy(txdb, by="gene")  # GRangesList object
## save(ex_by_gene,file="ex_by_gene.RData")


## ----exonsBy,warning=FALSE,message=FALSE---------------------------------
library(GenomicFeatures)
rooturl <- "http://bioinformatics.amc.nl/wp-content/uploads/"
download.file(paste0(rooturl,
               "gs-sequence-analysis/RNASequencing/ex_by_gene.zip"),
              destfile="ex_by_gene.zip")
unzip("ex_by_gene.zip")
load("ex_by_gene.RData")
ex_by_gene


## ----esults='hide',warning=FALSE,message=FALSE---------------------------
library(parathyroidSE)

## ------------------------------------------------------------------------
data(exonsByGene)
exonsByGene


## ------------------------------------------------------------------------
exonsByGene[[2]]


## ------------------------------------------------------------------------
length(exonsByGene)


## ------------------------------------------------------------------------
ex_by_gene[[4]]
exonsByGene[[4]]


## ----bampaths,warning=FALSE,message=FALSE--------------------------------
bamdir <- system.file("extdata", package="parathyroidSE")
bamfiles <- list.files(bamdir, pattern="bam$", full.names=TRUE)
bamfiles

## Then create a BamFileList that contains a pointer to all three BAM files
library(Rsamtools)
bamfile_list <- BamFileList(bamfiles, index=character())
## We need to use index=character() here because there are no
## BAM index files (.bam.bai extension) associated with our BAM files.


## ------------------------------------------------------------------------
library(GenomicAlignments)
read_count0 <- summarizeOverlaps(exonsByGene, bamfile_list,
                                 ignore.strand=TRUE,
                                 singleEnd=FALSE)
read_count0


## ------------------------------------------------------------------------
head(assay(read_count0))  # head restricts the output to the first 6 rows
dim(assay(read_count0))


## ------------------------------------------------------------------------
param <- ScanBamParam(flag=scanBamFlag(isDuplicate=FALSE,isSecondaryAlignment=FALSE))
read_count1 <- summarizeOverlaps(exonsByGene, bamfile_list,
                                ignore.strand=TRUE, singleEnd=FALSE, param=param)
read_count1


## ------------------------------------------------------------------------
## Are the count tables without (read_count0) and with (read_count1) filtering 
## identical?
identical(assay(read_count0),assay(read_count1))


## ----eval=FALSE----------------------------------------------------------
## data(package="parathyroidSE")


## ------------------------------------------------------------------------
library(parathyroidSE)
data(parathyroidGenesSE)
parathyroidGenesSE


## ------------------------------------------------------------------------
dim(assay(parathyroidGenesSE))


## ------------------------------------------------------------------------
assay(parathyroidGenesSE)[1:6, 1:6]


## ------------------------------------------------------------------------
colnames(parathyroidGenesSE)


## ------------------------------------------------------------------------
colSums(assay(parathyroidGenesSE))


## ------------------------------------------------------------------------
sum(rowSums(assay(parathyroidGenesSE)) != 0)


## ------------------------------------------------------------------------
colData(parathyroidGenesSE)


## ------------------------------------------------------------------------
table(colData(parathyroidGenesSE)$treatment)


## ------------------------------------------------------------------------
colData(parathyroidGenesSE)$sample


## ----collapseReplicates,warning=FALSE------------------------------------
library(DESeq2)
parathyroidGenesSE_new <- 
  collapseReplicates(parathyroidGenesSE,groupby=colData(parathyroidGenesSE)$sample)
# Only keep the column data columns that we actually need for our analysis below
colData(parathyroidGenesSE_new) <- 
  colData(parathyroidGenesSE_new)[ , c( "patient", "treatment", "time" ) ]

parathyroidGenesSE_new


## ----testCountSum,echo=FALSE---------------------------------------------
stopifnot(sum(assay(parathyroidGenesSE)) == sum(assay(parathyroidGenesSE_new)))
save(parathyroidGenesSE_new,file="parathyroidGenesSE_new.RData")


## ------------------------------------------------------------------------
rooturl <- "http://bioinformatics.amc.nl/wp-content/uploads/"
download.file(paste0(rooturl,"gs-sequence-analysis//RNASequencing/parathyroidGenesSE_new.zip"), 
                     destfile="parathyroidGenesSE_new.zip")
unzip("parathyroidGenesSE_new.zip")
load("parathyroidGenesSE_new.RData")


## ----sumExpInput---------------------------------------------------------
library(DESeq2)
ddsFull <- DESeqDataSet(se = parathyroidGenesSE_new, design = ~ patient + treatment)
ddsFull


## ----subset--------------------------------------------------------------
dds <- ddsFull[ , colData(ddsFull)$treatment %in% c("Control","DPN") & 
                  colData(ddsFull)$time == "48h" ]


## ------------------------------------------------------------------------
dds$treatment
dds$treatment <- factor(dds$treatment)
dds$treatment
## Do we have the right samples?
colData(dds)


## ----runDESeq------------------------------------------------------------
dds <- DESeq(dds)


## ----extractResults------------------------------------------------------
res <- results(dds)
res


## ----resCols-------------------------------------------------------------
mcols(res)


## ------------------------------------------------------------------------
colData(dds)$sizeFactor


## ------------------------------------------------------------------------
colSums(assay(dds))/mean(colSums(assay(dds)))


## ----checkBaseMean-------------------------------------------------------
range(res$baseMean - rowMeans(counts(dds)))
range(res$baseMean - rowMeans(counts(dds,normalized=TRUE)))


## ----rawpvalue-----------------------------------------------------------
sum( res$pvalue < 0.05, na.rm=TRUE )
table( is.na(res$pvalue) )


## ----adjpvalue-----------------------------------------------------------
sum( res$padj < 0.1, na.rm=TRUE )


## ----strongGenes---------------------------------------------------------
resSig <- res[ which(res$padj < 0.1 ), ]
head( resSig[ order( resSig$log2FoldChange ), ] )


## ----strongGenesUp-------------------------------------------------------
tail( resSig[ order( resSig$log2FoldChange ), ] )


## ----proportionSigGenes--------------------------------------------------
table(sign(resSig$log2FoldChange))


## ----plotCounts,fig.cap="Normalized counts for the gene with the smallest p-value in the comparison of DPN versus control treatment."----
plotCounts(dds, gene=which.min(res$pvalue), intgroup="treatment")


## ----plotMA1,fig.cap="The MA-plot shows the $\\log_2$ fold changes from the treatment over the mean of normalized counts, i.e. the average of counts normalized by size factor. Points are colored red if the adjusted p-value is less than 0.1. Points which fall out of the window are plotted as open triangles pointing either up or down."----
plotMA(dds, ylim = c(-0.75, 0.75) )


## ----LFCshrinkage--------------------------------------------------------
resLFC <- lfcShrink(dds, coef="treatment_DPN_vs_Control")


## ----plotMA2,fig.cap="The MA-plot shows the shrunken $\\log_2$ fold changes from the treatment over the mean of normalized counts, i.e. the average of counts normalized by size factor. Points are colored red if the adjusted p-value is less than 0.1. Points which fall out of the window are plotted as open triangles pointing either up or down. Shrinkage incorporates a prior on $\\log_2$ fold changes, resulting in moderated estimates from genes with low counts and highly variable counts, as can be seen by the narrowing of the vertical spread of points on the left side of the plot."----
plotMA(resLFC, ylim = c(-0.75, 0.75) )


## ----dispPlot,fig.cap='Plot of dispersion estimates. See text for details.'----
plotDispEsts( dds )


## ----pvalHist,fig.cap="Histogram of the p-values returned by the test for differential expression."----
hist( res$pvalue, breaks=100 )


## ----rld, cache=TRUE-----------------------------------------------------
rld <- rlogTransformation(dds)
head( assay(rld) )


## ----samplePCA,fig.cap="The PCA plot shows that the difference between patients is much greater than the difference between treatments"----
print( plotPCA( rld, intgroup = c( "patient", "treatment") ) )


## ------------------------------------------------------------------------
ddsFull <- DESeqDataSet(se = parathyroidGenesSE_new, design = ~ treatment)
dds <- ddsFull[ , colData(ddsFull)$treatment %in% c("Control","DPN") & 
                   colData(ddsFull)$time == "48h" ]
dds$treatment <- factor(dds$treatment)
dds <- DESeq(dds)
res <- results(dds)
sum( res$padj < 0.1, na.rm=TRUE )


## ----sessInfo, echo=FALSE------------------------------------------------
sessionInfo()


## ----removeAnswers, echo=FALSE-------------------------------------------
removeAnswers = function(con="exercisesRNAseq.tex"){
  txt = readLines(con=con)
  i1 = grep("\\begin{answer}", txt, fixed=TRUE)
  i2 = grep("\\end{answer}",  txt, fixed=TRUE)
  if(!((length(i1)==length(i2))&&all(i2>i1))) stop("\\begin{answer}/\\end{answer} macros are unbalanced.")
  i = unlist(mapply(`:`, i1, i2))
  res = txt[-i]
  writeLines(res, con=con)
}