3 min read

Exploring Homoscedasticity in Gene Expression

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
dds <- makeExampleDESeqDataSet(n=50000,m=2)
reads = counts(dds)
plot(log2(reads[,1]),log2(reads[,2]))

reads_mean = apply(reads,1,mean)
reads_sd = apply(reads,1, sd)
plot(reads_mean, reads_sd)

Now lets normalize library size and esimate dispersion:

dds = estimateSizeFactors(dds)
estimateDispersions(dds)
## Warning in checkForExperimentalReplicates(object, modelMatrix): same number of samples and coefficients to fit,
##   estimating dispersion by treating samples as replicates.
##   please read the ?DESeq section on 'Experiments without replicates'.
##   in summary: this analysis only potentially useful for data exploration,
##   accurate differential expression analysis requires replication
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## class: DESeqDataSet 
## dim: 50000 2 
## metadata(1): version
## assays(2): counts mu
## rownames(50000): gene1 gene2 ... gene49999 gene50000
## rowData names(12): trueIntercept trueBeta ... dispOutlier dispMAP
## colnames(2): sample1 sample2
## colData names(2): condition sizeFactor
ntd = assay(normTransform(dds)) # regular log2 with a psudocount
rld = assay(rlog(dds)) # doing the same thing as log2, but not allowing low counts to explode 
library("vsn")
meanSdPlot(ntd, ranks=F)

meanSdPlot(rld, ranks=F)

So here we can see that we have shrunk the sd at low levels

plot(rld[,1],rld[,2])

now lets do it ourselves:

ntd_mean = apply(ntd,1,mean)
ntd_sd = apply(ntd,1,sd)
plot(ntd_mean,ntd_sd)

library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.8-23. For overview type 'help("mgcv-package")'.
model <- smooth.spline(ntd_mean, ntd_sd) # build the model
fit <- predict( model , se = TRUE )$fit # estimated values
se <- predict( model , se = TRUE)$se.fit # standard error

# Confidence interval:
lcl <- fit - 1.96 * se
ucl <- fit + 1.96 * se


fit <- smooth.spline(ntd_mean, ntd_sd)      # smooth.spline fit
res <- (fit$yin - fit$y)/(1-fit$lev)      # jackknife residuals
sigma <- sqrt(var(res))                     # estimate sd

upper <- fit$y + 2.0*sigma*sqrt(fit$lev)   # upper 95% conf. band
lower <- fit$y - 2.0*sigma*sqrt(fit$lev) 

data = data.frame(estimated_mean = fit$x,
                  estimated_sd = fit$y,
                  lower_bound = lower,
                  upper_bound = upper)

library(ggplot2)

ggplot(data) +
       geom_line(aes(x = estimated_mean,
                     y = estimated_sd)) + 
       geom_ribbon(aes(x = estimated_mean,
                       ymax = upper_bound,
                       ymin = lower_bound), alpha=0.5)