1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
|
#' Apply a variance stabilizing transformation (VST) to the count data
#'
#' This function calculates a variance stabilizing transformation (VST) from the
#' fitted dispersion-mean relation(s) and then transforms the count data (normalized
#' by division by the size factors or normalization factors), yielding a matrix
#' of values which are now approximately homoskedastic (having constant variance along the range
#' of mean values). The transformation also normalizes with respect to library size.
#' The \code{\link{rlog}} is less sensitive
#' to size factors, which can be an issue when size factors vary widely.
#' These transformations are useful when checking for outliers or as input for
#' machine learning techniques such as clustering or linear discriminant analysis.
#'
#' @aliases varianceStabilizingTransformation getVarianceStabilizedData
#'
#' @param object a DESeqDataSet or matrix of counts
#' @param blind logical, whether to blind the transformation to the experimental
#' design. blind=TRUE should be used for comparing samples in a manner unbiased by
#' prior information on samples, for example to perform sample QA (quality assurance).
#' blind=FALSE should be used for transforming data for downstream analysis,
#' where the full use of the design information should be made.
#' blind=FALSE will skip re-estimation of the dispersion trend, if this has already been calculated.
#' If many of genes have large differences in counts due to
#' the experimental design, it is important to set blind=FALSE for downstream
#' analysis.
#' @param fitType in case dispersions have not yet been estimated for \code{object},
#' this parameter is passed on to \code{\link{estimateDispersions}} (options described there).
#'
#' @details For each sample (i.e., column of \code{counts(dds)}), the full variance function
#' is calculated from the raw variance (by scaling according to the size factor and adding
#' the shot noise). We recommend a blind estimation of the variance function, i.e.,
#' one ignoring conditions. This is performed by default, and can be modified using the
#' 'blind' argument.
#'
#' Note that neither rlog transformation nor the VST are used by the
#' differential expression estimation in \code{\link{DESeq}}, which always
#' occurs on the raw count data, through generalized linear modeling which
#' incorporates knowledge of the variance-mean dependence. The rlog transformation
#' and VST are offered as separate functionality which can be used for visualization,
#' clustering or other machine learning tasks. See the transformation section of the
#' vignette for more details.
#'
#' The transformation does not require that one has already estimated size factors
#' and dispersions.
#'
#' A typical workflow is shown in Section \emph{Variance stabilizing transformation}
#' in the package vignette.
#'
#' If \code{\link{estimateDispersions}} was called with:
#'
#' \code{fitType="parametric"},
#' a closed-form expression for the variance stabilizing
#' transformation is used on the normalized
#' count data. The expression can be found in the file \file{vst.pdf}
#' which is distributed with the vignette.
#'
#' \code{fitType="local"},
#' the reciprocal of the square root of the variance of the normalized counts, as derived
#' from the dispersion fit, is then numerically
#' integrated, and the integral (approximated by a spline function) is evaluated for each
#' count value in the column, yielding a transformed value.
#'
#' \code{fitType="mean"}, a VST is applied for Negative Binomial distributed counts, 'k',
#' with a fixed dispersion, 'a': ( 2 asinh(sqrt(a k)) - log(a) - log(4) )/log(2).
#'
#' In all cases, the transformation is scaled such that for large
#' counts, it becomes asymptotically (for large values) equal to the
#' logarithm to base 2 of normalized counts.
#'
#' The variance stabilizing transformation from a previous dataset
#' can be "frozen" and reapplied to new samples.
#' The frozen VST is accomplished by saving the dispersion function
#' accessible with \code{\link{dispersionFunction}}, assigning this
#' to the \code{DESeqDataSet} with the new samples, and running
#' varianceStabilizingTransformation with 'blind' set to FALSE.
#' Then the dispersion function from the previous dataset will be used
#' to transform the new sample(s).
#'
#' Limitations: In order to preserve normalization, the same
#' transformation has to be used for all samples. This results in the
#' variance stabilizition to be only approximate. The more the size
#' factors differ, the more residual dependence of the variance on the
#' mean will be found in the transformed data. \code{\link{rlog}} is a
#' transformation which can perform better in these cases.
#' As shown in the vignette, the function \code{meanSdPlot}
#' from the package \pkg{vsn} can be used to see whether this is a problem.
#'
#' @return \code{varianceStabilizingTransformation} returns a
#' \code{\link{DESeqTransform}} if a \code{DESeqDataSet} was provided,
#' or returns a a matrix if a count matrix was provided.
#' Note that for \code{\link{DESeqTransform}} output, the matrix of
#' transformed values is stored in \code{assay(vsd)}.
#' \code{getVarianceStabilizedData} also returns a matrix.
#'
#' @references
#'
#' Reference for the variance stabilizing transformation for counts with a dispersion trend:
#'
#' Simon Anders, Wolfgang Huber: Differential expression analysis for sequence count data. Genome Biology 2010, 11:106. \url{http://dx.doi.org/10.1186/gb-2010-11-10-r106}
#'
#' @author Simon Anders
#'
#' @seealso \code{\link{plotPCA}}, \code{\link{rlog}}, \code{\link{normTransform}}
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet(m=6)
#' vsd <- varianceStabilizingTransformation(dds)
#' dists <- dist(t(assay(vsd)))
#' # plot(hclust(dists))
#'
#' @export
varianceStabilizingTransformation <- function (object, blind=TRUE, fitType="parametric") {
if (is.null(colnames(object))) {
colnames(object) <- seq_len(ncol(object))
}
if (is.matrix(object)) {
matrixIn <- TRUE
object <- DESeqDataSetFromMatrix(object, DataFrame(row.names=colnames(object)), ~1)
} else {
matrixIn <- FALSE
}
if (is.null(sizeFactors(object)) & is.null(normalizationFactors(object))) {
object <- estimateSizeFactors(object)
}
if (blind) {
design(object) <- ~ 1
}
if (blind | is.null(attr(dispersionFunction(object),"fitType"))) {
object <- estimateDispersionsGeneEst(object, quiet=TRUE)
object <- estimateDispersionsFit(object, quiet=TRUE, fitType)
}
vsd <- getVarianceStabilizedData(object)
if (matrixIn) {
return(vsd)
}
se <- SummarizedExperiment(
assays = vsd,
colData = colData(object),
rowRanges = rowRanges(object),
metadata = metadata(object))
DESeqTransform(se)
}
#' @rdname varianceStabilizingTransformation
#' @export
getVarianceStabilizedData <- function(object) {
if (is.null(attr(dispersionFunction(object),"fitType"))) {
stop("call estimateDispersions before calling getVarianceStabilizedData")
}
ncounts <- counts(object, normalized=TRUE)
if( attr( dispersionFunction(object), "fitType" ) == "parametric" ) {
coefs <- attr( dispersionFunction(object), "coefficients" )
vst.fn <- function( q ) {
log( (1 + coefs["extraPois"] + 2 * coefs["asymptDisp"] * q + 2 * sqrt( coefs["asymptDisp"] * q * ( 1 + coefs["extraPois"] + coefs["asymptDisp"] * q ) ) ) / ( 4 * coefs["asymptDisp"] ) ) / log(2)
}
return(vst.fn(ncounts))
} else if ( attr( dispersionFunction(object), "fitType" ) == "local" ) {
# non-parametric fit -> numerical integration
if (is.null(sizeFactors(object))) {
stopifnot(!is.null(normalizationFactors(object)))
# approximate size factors from columns of NF
sf <- exp(colMeans(log(normalizationFactors(object))))
} else {
sf <- sizeFactors(object)
}
xg <- sinh( seq( asinh(0), asinh(max(ncounts)), length.out=1000 ) )[-1]
xim <- mean( 1/sf )
baseVarsAtGrid <- dispersionFunction(object)( xg ) * xg^2 + xim * xg
integrand <- 1 / sqrt( baseVarsAtGrid )
splf <- splinefun(
asinh( ( xg[-1] + xg[-length(xg)] )/2 ),
cumsum(
( xg[-1] - xg[-length(xg)] ) *
( integrand[-1] + integrand[-length(integrand)] )/2 ) )
h1 <- quantile( rowMeans(ncounts), .95 )
h2 <- quantile( rowMeans(ncounts), .999 )
eta <- ( log2(h2) - log2(h1) ) / ( splf(asinh(h2)) - splf(asinh(h1)) )
xi <- log2(h1) - eta * splf(asinh(h1))
tc <- sapply( colnames(counts(object)), function(clm) {
eta * splf( asinh( ncounts[,clm] ) ) + xi
})
rownames( tc ) <- rownames( counts(object) )
return(tc)
} else if ( attr( dispersionFunction(object), "fitType" ) == "mean" ) {
alpha <- attr( dispersionFunction(object), "mean" )
# the following stablizes NB counts with fixed dispersion alpha
# and converges to log2(q) as q => infinity
vst.fn <- function(q) ( 2 * asinh(sqrt(alpha * q)) - log(alpha) - log(4) ) / log(2)
return(vst.fn(ncounts))
} else {
stop( "fitType is not parametric, local or mean" )
}
}
#' Quickly estimate dispersion trend and apply a variance stabilizing transformation
#'
#' This is a wrapper for the \code{\link{varianceStabilizingTransformation}} (VST)
#' that provides much faster estimation of the dispersion trend used to determine
#' the formula for the VST. The speed-up is accomplished by
#' subsetting to a smaller number of genes in order to estimate this dispersion trend.
#' The subset of genes is chosen deterministically, to span the range
#' of genes' mean normalized count.
#'
#' @param object a DESeqDataSet or a matrix of counts
#' @param blind logical, whether to blind the transformation to the experimental
#' design (see \code{\link{varianceStabilizingTransformation}})
#' @param nsub the number of genes to subset to (default 1000)
#' @param fitType for estimation of dispersions: this parameter
#' is passed on to \code{\link{estimateDispersions}} (options described there)
#'
#' @return a DESeqTranform object or a matrix of transformed, normalized counts
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet(n=2000, m=20)
#' vsd <- vst(dds)
#'
#' @export
vst <- function(object, blind=TRUE, nsub=1000, fitType="parametric") {
if (nrow(object) < nsub) {
stop("less than 'nsub' rows,
it is recommended to use varianceStabilizingTransformation directly")
}
if (is.null(colnames(object))) {
colnames(object) <- seq_len(ncol(object))
}
if (is.matrix(object)) {
matrixIn <- TRUE
object <- DESeqDataSetFromMatrix(object, DataFrame(row.names=colnames(object)), ~ 1)
} else {
if (blind) {
design(object) <- ~ 1
}
matrixIn <- FALSE
}
if (is.null(sizeFactors(object)) & is.null(normalizationFactors(object))) {
object <- estimateSizeFactors(object)
}
baseMean <- rowMeans(counts(object, normalized=TRUE))
if (sum(baseMean > 5) < nsub) {
stop("less than 'nsub' rows with mean normalized count > 5,
it is recommended to use varianceStabilizingTransformation directly")
}
# subset to a specified number of genes with mean normalized count > 5
object.sub <- object[baseMean > 5,]
baseMean <- baseMean[baseMean > 5]
o <- order(baseMean)
idx <- o[round(seq(from=1, to=length(o), length=nsub))]
object.sub <- object.sub[idx,]
# estimate dispersion trend
object.sub <- estimateDispersionsGeneEst(object.sub, quiet=TRUE)
object.sub <- estimateDispersionsFit(object.sub, fitType=fitType, quiet=TRUE)
# assign to the full object
suppressMessages({dispersionFunction(object) <- dispersionFunction(object.sub)})
# calculate and apply the VST (note blinding is accomplished above,
# here blind=FALSE is used to avoid re-calculating dispersion)
vsd <- varianceStabilizingTransformation(object, blind=FALSE)
if (matrixIn) {
return(assay(vsd))
} else {
return(vsd)
}
}
|