Bowtie Code

Brought to you by: ben_langmead, ch4rr0, ctrapnell, mcschatz
[r101]: / myrna / Stats.R Maximize Restore History

377 lines (332 with data), 11.8 kB

##
# Stats.R
#
# Authors: Jeff Leek & Ben Langmead
#
# Load one or more gene's worth of alignments and calculate a statistic
# on each gene's worth, and output a per-gene record.
#

# Stats.pl loads this when invoking Rscript
#library(lmtest)
#library(MASS)

##
# Increment given Hadoop counter by given amount.  Hadoop interprets
# lines of stderr output with format "reporter:counter:<name>:<amt>" as
# a request to atomically increment the global counter called <name> by
# amount <amt>.
#
counter <- function(names, amts) {
	sink(stderr())
	cat(paste("reporter:counter:Stats,", names, ",", amts, sep="", collapse="\n"), "\n")
	flush(stderr())
	sink()
}

##
# Write something to stderr with a newline
#
msg <- function(...) {
	sink(stderr())
	cat("Stats.R [")
	cat(format(Sys.time(), "%H:%M:%S"))
	cat("]: ")
	cat(...)
	cat('\n')
	sink()
}

##
# Get number of rows for an object that might be a vector or a matrix.
# Vectors have 1 row.
#
rows <- function(vecOrMat) {
	nr <- nrow(vecOrMat)
	if(is.null(nr) && length(vecOrMat) > 0) {
		1
	} else if(is.null(nr)) {
		0
	} else {
		nr
	}
}

##
# Wrap glm and check arguments
#
glmWrap <- function(dat, grps, family, offset) {
	if(length(dat) != length(as.factor(grps))) {
		msg(c("Warning: data vector (",dat,") has length",length(dat),"but groups vector (",grps,") has length",length(grps),"; offsets=(",offset,")"))
		q(status=1)
	}
	if(length(dat) != length(offset)) {
		msg(c("Warning: data vector (",dat,") has length",length(dat),"but offset vector (",offset,") has length",length(offset),"; grps=(",grps,")"))
		q(status=1)
	}
	dat.l <- if(family == "gaussian") { log(dat+1) } else { dat }
	glm(dat.l ~ as.factor(grps) + log(offset+1), family=family)
}

##
# Wrap paired glm and check arguments
#
glmWrapPaired <- function(dat, grps, pairs, family, offset) {
	if(length(dat) != length(as.factor(grps))) {
		msg(c("Warning: data vector (",dat,") has length",length(dat),"but groups vector (",grps,") has length",length(grps),"; offsets=(",offset,")"))
		q(status=1)
	}
	if(length(dat) != length(offset)) {
		msg(c("Warning: data vector (",dat,") has length",length(dat),"but offset vector (",offset,") has length",length(offset),"; grps=(",grps,")"))
		q(status=1)
	}
	if(length(dat) != length(pairs)) {
		msg(c("Warning: data vector (",dat,") has length",length(dat),"but pairs vector (",pairs,") has length",length(offset),"; grps=(",grps,")"))
		q(status=1)
	}
	dat.l <- if(family == "gaussian") { log(dat+1) } else { dat }
	glm(dat.l ~ as.factor(grps) + as.factor(pairs) + log(offset+1), family=family)
}

##
# Calculate Pvals for one row of data, assuming there are two groups
# and they're matched in alphabetical order.
#
calcPairedPvalRow <- function(dat, grps.row, pairs, ltots, family) {
	glm1 <- glmWrapPaired(dat, grps.row, pairs, family, ltots)
	# Note: Unlike with lrtest(), [2,5] is not necessarily >= 0 here
	summary(glm1)$coeff[2,4]
}

##
# Calculate Pvals for one row of data and one grouping.
#
calcPvalRow <- function(dat, grps.row, ltots, family, stats) {
	glm1 <- glmWrap(dat, grps.row, family=family, offset=ltots)
	dat.l <- if(family == "gaussian") { log(dat+1) } else { dat }
	glm0 <- glm(dat.l ~ log(1+ltots), family=family)
	lrtest(glm0, glm1)[2,if(stats) {4} else {5}]
}

##
# Calculate Pvals for one row of data and a matrix of groupings.
#
calcPval <- function(dat, grps.mat, ltots, family, stats) {
	apply(grps.mat, 1, calcPvalRow, dat=dat, ltots=ltots, family=family, stats=stats)
}

##
# Calculate null Pvals for a matrix of data with given grouping.
#
calcNullPvals <- function(nulls.per.unit, dat.mat, grps, ltots, family) {
	grps.mat <- matrix(replicate(nulls.per.unit, sample(grps, length(grps))), nulls.per.unit, byrow=T)
	counter("Null Pvals calculated", nulls.per.unit * rows(dat.mat))
	counter("Null Pval batches calculated", 1)
	apply(dat.mat, 1, calcPval, grps.mat=grps.mat, ltots=ltots, family=family, stats=T)
}

##
# Calculate observed Pvals for a matrix of data with given grouping.
#
calcObsPvals <- function(dat.mat, grps, ltots, family, stats) {
	counter("Observed Pval batches calculated", 1)
	if(is.null(nrow(dat.mat)) && length(dat.mat) > 0) {
		counter("Observed Pvals calculated", 1)
		calcPvalRow(dat.mat, grps, ltots, family)
	} else {
		counter("Observed Pvals calculated", nrow(dat.mat))
		apply(dat.mat, 1, calcPvalRow, grps.row=grps, ltots=ltots, family=family, stats=stats)
	}
}

##
# Calculate observed Pvals for a matrix of data with given grouping,
# assuming the groups match up.
#
calcObsPairedPvals <- function(dat.mat, grps, ltots, family) {
	counter("Observed Pval batches calculated", 1)
	pairs <- rep(1:(length(grps)/2), 2)
	if(is.null(nrow(dat.mat)) && length(dat.mat) > 0) {
		counter("Observed Pvals calculated", 1)
		calcPairedPvalRow(dat.mat, grps, pairs, ltots, family)
	} else {
		counter("Observed Pvals calculated", nrow(dat.mat))
		apply(dat.mat, 1, calcPairedPvalRow, grps.row=grps, pairs=pairs, ltots=ltots, family=family)
	}
}

##
# Calculate fake Pvals (all 1) for a matrix of data.
#
calcFakePvals <- function(dat.mat) {
	counter("Fake Pval batches calculated", 1)
	if(is.null(nrow(dat.mat)) && length(dat.mat) > 0) {
		counter("Fake Pvals calculated", 1)
		1
	} else {
		counter("Fake Pvals calculated", nrow(dat.mat))
		rep(1, nrow(dat.mat))
	}
}

##
# Convert labels to group names by leading - and everything after.  If
# there's no dash, leave it alone. 
#
labToGroup <- function(labs) { sub("-.*$", "", labs) }

##
# Format a list of statistics or P-values so that they sort properly.
# This involves clamping Inf and extremely large numbers down to about
# 10^10.
#
formatVals <- function(vals, digits, width) {
	formatC(pmin(vals, 999999999), flag="0", format="f", digits=digits, width=width)
}

##
#
#
deTest <- function(fn, all.labels.str, famstr, nulls.per.unit, bypass.pvals, add.fudge, paired) {
	
	msg(c("Called deTest(", fn, ", ", all.labels.str, ", ", famstr, ", ", bypass.pvals, ", ", add.fudge, ", ", paired, ")"), sep="")

	# Read in the alignments
	reads <- scan(
		fn, sep='\t', quote="", comment.char="",
		allowEscapes=F, multi.line=F, what=list(
			"character",  # Name
			"integer",    # Offset
			"character",  # Strand
			"integer",    # SeqLen
			"character",  # Ignore
			"character",  # Mis
			"character",  # Strat
			"character",  # Label (e.g. "H-1", "T-19")
			"integer"))   # Norm (any type of normalization factor)
	
	names(reads) <- c("gene", "offset", "strand", "seqlen", "oms",
	                  "cigar", "strat", "label", "norm")
	
	nreads <- length(reads$gene)
	msg(c("Processing batch of", nreads, "alignments"))

	# Get the unit of analysis (genes, exons, etc. by strand)
	
	units <- table(reads$gene)
	units.vals <- names(units)
	nunits <- length(units.vals) # nunits = # of different genes in this batch

	msg(c("Alignment batch has", nunits, "genes"))
	msg(c("Genes:", paste(units.vals, sep=",")))

	# Get the labels in this batch
	labels <- table(reads$label)
	labels.vals <- names(labels)
	nsamples <- length(labels.vals) # nsamples = # of different labels in batch

	msg(c("Alignment batch has", nsamples, "distinct labels"))
	msg(c("Labels:", paste(labels.vals, sep=",")))

	# Get the labels in the whole dataset (passed in from wrapper)
	all.labels <- unlist(strsplit(all.labels.str, split=",", fixed=TRUE))
	if(length(all.labels) != length(unique(all.labels))) {
		msg(c("Warning! labels list has non-unique elements"))
		q(status=1)
	}
	all.nsamples <- length(all.labels)

	msg(c("Whole dataset has", all.nsamples, "distinct labels"))
	msg(c("Labels:", paste(all.labels, sep=",")))

	# Ensure all the batch's labels are legit
	if(!all(labels.vals %in% all.labels)) {
		msg(c("ERROR: one or more batch labels were not in the whole-dataset list of labels"))
		q(status=1)
	}

	# Get the groups in this batch
	grp <- table(labToGroup(reads$label))
	grp.vals <- names(grp)
	ngrps <- length(grp.vals)

	msg(c("Batch has", ngrps, "distinct groups"))
	msg(c("Groups:", paste(grp.vals, sep=",")))

	# Get the groups in the whole dataset
	all.grp <- labToGroup(all.labels)
	all.grp.vals <- unique(all.grp)

	msg(c("Whole dataset has", length(all.grp.vals), "distinct groups"))
	msg(c("Groups:", paste(all.grp.vals, sep=",")))
	
	if(paired) {
		nulls.per.unit <- 0
		all.grp.tab <- table(all.grp)
		if(dim(all.grp.tab) != 2) {
			msg(c("Error: Stats.R deTest() called with paired=True, but there are",dim(all.grp.tab),"distinct groups in the input"))
			q(status=1);
		}
		g1 <- all.grp.tab[1]
		g2 <- all.grp.tab[2]
		if(g1 != g2) {
			msg(c("Error: Stats.R deTest() called with paired=True, but the",
			      "number of samples per group is different:",
			      g1, "in group", all.grp[1], "and", g2, "in", all.grp[2]))
			q(status=1);
		}
	}

	if(!all(grp.vals %in% all.grp.vals)) {
		msg(c("ERROR: one or more batch groups were not in the whole-dataset list of groups"))
		q(status=1)
	}

	msg("Factorizing label column")
	reads$label <- factor(reads$label, levels=all.labels)

	msg("Getting per-sample normalization factors")
	lab.by.norm.tab <- table(reads$label, reads$norm)
	totals <- apply(lab.by.norm.tab, 1, which.max)
	totals <- colnames(lab.by.norm.tab)[totals]
	totals <- as.integer(totals) + add.fudge
	names(totals) <- rownames(lab.by.norm.tab)

	# Set up the data matrix
	msg("Setting up data matrix")
	dat <- as.matrix(table(reads$gene, reads$label)) + add.fudge
	
	if(length(names(totals)) != length(colnames(dat)) ||
	   any(names(totals) != colnames(dat)))
	{
		msg(c("Names of the normalization factors (", names(totals),
		      ") did not match with column names of the dat matrix (",
		      colnames(dat), ")"))
		q(status=1)
	}
	
	msg("Setting up output strings")
	datstr <- apply(dat, 1, paste, sep=",", collapse=",")
	
	digits <- if (nulls.per.unit > 0) { 10 } else { 10 }
	width <- if (nulls.per.unit > 0) { 10+digits } else { 10+digits }
	pvals <- if(bypass.pvals || length(all.grp) < 2) {
		msg("Calculating batch of fake P-values")
		calcFakePvals(dat)
	} else {
		pvals <- if(paired) {
			msg("Calculating batch of",rows(dat),"observed P-values (paired)")
			calcObsPairedPvals(dat, all.grp, totals, famstr)
		} else {
			msg("Calculating batch of",rows(dat),"observed P-values (unpaired)")
			calcObsPvals(dat, all.grp, totals, famstr, nulls.per.unit > 0)
		}
		if(!all(pvals >= 0)) {
			msg("Some calculated observed P values were < 0")
			sink(stderr())
			print(table(pvals[pvals < 0]))
			q(status=1);
		}
		if(nulls.per.unit == 0) {
			if(!all(pvals <= 1)) {
				msg("Some calculated observed P values were > 1")
				sink(stderr())
				print(table(pvals[pvals > 1]))
				q(status=1);
			}
			abs(-log(pvals))
		} else { pvals }
	}
	if(!all(pvals >= 0)) {
		msg("Some calculated statistics/P-vals were < 0")
		sink(stderr())
		print(pvals)
		q(status=1);
	}
	msg("Outputting observed P-values")
	sink("/dev/stdout")
	cat(paste("1", formatVals(pvals, digits, width), "O", units.vals, sep="\t"), sep="\n")
	if(nulls.per.unit > 0 && !bypass.pvals) {
		msg(c("Calculating a batch of",(nulls.per.unit*rows(dat)),"null P-values for",rows(dat),"genes"))
		nvals <- calcNullPvals(nulls.per.unit, dat, all.grp, totals, famstr)
		msg(c("Outputting batch of null P-values"))
		cat(paste("1", formatVals(nvals, digits, width), "N", sep="\t"), sep="\n")
	}
	sink()
	msg("Finished call to deTest")
}

args <- commandArgs(T)
alnFile <- args[2]
allgrp.str <- args[3]
famstr <- args[4]
nulls.per.unit <- as.integer(args[5])
seed <- as.integer(args[6])
bypass.pvals <- as.logical(as.integer(args[7]))
do.profile <- as.logical(as.integer(args[8]))
add.fudge <- as.integer(args[9])
paired <- as.logical(as.integer(args[10]))

if(do.profile) {
	Rprof(filename = paste("RProfile.Stats.R", Sys.getpid(), sep="."))
}

if(seed >= 0) { set.seed(seed) }

deTest(alnFile, allgrp.str, famstr, nulls.per.unit, bypass.pvals, add.fudge, paired)