Bowtie Code

Brought to you by: ben_langmead, ch4rr0, ctrapnell, mcschatz
[r101]: / myrna / Postprocess.R Maximize Restore History

376 lines (345 with data), 11.4 kB

##
# Postprocess.R
#
# Authors: Jeff Leek & Ben Langmead
# 'qvalue' function borrowed from 'qvalue' package by John D. Storey
#
# Postprocess Myrna results into a series of R plots.
#

##
# Write something to stderr with a newline
#
msg <- function(...) {
	sink(stderr())
	cat("Postprocess.R: ")
	cat(...)
	cat(" at ")
	cat(format(Sys.time(), "%H:%M:%S"))
	cat('\n')
	flush(stderr())
	sink()
}

##
# Convert labels to group names by removing trailing - and everything
# after.  If there's no dash, leave it alone. 
#
labToGroup <- function(labs) { sub("-.*$", "", labs) }

##
# Convert labels to samples by removing everything up to and including
# the first dash.  If there's no dash, leave it alone.
#
labToSample <- function(labs) { sub("^[^-]*-", "", labs) }

##
# Plot coverage for one gene.
#
plotGene <- function(inaln, exons) {
	msg("Alignments: ", inaln)
	genename <- sub(".txt$", "", inaln)
	msg("Gene name: ", genename)
	inaln <- paste("alignments/", inaln, sep="")
	msg("Alignments (adjusted): ", inaln)
	
	# Read alignments
	reads <- read.table(
		inaln, sep='\t', as.is=T, quote="", header=F,
		comment.char="", colClasses=c(
			"character",  # Sample
			"integer",    # Offset
			"character",  # Strand
			"integer",    # SeqLen
			"character",  # Oms
			"character",  # CIGAR
			"character")) # Mate
	colnames(reads) <-
		c("sample", "offset", "strand", "seqlen", "oms", "cigar", "mate")

 	if(length(reads$sample) == 0) {
 		msg("Read 0 alignments")
		q(status=1);
	}
	readlens <- reads$seqlen

	# Get the annotation information
	sexons <- if(is.null(exons)) { list() } else { exons[(exons$gene == genename),] }
	bb <- as.numeric(as.vector(sexons$start))
	ee <- as.numeric(as.vector(sexons$end))
	nexons <- length(bb)

	# Get the sample labels
	slabels <- reads$sample
	slabels.vals <- sort(unique(slabels))
	nslabels <- length(slabels.vals)

	# Get the groups
	grp <- labToGroup(slabels)
	samp <- labToSample(slabels)

	# Get the colors
	ntot <- sum(table(grp, samp) > 0)
	cols <- as.numeric(as.factor(unlist(strsplit(unique(reads$sample),"-"))[1:ntot*2-1]))

	# Find the length of the reads
	start <- reads$offset
	end <- reads$offset + readlens
	nreads <- dim(reads)[1]

	# Get the positions
	pos <- min(reads$offset):(max(reads$offset) + readlens[nreads])

	# Make the individual plot
	pdf(file=paste(paste("alignments/", genename, sep=""), "replicates.pdf", sep="_"))
	plot(pos,pos,ylim=c(-1,ntot),type="n",xlab="Genomic Position",ylab="",yaxt="n")
 	axis(side=2,at=c(-0.5,1:ntot),labels=c("Exons",unique(reads$sample)),las=2)
	msg("Drawing",nexons,"exons for per-replicate plot")
	if(nexons > 0) {
		for(i in 1:nexons){
			polygon(c(bb[i],bb[i],ee[i],ee[i]),c(-0.60,-0.40,-0.40,-0.60),lwd=2,col="green")
		}
	}

	# Get the max
	mx <- 0
	for(i in 1:length(slabels.vals)){
		ss <- c(min(reads$offset), reads$offset[slabels==slabels.vals[i]], max(reads$offset))
		ww <- c(2,nchar(readlens[slabels==slabels.vals[i]]),100)
		ir <- IRanges(ss, (ss + ww - 1))
		cc <- as.vector(coverage(ir,shift=(-min(ss)  + 1),weight=c(0,rep(1,length(ss)-2),0)))
		mx <- max(c(mx,cc))
		#pos <- min(ss):(max(ss) + ww[length(ww)] - 1)
	}

	# Do the plot
	for(i in 1:length(slabels.vals)){
		ss <- c(min(reads$offset),reads$offset[slabels==slabels.vals[i]],max(reads$offset))
		ww <- c(2,nchar(readlens[slabels==slabels.vals[i]]),100)
		ir <- IRanges(ss, (ss + ww - 1))
		cc <- as.vector(coverage(ir,shift=(-min(ss)  + 1),weight=c(0,rep(1,length(ss)-2),0)))/mx
		pos <- min(ss):(max(ss) + ww[length(ww)] - 1)
		lines(pos,cc+i,col=cols[i])
		xx <- c(pos,rev(pos))
		yy <- c(rep(i,length(pos)),rev(cc + i))
		polygon(xx,yy,col=cols[i],border=cols[i])
	}
	invisible(dev.off())

	# Make the average plot
	pdf(file=paste(paste("alignments/", genename, sep=""), "groups.pdf", sep="_"))
	plot(pos,pos,ylim=c(-1,(length(unique(grp))+1)),type="n",xlab="Genomic Position",ylab="",yaxt="n")
	axis(side=2,at=c(-0.5,1:length(unique(grp))),labels=c("Exons",unique(grp)),las=2)
	msg("Drawing",nexons,"exons for grouped plot")
	if(nexons > 0) {
		for(i in 1:nexons){ 
			polygon(c(bb[i],bb[i],ee[i],ee[i]),c(-0.60,-0.40,-0.40,-0.60),lwd=2,col="green")
		}
	}

	# Get the max
	mx <- 0
	for(i in 1:length(unique(grp))){
		ss <- c(min(reads$offset),reads$offset[grp==unique(grp)[i]], max(reads$offset))
		ww <- c(2,nchar(readlens[grp==unique(grp)[i]]),100)
		ir <- IRanges(ss, (ss + ww - 1))
		cc <- as.vector(coverage(ir,shift=(-min(ss)  + 1),weight=c(0,rep(1,length(ss)-2),0)))
		mx <- max(c(mx,cc))
		#pos <- min(ss):(max(ss) + ww[length(ww)] - 1)
	}

	for(i in 1:length(unique(grp))){
		ss <- c(min(reads$offset),reads$offset[grp==unique(grp)[i]],max(reads$offset))
		ww <- c(2,nchar(readlens[grp==unique(grp)[i]]),100)
		ir <- IRanges(ss, (ss + ww - 1))
		cc <- as.vector(coverage(ir,shift=(-min(ss)  + 1),weight=c(0,rep(1,length(ss)-2),0)))/mx
		pos <- min(ss):(max(ss) + ww[length(ww)] - 1)
		lines(pos,cc+i,col=cols[i])
		xx <- c(pos,rev(pos))
		yy <- c(rep(i,length(pos)),rev(cc + i))
		polygon(xx,yy,col=i,border=i)
	}
	invisible(dev.off())
}

##
# Borrowed (with GUI stuff removed) from John Storey's qvalue package.
#
qvalue <- function(p = NULL, lambda = seq(0, 0.9, 0.05), pi0.method = "smoother",
                   fdr.level = NULL, robust = FALSE, gui = FALSE, smooth.df = 3,
                   smooth.log.pi0 = FALSE)
{
	if (min(p) < 0 || max(p) > 1) {
		msg("ERROR: p-values not in valid range.")
		return(0)
	}
	if (length(lambda) > 1 && length(lambda) < 4) {
		msg("ERROR: If length of lambda greater than 1, you need at least 4 values.")
		return(0)
	}
	if (length(lambda) > 1 && (min(lambda) < 0 || max(lambda) >= 1)) {
		msg("ERROR: Lambda must be within [0, 1).")
		return(0)
	}
	m <- length(p)
	if (length(lambda) == 1) {
		if (lambda < 0 || lambda >= 1) {
			msg("ERROR: Lambda must be within [0, 1).")
			return(0)
		}
		pi0 <- mean(p >= lambda)/(1 - lambda)
		pi0 <- min(pi0, 1)
	}
	else {
		pi0 <- rep(0, length(lambda))
		for (i in 1:length(lambda)) {
			pi0[i] <- mean(p >= lambda[i])/(1 - lambda[i])
		}
		if (pi0.method == "smoother") {
			if (smooth.log.pi0) 
				pi0 <- log(pi0)
			spi0 <- smooth.spline(lambda, pi0, df = smooth.df)
			pi0 <- predict(spi0, x = max(lambda))$y
			if (smooth.log.pi0) 
				pi0 <- exp(pi0)
			pi0 <- min(pi0, 1)
		}
		else if (pi0.method == "bootstrap") {
			minpi0 <- min(pi0)
			mse <- rep(0, length(lambda))
			pi0.boot <- rep(0, length(lambda))
			for (i in 1:100) {
				p.boot <- sample(p, size = m, replace = TRUE)
				for (i in 1:length(lambda)) {
					pi0.boot[i] <- mean(p.boot > lambda[i])/(1 - lambda[i])
				}
				mse <- mse + (pi0.boot - minpi0)^2
			}
			pi0 <- min(pi0[mse == min(mse)])
			pi0 <- min(pi0, 1)
		}
		else {
			msg("ERROR: 'pi0.method' must be one of 'smoother' or 'bootstrap'.")
			return(0)
		}
	}
	if (pi0 <= 0) {
		msg("Warning: The estimated pi0 <= 0. Check that you have valid p-values or use another lambda method.")
		return(1)
	}
	if (!is.null(fdr.level) && (fdr.level <= 0 || fdr.level > 1)) {
		msg("ERROR: 'fdr.level' must be within (0, 1].")
		return(0)
	}
	u <- order(p)
	qvalue.rank <- function(x) {
		idx <- sort.list(x)
		fc <- factor(x)
		nl <- length(levels(fc))
		bin <- as.integer(fc)
		tbl <- tabulate(bin)
		cs <- cumsum(tbl)
		tbl <- rep(cs, tbl)
		tbl[idx] <- tbl
		return(tbl)
	}
	v <- qvalue.rank(p)
	qvalue <- pi0 * m * p/v
	if (robust) {
		qvalue <- pi0 * m * p/(v * (1 - (1 - p)^m))
	}
	qvalue[u[m]] <- min(qvalue[u[m]], 1)
	for (i in (m - 1):1) {
		qvalue[u[i]] <- min(qvalue[u[i]], qvalue[u[i + 1]], 1)
	}
	if (!is.null(fdr.level)) {
		retval <- list(call = match.call(), pi0 = pi0, qvalues = qvalue, 
			pvalues = p, fdr.level = fdr.level, significant = (qvalue <= 
				fdr.level), lambda = lambda)
	}
	else {
		retval <- list(call = match.call(), pi0 = pi0, qvalues = qvalue, 
			pvalues = p, lambda = lambda)
	}
	class(retval) <- "qvalue"
	return(retval)
}

##
# A wrapper for qvalue that returns just the qvalues and handles the
# case where it complains about the pi0 estimate.
#
qvalue2 <- function(p) {
	qvals <- qvalue(p)
	qvals <- if(class(qvals) == "numeric" && qvals == 1) {
		msg("Warning: Very low p-values detected, perhaps due to",
		    "very small sample size.  All q-values will be set to 0.")
		rep(0.0, length(p))
	} else {
		qvals$qvalues
	}
}

##
# Plot p-value histogram for p-values in 'pvals' and put the pdf result in 'dest'
#
processPvalues <- function(pvals) {
	msg("PValue file: ", pvals)
	# Read in the alignments
	pv <- read.table(pvals, as.is=T, quote="", header=T)
	if(length(pv[,1]) == 0) {
		msg("Read 0 p-values")
		q(status=1)
	}
	msg("Read ", length(pv[,1]), " p-values")
	qvals <- qvalue2(pv[,2])
	qv <- cbind(pv[,1], format(qvals, scientific=T))
	colnames(qv) <- c("ensembl_gene_id", "q_value")
	write.table(qv, "qvals.txt", row.names=F, col.names=T, sep="\t", quote=F)

	# Make the p-value plot
	pdf(file="pval_hist.pdf")
	hist(as.numeric(pv[,2]), main="p-value histogram", xlab="p-value", col="dodgerblue", xlim=c(0, 1))
	invisible(dev.off())
	pdf(file="pval_hist_dense.pdf")
	hist(as.numeric(pv[,2]), main="p-value histogram", xlab="p-value", col="dodgerblue", xlim=c(0, 1), breaks=80)
	invisible(dev.off())

	# Make the q-value plot
	pdf(file="qval_hist.pdf")
	hist(qvals, main="q-value histogram", xlab="q-value", col="dodgerblue", xlim=c(0, 1))
	invisible(dev.off())
	pdf(file="qval_hist_dense.pdf")
	hist(qvals, main="q-value histogram", xlab="q-value", col="dodgerblue", xlim=c(0, 1), breaks=80)
	invisible(dev.off())

	# Make the p-value versus log-count smoothScatter plot
	counts <- read.table("count_table.txt", as.is=T, quote="", header=T)
	rsum <- rowSums(counts)
	counts.sum <- rsum[rsum > 0]
	if(length(counts.sum) != length(pv[,2])) {
		msg("Warning: length of the non-zero row-sum vector is",length(counts.sum),
		    "but length of p-value vector is",length(pv[,2]))
	} else {
		pdf(file="pval_scatter.pdf")
		smoothScatter(log(counts.sum+1), pv[,2],
		              xlab="Log(sum of all counts+1) per gene", ylab="p-value", main="p-value scatter")
		invisible(dev.off())
	 }
}

exonsFn <- "exons.txt";
exonsFh <- file(exonsFn, open = "r");
exonsFirst <- readLines(exonsFh, n = 1, warn = FALSE);
exonsHead <- unlist(strsplit(exonsFirst, "\t", fixed=TRUE));
# Load exons
cls <- c(
		"character",  # Gene id
		"character",  # Transcript id
		"character",  # Exon id
		"character",  # Chr name
		"integer",    # Exon start
		"integer");   # Exon end
if(length(exonsHead) == 8) {
	# Includes is_constitutive
	cls <- c(cls, "integer"); # Constitutive?
} else {
	# Does not include is_constitutive
	if(length(exonsHead) != 7) {
		# Unexpected number of columns in exons.txt
		msg("Expected exons.txt header line to contain 7 or 8 columns; got:\n", exonsHead);
		q(status=1)
	}
}
cls <- c(cls, "character"); # Biotype

exons <- if(file.exists("exons.txt")) {
	read.table("exons.txt", sep='\t', as.is=T, quote="", header=T, comment.char="", colClasses=cls)
} else { NULL }

if(!is.null(exons)) {
	if(length(exonsHead) == 7) {
		colnames(exons) <- c("gene", "transcript", "exon", "chr", "start", "end", "biotype");
	} else {
		colnames(exons) <- c("gene", "transcript", "exon", "chr", "start", "end", "const", "biotype");
	}
}

args <- commandArgs(T)
cores <- args[2]

if(file_test("-d", "alignments")) {
	mclapply(list.files("alignments", pattern="[.]txt$"), plotGene, exons=exons, mc.cores=cores)
}

processPvalues("pvals.txt")