[go: up one dir, main page]

Menu

[r101]: / myrna / Assign.R  Maximize  Restore  History

Download this file

291 lines (262 with data), 9.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
##
# Assign.R
#
# Authors: Ben Langmead and Hector Corrada Bravo
# Date: November 5, 2009
#
# Load a chunk of alignment tuples from a file, overlap them with a set
# of relevant genomic ranges, then output a new stream of tuples for
# every overlap. Also, for every read label involved in any overlap,
# output a set of per-interval subtotals which can be used for
# normalization across replicates later on.
#
# Assign.pl loads this when invoking Rscript
# library(IRanges)
##
# Write something to stderr with a newline
#
msg <- function(...) {
sink(stderr())
cat("Assign.R [")
cat(format(Sys.time(), "%H:%M:%S"))
cat("]: ")
cat(...)
cat('\n')
flush(stderr())
sink()
}
args <- commandArgs(T)
# Get alignment file from first arg after "--"
alnFile <- args[2]
maxAlgnInfluence <- as.integer(args[3])
ivalsFromDir <- args[4]
ivalsDir <- args[5]
binWidth <- as.integer(args[6])
from3prime <- args[7]
fromMiddle <- args[8]
ivalsFromDir <- (ivalsFromDir == "0")
from3prime <- (from3prime == "1")
fromMiddle <- (fromMiddle == "1")
msg("Alignment file:", alnFile)
msg("Influence:", maxAlgnInfluence)
msg("Ivals from dir?:", ivalsFromDir)
msg("Ivals dir:", ivalsDir)
msg("Bin width:", binWidth)
msg("From 3':", from3prime)
msg("From middle:", fromMiddle)
##
# Increment given Hadoop counter by given amount. Hadoop interprets
# lines of stderr output with format "reporter:counter:<name>:<amt>" as
# a request to atomically increment the global counter called <name> by
# amount <amt>.
#
counter <- function(names, amts) {
sink(stderr())
cat(paste("reporter:counter:Assign,", names, ",", amts, sep="", collapse="\n"), "\n")
flush(stderr())
sink()
}
##
# Calculate overlaps given arbitrary intervals of interest.
#
calcOlap <- function(ranges1, ranges2) {
matchMatrix(findOverlaps(ranges2, ranges1))
}
##
# Calculate overlaps given that intervals of interest are bins.
#
calcBinOlap <- function(ranges1, bwidth) {
cbind(as.integer(start(ranges1)/bwidth)+1, seq(from=1, to=length(ranges1)));
}
##
# Load ranges from an interval file
#
rangesFromIvalFile <- function(dir, chromo) {
# First try raw chromosome id, then try "chr" followed by id
fname1 <- paste(dir, "/", chromo, ".ivals", sep="")
fname2 <- paste(dir, "/chr", chromo, ".ivals", sep="")
fname3 <- paste(dir, "/other.ivals", sep="")
what <- list("", # Chr
"", # Gene,
integer(0), # Start
integer(0)) # End
ivs <- if(file.exists(fname1)) {
msg("Reading intervals from", fname1)
scan(fname1, what=what, sep='\t', quote="")
} else if(file.exists(fname2)) {
msg("Reading intervals from", fname2)
scan(fname2, what=what, sep='\t', quote="")
} else if(file.exists(fname3)) {
msg("Reading intervals from", fname3)
counter("Had to use other.ivals", 1)
scan(fname3, what=what, sep='\t', quote="")
} else {
counter("No ival file", 1)
list( as.character(c()), as.character(c()),
as.integer(c()), as.integer(c()) )
}
niv <- length(ivs[[1]])
msg(c("Read", niv, "intervals"))
# Give columns reasonable labels
names(ivs) <- c('chromo', 'name', 'start', 'end')
class(ivs) <- "data.frame"
attr(ivs, "row.names") <- as.character(seq(len=niv))
ivs
}
##
# Create ranges given a bin width
#
rangesFromBins <- function(bwidth, chromo) {
l <- list()
starts <- seq(from=0, to=300000000, by=bwidth)
ends <- seq(from=bwidth-1, by=bwidth, length.out=length(starts))
l[[1]] <- rep(chromo, length(starts))
l[[2]] <- paste(chromo, "_", format(starts, scientific=F, trim=T), sep="");
l[[3]] <- starts
l[[4]] <- ends
names(l) <- c('chromo', 'name', 'start', 'end')
class(l) <- "data.frame"
attr(l, "row.names") <- as.character(seq(len=length(l[[1]])))
l
}
# Read partition alignments from file
msg("Reading in alignments")
what <- list("", # Chr
integer(0), # Part
integer(0), # ChrOff
"", # Orient
integer(0), # Seq length
integer(0), # Oms
"", # CIGAR
"", # Mate
"") # Lab
#what <- list("", # Chr
# integer(0), # Part
# integer(0), # ChrOff
# "", # Orient
# "", # Seq
# "", # Qual
# integer(0), # Oms
# "", # CIGAR
# "", # Mate
# "") # Lab
# Give columns reasonable labels
alns <- scan(alnFile, what=what, sep='\t', quote="",
allowEscapes=F, multi.line=F)
names(alns) <- c('Chr', 'Part', 'ChrOff', 'Orient', 'SeqLen',
'Oms', 'CIGAR', 'Mate', 'Lab')
class(alns) <- "data.frame"
nread <- length(alns$Chr)
attr(alns, "row.names") <- as.character(seq(len=nread))
counter("Alignments read by Assign.R", nread)
msg(c("Read", nread, "alignments from", alnFile))
# Abort if there were no alignments
if(nread == 0) {
counter("0-read inputs", 1)
q(status=1);
}
alns.split <- split(alns, alns$Chr)
##
# Given a bundle of alignments falling into the same chromosome, read
# the corresponding ivals file, calculate overlaps, and emit overlaps.
#
handleChr <- function(alns) {
nread <- length(alns$Chr)
if(nread == 0) {
msg("Error, no reads in call to handleChr")
q(status=1)
}
# Extract chromosome name from first alignment (all alignments in alns
# are in the same partition, and so should have same chromosome name)
chromo <- alns$Chr[1]
# Load gene annotations from files; only load genes and only load
# annotations from our chromosome
msg(c("Reading interval file for chromosome", chromo, "in dir", ivalsDir))
ivals = if(ivalsFromDir) {
rangesFromIvalFile(ivalsDir, chromo)
} else {
rangesFromBins(binWidth, chromo)
}
attach(ivals, name="ivals")
# If there aren't any intervals to overlap, bail
if(length(ivals[[1]]) == 0) {
msg("Aborting because there are no ranges to overlap")
detach("ivals")
return(NULL)
}
ranges <- IRanges(start, end)
# Extract read intervals from reads
msg("Converting reads to ranges")
readlens <- alns$SeqLen
widths <- readlens
widths <- pmin(readlens, maxAlgnInfluence)
ori <- if(from3prime) { "+" } else { "-" }
readbegins <- alns$ChrOff + if(fromMiddle) {
# From the middle
floor((readlens - widths + as.integer(alns$Orient == "-")) / 2)
} else {
# From either the 3' or the 5' end
as.integer(alns$Orient == ori) * (readlens - widths)
}
readranges <- IRanges(readbegins, width=widths)
# Update counters since we may abort soon if there are no overlaps
counter("Alignments handled by Assign.R", nread)
counter("Chromosome bins handled by Assign.R", 1)
# Overlap read intervals with genomic intervals
# Query = genomic intervals, Subject = reads
msg("Finding overlaps")
olaps <- if(ivalsFromDir || maxAlgnInfluence > 1) {
counter("Calls to calcOlap", 1)
calcOlap(readranges, ranges)
} else {
counter("Calls to calcBinOlap", 1)
calcBinOlap(readranges, binWidth)
}
if(length(olaps) == 0 | nrow(olaps) == 0) {
msg("Aborting because there are no overlaps")
counter("No-overlap chromosome bins", 1)
detach("ivals")
return(NULL)
}
msg(c("Overlap matrix is", nrow(olaps), "rows by", ncol(olaps),"cols"))
# Check each possible overlap and, where one exists, print an alignment
# tuple; pri key = Lab, sec key = 1
# Add 100,000 to the IvalOffs so that they're all positive; we have to
# keep them positive and pad them with 0s in order for Hadoop to sort
# them properly.
msg("Outputting overlapping alignments")
outdf <- cbind(Lab=sub("/[12]$", "", alns$Lab[olaps[,2]]),
Part="999999999",
alns[olaps[,2],4:8],
Ival=name[olaps[,1]],
IvalOff=formatC(readbegins[olaps[,2]], 10, flag="0"))
write.table(outdf,
col.names=FALSE, row.names=FALSE, quote=FALSE, sep='\t')
msg("Outputting per-label subtotals")
# Grab just the labels for the overlapped reads
labtab <- table(outdf$Lab)
labtab <- cbind(row.names(labtab), labtab)
# Make a table counting overlaps per-label-per-interval
labIvalTab <- table(paste(outdf$Lab, " ", outdf$Ival))
# Remove the interval name from the key and pad the count with 0's
# so that it can be sorted lexicographically by Hadoop
labIvalTabRows <- row.names(labIvalTab)
labIvalTab <- cbind(sub(" .*", "", labIvalTabRows), # just label
formatC(labIvalTab, 8, flag="0"),
sub(".* ", "", labIvalTabRows)) # just gene id
# Output a label subtotal for each label that appeared in an overlap
# A secondary key of 0 is given so that the subtotals are
# guaranteed to arrive at the next Reduce step before any of the
# alignments (alignments have secondary key = 1)
write.table(labIvalTab,
col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")
msg("Finished outputting per-label subtotals")
# Also update global counters
counter(paste("Overlaps for label ", labtab[,1], sep=""), labtab[,2])
# Print Hadoop counter updates
counter("Overlaps output by Assign.R", nrow(olaps))
detach("ivals")
}
# Split by chromosome, dispatch each chromosome bin to handleChr
invisible(lapply(alns.split, handleChr))
counter("Invocations of Assign.R", 1)