
#!/usr/bin/perl -w

##
# Author: Ben Langmead
#   Date: February 11, 2010
#
# Initiate a Hadoop streaming Myrna job.  Must be run on the Hadoop master node.
#

use strict;
use warnings;
use Getopt::Long qw(:config pass_through);
use FindBin qw($Bin);
use lib $Bin;
use MyrnaIface;
use Cwd 'abs_path';

my $APP = "Myrna";
my $app = lc $APP;
my $SCRIPT = "myrna_hadoop";
my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;

my $usage = qq{
$SCRIPT: Run $APP v$VERSION as a Hadoop job

Usage: perl $SCRIPT --input <url> --output <url> \\
                    [--reference <path> | --just-preprocess] [options]

Options (defaults in []):

 Job params:

  --dry-run              Produce and print path to a script for running the
                         Hadoop job, but don't run it.
  --input <url>          HDFS or S3 URL for input.  URL is typically a
                         directory containing preprocessed reads.  If
                         --preprocess or --just-preprocess is enabled, URL is
                         a manifest file.  If --resume-align is enabled, URL
                         is a directory containing $APP alignments.
  --output <url>         Final output (can be HDFS, S3)
  --intermediate <url>   Intermediate output (can be HDFS, S3).  Use an S3 URL
                         if you'd like to keep intermediate results after the
                         cluster is deallocated. [hdfs:///crossbow]
  --partition-len <int>  Partition length in bases [1 million]
  --bowtie <path>        Path to bowtie binary on slaves [search
                         \$MYRNA_BOWTIE_HOME, \$MYRNA_HOME/bin, \$PATH locally]
  --Rhome <path>         R home on slaves [search \$MYRNA_RHOME, \$MYRNA_HOME/R,
                         \$PATH locally]

 $APP params (affect results):

  --reference <path>     Path to directory with reference information (must be
                         local, with "index" and "ivals" subdirectories)
  --bin <int>            If set, Myrna uses non-overlapping genomic bins
                         instead of gene/exon intervals.  <int> = bin width
                         [off].  If used with --sam-passthrough, --reference
                         need not be specified.
  --top <int>            Make plots and save alignments for <int> genes with
                         lowest P values [50]
  --family <family>      Statistical family to use for differential expression
                         test; one of: "poisson", "gaussian" [poisson]
  --norm <type>          Per-gene count normalization scheme to use; one of:
                         "upper-quartile", "median", "total" ["upper-quartile"]
  --ival-model <type>    Interval model used to model genes; one of: "union",
                         "union-constitutive", "union-intersection"
                         ["union-intersection"]
  --influence <int>      Maximum "influence" per alignment.  If alignment
                         length is < <int>, overlaps are calculated as if
                         length = <int> [1]
  --from5prime           Measure a read's influence from the 5' end [from 3'
                         end]
  --sam-passthrough      Re-use the alignments from SAM input instead of
                         calculating new ones.  If used with --bin,
                         --reference need not be specified.
  --sam-lab-rg           When --sam-passthrough is specified, set each read's
                         label equal to the value of the RG:Z optional field.
  --just-align           Stop after alignment; alignments put in --output [off]
  --just-count           Don't do normalization or statistics; --output will
                         just contain per-gene counts [off]
  --resume-align         --input URL is an S3 intermediate directory from a
                         previous run; pipeline resumes before overlapping
                         [off]
  --resume-olaps         --input URL is an S3 intermediate directory from a
                         previous run; resumes before normalization [off]
  --resume-normal        --input URL is an S3 intermediate directory from a
                         previous run; resumes before statistics [off]
  --resume-stats         --input URL is an S3 intermediate directory from a
                         previous run; resumes before summarization [off]
  --resume-summ          --input URL is an S3 intermediate directory from a
                         previous run; resumes before postprocessing [off]
  --bowtie-args "<args>" Arguments for Bowtie [-m 1] (Note: --partition --mm -t
                         --hadoopout --startverbose are always set by $APP)
  --bypass-pvals         Don't calculate interval statistics; all intervals
                         get a differential expression P-value of 1. [off]
  --quality <type>       Encoding for sequence quality values; one of: phred33,
                         phred64, solexa64 [phred33]
  --discard-reads <frac> Randomly discard specified fraction of input reads.
                         [off]
  --truncate <int>       Truncate reads longer than <int> bases to <int> bases
                         by trimming from the 3' end.
  --truncate-discard <int> Same as --truncate except that reads shorter than
                         <int> bases are discarded.

 Preprocessing params (not necessary if --input points to preprocessed reads):

  --preprocess           --input URL is a manifest file describing a set of
                         unpreprocessed FASTQ read files; preprocess them
                         before running $APP [off]
  --just-preprocess      Like --preprocess but $APP isn't run; --output
                         contains preprocessed reads [off]
  --pre-output <url>     If --preprocess is on, put preprocessed output here
                         instead of in the intermediate directory [off].  Has
                         no effect if --just-preprocess is specified (--output
                         is used instead).  Useful if future jobs will use the
                         same input.
  --pre-compress <type>  Compression type; one of: gzip, none [gzip]
  --pre-stop <int>       Stop preprocessing after <int> reads/mates [no limit]
  --pre-filemax <int>    Split preprocessed output such that there are no more
                         than <int> reads/mates per preprocessed read file;
                         0 = no limit. [500,000]

 Other params:

  --test                 Try to locate all necessary software; print a helpful
                         message showing what was found and quit [off]
  --tempdir <path>       Put temporary scripts in <path>
                         [/tmp/$APP/invoke.scripts]
                         (umask 0077 used to protect credentials)
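
 Examples (illustrative; substitute your own URLs, paths and reference
 directory for these made-up ones):

  # Full pipeline on preprocessed reads already in HDFS
  perl $SCRIPT --input hdfs:///$app/reads --output hdfs:///$app/results \\
               --reference /data/$app/hg19

  # Just preprocess a manifest of FASTQ read URLs; $APP itself is not run
  perl $SCRIPT --just-preprocess --input /data/manifest.txt \\
               --output hdfs:///$app/preproc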

};

sub dieusage($$$) {
	my ($text, $usage, $lev) = @_;
	print STDERR "$usage\nError:\n";
	print STDERR "$text\n\n";
	exit $lev;
}

# Try to avoid forcing the user to use the equals sign in cases where
# they're specifying a set of arguments, as in --bowtie-args "-n 3 -l 35"
for(my $i = 0; $i < scalar(@ARGV)-1; $i++) {
	if($ARGV[$i] =~ /^-.*-args$/) {
		$ARGV[$i] = "$ARGV[$i]=\"".$ARGV[$i+1]."\"";
		splice @ARGV, $i+1, 1;
	}
}
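
# For example, a (hypothetical) invocation like:
#   myrna_hadoop --bowtie-args "-n 3 -l 35" ...
# reaches this script as two @ARGV elements, ("--bowtie-args", "-n 3 -l 35");
# the loop above merges them into one element, --bowtie-args="-n 3 -l 35",
# so the value stays attached to its option on the way to MyrnaIface.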

my $input = "";
my $output = "";
my $intermediate = "";
my $ref = "";
my $bowtie = "";
my $Rhome = "";
my $samtools = "";
my $verbose = 0;
my $test = 0;

GetOptions (
	"input:s"        => \$input,
	"output:s"       => \$output,
	"intermediate:s" => \$intermediate,
	"reference:s"    => \$ref,
	"Rhome:s"        => \$Rhome,
	"bowtie:s"       => \$bowtie,
	"samtools:s"     => \$samtools,
	"test"           => \$test,
	"verbose"        => \$verbose
);
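
# Note: because pass_through is enabled above, options not declared in this
# GetOptions call (e.g. --bowtie-args, --preprocess) remain in @ARGV; they
# are appended to @args near the bottom of this script and forwarded to
# MyrnaIface untouched.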

##
# Take a path and make it absolute.  If it has a protocol, assume it's
# already absolute.
#
sub absPath($$$) {
	my ($path, $check, $name) = @_;
	return $path if $path =~ /^s3n?:\//i;
	return $path if $path =~ /^hdfs:\//i;
	return $path if $path =~ /^file:\//i;
	$path =~ s/^~/$ENV{HOME}/;
	die "Error: $name path doesn't exist: $path" unless (!$check || -f $path || -d $path);
	return abs_path($path);
}
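
# For example (illustrative):
#   absPath("~/refs/human", 1, "--reference") expands the ~ and returns
#     something like "/home/user/refs/human", dying first if the path does
#     not exist (because $check is true);
#   absPath("hdfs:///myrna/reads", 1, "--input") is returned unchanged,
#     since paths carrying a protocol are assumed to be absolute already.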

if($verbose) {
	print STDERR "Relative paths:\n";
	print STDERR "  input: $input\n";
	print STDERR "  output: $output\n";
	print STDERR "  intermediate: $intermediate\n";
	print STDERR "  reference: $ref\n";
	print STDERR "  Rhome: $Rhome\n";
	print STDERR "  bowtie: $bowtie\n";
	print STDERR "  samtools: $samtools\n";
}

$input        = absPath($input, 1, "--input") if $input ne "";
$output       = absPath($output, 0, "--output") if $output ne "";
$intermediate = absPath($intermediate, 0, "--intermediate") if $intermediate ne "";
$ref          = absPath($ref, 1, "--reference") if $ref ne "";
$Rhome        = absPath($Rhome, 1, "--Rhome") if $Rhome ne "";
$bowtie       = absPath($bowtie, 1, "--bowtie") if $bowtie ne "";
$samtools     = absPath($samtools, 1, "--samtools") if $samtools ne "";

if($verbose) {
	print STDERR "Absolute paths:\n";
	print STDERR "  input: $input\n";
	print STDERR "  output: $output\n";
	print STDERR "  intermediate: $intermediate\n";
	print STDERR "  reference: $ref\n";
	print STDERR "  Rhome: $Rhome\n";
	print STDERR "  bowtie: $bowtie\n";
	print STDERR "  samtools: $samtools\n";
}

if(!$test) {
	$input ne ""  || dieusage("Must specify --input",  $usage, 1);
	$output ne "" || dieusage("Must specify --output", $usage, 1);
}

my @args = ();

push @args, "--hadoop-job";
push @args, ("--input", $input) if $input ne "";
push @args, ("--output", $output) if $output ne "";
push @args, ("--intermediate", $intermediate) if $intermediate ne "";
push @args, ("--reference", $ref) if $ref ne "";
push @args, ("--Rhome", $Rhome) if $Rhome ne "";
push @args, ("--bowtie", $bowtie) if $bowtie ne "";
push @args, ("--samtools", $samtools) if $samtools ne "";
push @args, "--verbose" if $verbose;
push @args, "--test" if $test;

# Per the usage message, --reference may be omitted when just preprocessing
# (--just-preprocess), or when --bin is combined with --sam-passthrough.
# Those options are handled by MyrnaIface and are still in @ARGV here.
my $refOptional =
	grep(/^--just-preprocess$/, @ARGV) ||
	(grep(/^--bin(=|$)/, @ARGV) && grep(/^--sam-passthrough$/, @ARGV));
$ref ne "" || $test || $refOptional ||
	dieusage("Must specify --reference", $usage, 1);

push @args, @ARGV;

MyrnaIface::myrna(\@args, $SCRIPT, $usage, undef, undef, undef, undef);