[go: up one dir, main page]

File: combineKmers.cpp

package info (click to toggle)
seer 1.1.2-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 3,616 kB
  • ctags: 342
  • sloc: cpp: 2,853; perl: 503; python: 122; makefile: 85
file content (104 lines) | stat: -rw-r--r-- 3,064 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/*
 * combineKmers.cpp
 * Takes the union of kmer counts generated by dsk
 *
 */


//
// Consider transforming bases to a more compact representation
//
// Just use sample index vs. use full sample name
//
// TODO read dsk output directly

#include "combineKmers.hpp"

int main (int argc, char *argv[])
{
   // Program description
   std::cerr << "combineKmers: basic algorithm to combine kmer counts across samples\n";

   // Do parsing and checking of command line params
   // If no input options, give quick usage rather than full help
   boost::program_options::variables_map vm;
   if (argc == 1)
   {
      std::cerr << "Usage: combineKmers -s samples.txt -o all_kmers --min_samples 2\n\n"
         << "For full option details run combineKmers -h\n";
      return 0;
   }
   else if (parseCommandLine(argc, argv, vm))
   {
      return 1;
   }

   // Read in list of sample kmer files and their names
   std::vector<std::tuple<std::string, std::string> > samples = readSamples(vm["samples"].as<std::string>());
   std::unordered_map<int, std::string> sample_names;
   size_t min_samples = checkMin(samples.size(), vm["min_samples"].as<int>());

   // Open the output file before counting kmers
   ogzstream out_file((vm["output"].as<std::string>() + ".gz").c_str());

   // Map to store kmers
   // TODO this would be neater if written with objects
   std::unordered_map<std::string, std::vector<std::tuple<int, int>>> kmer_union;

   // Add kmers to map
   std::cerr << "Reading and mapping kmers..." << std::endl;
   for (unsigned int i = 0; i < samples.size(); ++i)
   {
      std::string sample_name, filename;
      std::tie(sample_name, filename) = samples[i];

      // Rather than storing lots of copies of sample names as strings, just
      // store hash keys
      sample_names[i] = sample_name;

      std::ifstream kmer_counts(filename.c_str());

      if (kmer_counts)
      {
         std::cerr << "File " << i + 1 << "/" << samples.size() << "\r";
         std::cerr.flush();

         std::string kmer, abundance;
         while (kmer_counts)
         {
            kmer_counts >> kmer >> abundance;

            kmer_union[kmer].push_back(std::make_tuple(i, stoi(abundance)));
         }
      }
      else
      {
         std::cerr << "Could not open " + filename << std::endl;
         std::cerr << "Skipping..." << std::endl;
      }
   }
   std::cerr << std::endl;

   // Print results
   std::cerr << "Printing union of kmers" << std::endl;
   for(auto kmer_it = kmer_union.cbegin(); kmer_it != kmer_union.cend(); ++kmer_it)
   {
      if (kmer_it->second.size() >= min_samples)
      {
         out_file << kmer_it->first;
         for (auto sample_it = kmer_it->second.cbegin(); sample_it != kmer_it->second.cend(); ++sample_it)
         {
            int sample, abundance;
            std::tie (sample, abundance) = *sample_it;

            out_file << " " << sample_names[sample] + ":" + std::to_string(abundance);
         }
         out_file << std::endl;
      }
   }

   std::cerr << "Done." << std::endl;

   return(0);
}