[go: up one dir, main page]

File: seerCommon.cpp

package info (click to toggle)
seer 1.1.2-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 3,616 kB
  • ctags: 342
  • sloc: cpp: 2,853; perl: 503; python: 122; makefile: 85
file content (239 lines) | stat: -rw-r--r-- 5,429 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
/*
 * File: seerCommon.cpp
 *
 * Functions common to seer and kmds
 * Program options parsing
 *
 */

#include "seercommon.hpp"

// Parse command line parameters into usable program parameters
cmdOptions verifyCommandLine(boost::program_options::variables_map& vm, const std::vector<Sample>& samples)
{
   cmdOptions verified;

   verified.max_length = vm["max_length"].as<long int>();

   if(vm.count("kmers"))
   {
      verified.kmers = vm["kmers"].as<std::string>();
   }

   if(vm.count("output"))
   {
      verified.output = vm["output"].as<std::string>();
   }

   if(vm.count("chisq"))
   {
      verified.chi_cutoff = stod(vm["chisq"].as<std::string>());
   }

   if (vm.count("pval"))
   {
      verified.log_cutoff = stod(vm["pval"].as<std::string>());
   }

   // Verify MDS options in a separate function
   // This is pc, size and number of threads
   verifyMDSOptions(verified, vm);

   verified.filter = 1;
   if (vm.count("no_filtering"))
   {
      verified.filter = 0;
   }
   else
   {
      // Error check filtering options
      verified.min_words = 0;
      if (vm.count("min_words"))
      {
         int min_words_in = vm["min_words"].as<int>();
         if (min_words_in >= 0)
         {
            verified.min_words = min_words_in;
         }
         else
         {
            badCommand("min_words", std::to_string(min_words_in));
         }
      }
      else
      {
         double maf_in = vm["maf"].as<double>();
         if (maf_in >= 0)
         {
            verified.min_words = static_cast<unsigned int>(samples.size() * maf_in);
         }
         else
         {
            badCommand("maf", std::to_string(maf_in));
         }
      }

      if (verified.min_words > samples.size())
      {
         badCommand("min_words/maf", std::to_string(verified.min_words));
      }
      verified.max_words = samples.size() - verified.min_words;
   }

   verified.print_samples = 0;
   if (vm.count("print_samples"))
   {
      verified.print_samples = 1;
   }

   return verified;
}

// Check these options in a separate function, which is also usable by kmds
// in mds_concat mode
void verifyMDSOptions(cmdOptions& verified, boost::program_options::variables_map& vm)
{
   // Number of threads is also needed by both
   if (vm.count("threads") && vm["threads"].as<int>() > 0)
   {
      verified.num_threads = vm["threads"].as<int>();
   }
   else
   {
      verified.num_threads = 1;
   }

   if (vm.count("pc"))
   {
      if (vm["pc"].as<int>() > 0)
      {
         verified.pc = vm["pc"].as<int>();
      }
      else
      {
         badCommand("pc", std::to_string(vm["pc"].as<int>()));
      }
   }

   if (vm.count("size"))
   {
      if (vm["size"].as<long int>() > 0)
      {
         verified.size = vm["size"].as<long int>();
      }
      else
      {
         badCommand("size", std::to_string(vm["size"].as<long int>()));
      }
   }

   if (vm.count("write_distances"))
   {
      verified.write_distances = 1;
   }
   else
   {
      verified.write_distances = 0;
   }
}

// Check for continuous phenotype. If even one sample has neither 0 or 1 as
// phenotype
int continuousPhenotype (const std::vector<Sample>& sample_list)
{
   int cont_pheno = 0;
   for (std::vector<Sample>::const_iterator it = sample_list.begin(); it != sample_list.end(); ++it)
   {
      if (it->continuous())
      {
         cont_pheno = 1;
         break;
      }
   }

   // Write inferred output to terminal
   if (cont_pheno)
   {
      std::cerr << "Detected continuous phenotype\n";
   }
   else
   {
      std::cerr << "Detected binary phenotype\n";
   }

   return cont_pheno;
}

// Conversion functions required as code is a mix of dlib and armadillo
// matrices
// This could obviously be improved...
arma::vec dlib_to_arma(const column_vector& dlib_vec)
{
   arma::vec converted(dlib_vec.nr());

   for (unsigned int i = 0; i < dlib_vec.nr(); ++i)
   {
      converted(i) = dlib_vec(i);
   }

   return converted;
}

column_vector arma_to_dlib(const arma::vec& arma_vec)
{
   column_vector converted;
   converted.set_size(arma_vec.n_elem);

   for (unsigned int i = 0; i < arma_vec.n_elem; ++i)
   {
      converted(i) = arma_vec(i);
   }

   return converted;
}

// Converts a stl vector of strings to an arma mat of doubles
arma::mat vecToMat(const std::vector<std::string>& in_col)
{
   arma::mat out_mat(in_col.size(), 1);

   for (unsigned int i = 0; i < in_col.size(); ++i)
   {
      out_mat(i, 0) = std::stof(in_col[i]);
   }

   return out_mat;
}

// Normalises a matrix's columns: subtract mean, divide by std dev
void normaliseMatCols(arma::mat& matrix_in)
{
   arma::mat means = arma::mean(matrix_in);
   arma::mat stddevs = arma::stddev(matrix_in);

   matrix_in.each_row() -= means;
   matrix_in.each_row() /= stddevs;
}

// Inverts a symmetric positive matrix, checking for errors
// Not passed by ref, creates a copy. Right thing to do?
arma::mat inv_covar(arma::mat A)
{
   // Try the default. Internally this uses Cholesky decomposition and back
   // solves. For large condition numbers it fails.
   arma::mat B;
   if (!inv_sympd(B, A))
   {
      // If the Cholesky decomposition fails, try pseudo-inverse
      // This uses SVD:
      // A = U*S*V.t() => A^-1 = V*S^-1*U.t()
      // and ignores small values in the S matrix
      if (!arma::pinv(B, A))
      {
         std::cerr << "A matrix inversion failed!" << std::endl;
      }
   }

   return B;
}