[go: up one dir, main page]

File: seerIO.cpp

package info (click to toggle)
seer 1.1.4-8
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,692 kB
  • sloc: cpp: 2,945; perl: 596; python: 122; makefile: 92; sh: 43
file content (417 lines) | stat: -rw-r--r-- 11,759 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
/*
 * File: seerIO.cpp
 *
 * Controls: reading in pheno to sample object
 * Open (zipped) dsm file
 * Internal format conversion
 *
 * NB: some input and output used overridden operators in
 * <class>.cpp files
 *
 */
#include "seercommon.hpp"

std::regex gzipped(".+\\.gz"); // matches .gz at the end of a string (as regex match
                               // matches a whole string)
std::regex covar_regex("^(\\d+)$");
std::regex q_covar_regex("^(\\d+)q$");

/* .pheno files
 * space separated:
 * FID, IID, phenotype (1 control, 2 case)
 */
void readPheno(const std::string& filename, std::vector<Sample>& samples, std::unordered_map<std::string,int>& sample_map)
{
   std::ifstream ist(filename.c_str());

   if (!ist)
   {
      throw std::runtime_error("Could not open pheno file " + filename + "\n");
   }

   // Read in
   Sample s;
   while (ist >> s)
   {
      samples.push_back(s);

   }

   // Always keep samples sorted, for consistency between programs
   std::sort(samples.begin(), samples.end(), Sample::compareSamples);

   // Map of sample names to index in vector
   for (unsigned int sample_index = 0; sample_index < samples.size(); ++sample_index)
   {
      sample_map[samples[sample_index].iid()] = sample_index;
   }
}

// Open dsm files, which are possibly zipped
void openDsmFile(igzstream& dsm_stream, const std::string& file_name)
{
   // Check for a .gz extension
   if (!std::regex_match(file_name, gzipped))
   {
      // Warn
      std::cerr << "WARNING: Input file " + file_name
         + " is not gzip compressed, which is recommended\n";
   }

   dsm_stream.open(file_name.c_str()); // Push binary file into buffer

   // Set stream buffer of istream to the one just opened, and check ok
   if (!dsm_stream.good())
   {
      throw std::runtime_error("Could not open kmer file " + file_name + "\n");
   }

}

arma::vec constructVecY(const std::vector<Sample>& samples)
{
   arma::vec y;
   y.zeros(samples.size());

   for (unsigned int i = 0; i < samples.size(); ++i)
   {
      y[i] = samples[i].pheno();
   }

   return y;
}

// Also saves sample information
void writeMDS(const std::string& file_name, const std::vector<Sample>& sample_names, const arma::mat& MDS)
{
   if (MDS.n_rows == sample_names.size())
   {
#ifdef NO_HDF5
      MDS.save(file_name, arma::arma_ascii);
#else
      MDS.save(file_name, arma::hdf5_binary);
#endif

      std::string sample_file_name = file_name + sample_suffix;
      std::ofstream sample_names_file(sample_file_name.c_str());

      // Write out the sample name order
      if (!sample_names_file)
      {
         std::cerr << "Could not write used sample names to " << sample_file_name << std::endl;
      }
      else
      {
         for (auto sample_it = sample_names.begin(); sample_it != sample_names.end(); ++sample_it)
         {
            sample_names_file << sample_it->iid() << std::endl;
         }
      }
   }
   else
   {
      throw std::logic_error("When writing matrix to " + file_name + " the number samples in the .pheno file did not match the number of columns or rows");
   }

}

void writeDistances(const std::string& file_name, const arma::mat& distances)
{
   distances.save(file_name, arma::csv_ascii);
}

arma::mat readHDF5(const std::string& file_name)
{
   arma::mat HDF5;

   if (fileStat(file_name))
   {
      HDF5.load(file_name);
   }
   else
   {
      throw std::runtime_error("Problem with loading MDS input file " + file_name);
   }

   return HDF5;
}

arma::mat readMDS(const std::string& file_name, const std::vector<Sample>& sample_names)
{
   std::map<std::string,unsigned int> mds_idx;
   arma::mat MDS = readHDF5(file_name);

   // Check that the sample names match up
   std::string sample_name_file = file_name + sample_suffix;
   if (fileStat(sample_name_file))
   {
      std::ifstream samples_in(sample_name_file.c_str());
      if (samples_in)
      {
         arma::uvec keep_indices(sample_names.size());
         unsigned int file_row = 0;

         // Read in sample file to get MDS row order
         while (samples_in)
         {
            std::string sample_name;
            samples_in >> sample_name;
            if (samples_in)
            {
               mds_idx[sample_name] = file_row;
            }
            ++file_row;
         }

         // Get MDS rows (using sample file read above) in same sorted order as
         // sample vector
         unsigned int sample_row = 0;
         for (auto it = sample_names.begin(); it != sample_names.end(); ++it)
         {
            auto find_it = mds_idx.find(it->iid());
            if (find_it == mds_idx.end())
            {
               throw std::runtime_error("Could not find sample " + it->iid() + " in the pheno file");
            }
            else
            {
               keep_indices(sample_row) = find_it->second;
            }
            sample_row++;
         }

         // Only keep the rows where the pheno file has data
         if (mds_idx.size() >= sample_names.size())
         {
            MDS = MDS.rows(keep_indices);
         }
         else
         {
            throw std::runtime_error("Could not find all samples in input .pheno file in --struct mat. Was kmds run with all samples included?");
         }
      }
      else
      {
         std::cerr << "WARNING: Could not open sample file " + sample_name_file + ". Ensure kmds and seer -p inputs were identical" << std::endl;
      }
   }
   else
   {
      std::cerr << "WARNING: Could not find sample file " + sample_name_file + ". Ensure kmds and seer -p inputs were identical" << std::endl;
   }

   return MDS;
}

arma::mat readMDSList(const std::string& filename)
{
   std::ifstream ist(filename.c_str());
   if (!ist)
   {
      throw std::runtime_error("Could not open mds list file " + filename + "\n");
   }
   else
   {
      std::cerr << "Reading subsampled matrices from " + filename + "\n";
   }

   std::string matrix_file;
   arma::mat combined_matrix;
   int i = 0;
   while (ist >> matrix_file)
   {
      combined_matrix = join_rows(combined_matrix, readHDF5(matrix_file));

      std::cerr << "Joined matrix " << ++i << "\n";
   }

   return combined_matrix;
}

// Opens a text file containing covariates, and reads the specified columns
// into a matrix. Categorical covariates use dummy coding
arma::mat parseCovars(const std::string& file, const std::string& columns)
{
   // Check file exists
   if (!fileStat(file))
   {
      throw std::runtime_error("Covariate file " + file + " not found");
   }

   // First parse columns required
   std::vector<std::tuple<int,bool>> column_list = parseCovarColumns(columns);

   // Open file, and read in required columns
   std::vector<Covar> covars_in(column_list.size()); // Vector of covariates (each of which is internally a vector)

   std::ifstream ist(file.c_str());
   if (ist)
   {
      std::string line_buf;
      while (std::getline(ist, line_buf)) // Read a line in
      {
         std::string sample_name;
         std::stringstream line_stream(line_buf);

         // Read in sample name for the line
         line_stream >> sample_name;

         auto col_it = column_list.begin();
         size_t row = 0;
         while (col_it != column_list.end()) // Go through all required columns
         {
            std::string col_buf; // Work out how many fields to read
            int skip_cols = std::get<0>(*col_it);
            if (col_it != column_list.begin())
            {
               skip_cols -= std::get<0>(*(col_it-1));
            }
            else
            {
               skip_cols -= 1; // Sample name already read
            }

            for (int i = 0; i<skip_cols; ++i)
            {
               line_stream >> col_buf;
            }

            covars_in[row].addCovarValue(sample_name, col_buf);
            ++col_it; // Move to next column
            ++row;
         }
      }
   }
   else
   {
      throw std::runtime_error("Could not open covariate file" + file);
   }

   // Encode categorical columns. Convert strings to doubles
   // All in arma::mat
   arma::mat covar_matrix;
   auto mat_col = covars_in.begin();
   for (auto col_it = column_list.begin(); col_it != column_list.end(); ++col_it)
   {
      // When converting, first sort on sample name so in the same order as
      // pheno vector that will be merged with
      arma::mat new_col;
      if (std::get<1>(*col_it))
      {
         // Categorical covars need dummy encoding
         new_col = encodeDummy(mat_col->sortCovars());
      }
      else
      {
         // Quantitative covars need normalising
         new_col = vecToMat(mat_col->sortCovars());
         normaliseMatCols(new_col);
      }

      // Add the row in
      if (covar_matrix.n_cols > 0)
      {
         covar_matrix = arma::join_rows(covar_matrix, new_col);
      }
      else
      {
         covar_matrix = new_col;
      }
      ++mat_col;
   }

   return covar_matrix;
}

std::vector<std::tuple<int,bool>> parseCovarColumns(const std::string& columns)
{
   std::vector<std::tuple<int,bool>> column_list; // A vector of column numbers, and whether they
                                                  // are categorical and need
                                                  // dummy encoding
   std::string col_buf;
   std::stringstream ss(columns);
   while(std::getline(ss, col_buf, ','))
   {
      if (std::regex_match(col_buf, q_covar_regex))
      {
         auto new_col = std::make_tuple(stoi(std::regex_replace(col_buf, q_covar_regex, "$1")), 0);
         column_list.push_back(new_col);
      }
      else if (std::regex_match(col_buf, covar_regex))
      {
         auto new_col = std::make_tuple(stoi(std::regex_replace(col_buf, covar_regex, "$1")), 1);
         column_list.push_back(new_col);
      }
      else
      {
         std::cerr << "Warning: couldn't parse covariate column " + col_buf << std::endl;
      }
   }

   if (column_list.size() == 0)
   {
      throw std::runtime_error("No covariate columns selected, but file provided");
   }

   // Sort by column number. Use a lambda function
   std::sort(column_list.begin(), column_list.end(),
         [](const std::tuple<int,bool>& a, const std::tuple<int,bool>& b)
         {
            return std::get<0>(a) < std::get<0>(b);
         });

   return column_list;
}

// Converts categorical variates into columns using dummy encoding
arma::mat encodeDummy(const std::vector<std::string>& in_col)
{
   // Find the possible categories, and put them into an associative array (unordered
   // map)
   std::vector<std::string> col_copy = in_col;
   std::sort(col_copy.begin(), col_copy.end());
   auto unique_it = std::unique(col_copy.begin(), col_copy.end());

   std::unordered_map<std::string, int> dummy_map;
   unsigned int dummy_col = 0;
   for (auto cat_it = col_copy.begin(); cat_it != unique_it; ++cat_it)
   {
      dummy_map[*cat_it] = dummy_col++;
   }

   // For each variable, find the category and code this column as 1
   // There will be n-1 categories, as the control is coded as all zeros
   arma::mat out_mat(in_col.size(), dummy_map.size() - 1);
   for (unsigned int i = 0; i < in_col.size(); ++i)
   {
      unsigned int true_col = dummy_map[in_col[i]];
      for (unsigned int j = 0; j < dummy_map.size() - 1; ++j)
      {
         int out_val = 0;
         if (true_col > 0 && true_col - 1 == j)
         {
            out_val = 1;
         }
         out_mat(i, j) = out_val;
      }
   }

   return out_mat;
}

// Check for file existence
int fileStat(const std::string& filename)
{
   struct stat buffer;
   int success = 1;

   if (stat (filename.c_str(), &buffer) != 0)
   {
      std::cerr << "Can't stat input file: " << filename << "\n";

      success = 0;
   }

   return success;
}