[go: up one dir, main page]

Menu

[9bd658]: / src / Common / delta.hh  Maximize  Restore  History

Download this file

619 lines (509 with data), 18.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
////////////////////////////////////////////////////////////////////////////////
//! \file
//! \author Adam M Phillippy
//! \date 03/26/2003
//!
//! \brief Class declarations for the handling of delta alignment files
//!
//! \see delta.cc
////////////////////////////////////////////////////////////////////////////////
#ifndef __DELTA_HH
#define __DELTA_HH
#include <map>
#include <cassert>
#include <string>
#include <vector>
#include <fstream>
#include <cstdlib>
#include <iostream>
const std::string NUCMER_STRING = "NUCMER"; //!< nucmer id string
const std::string PROMER_STRING = "PROMER"; //!< promer id string
typedef char AlignmentType_t; //!< type of alignment data
const AlignmentType_t NULL_DATA = 0; //!< unknown alignment data type
const AlignmentType_t NUCMER_DATA = 'N'; //!< nucmer alignment data
const AlignmentType_t PROMER_DATA = 'P'; //!< promer alignment data
typedef unsigned char Dir_t; //!< directional type
const Dir_t FORWARD_DIR = 0; //!< forward direction
const Dir_t REVERSE_DIR = 1; //!< reverse direction
//===================================================== DeltaAlignment_t =======
struct DeltaAlignment_t
//!< A single delta encoded alignment region
{
unsigned long int sR; //!< start coordinate in the reference
unsigned long int eR; //!< end coordinate in the reference
unsigned long int sQ; //!< start coordinate in the reference
unsigned long int eQ; //!< end coordinate in the reference
unsigned long int idyc; //!< number of mismatches in the alignment
unsigned long int simc; //!< number of similarity scores < 1 in the alignment
unsigned long int stpc; //!< number of stop codons in the alignment
float idy; //!< percent identity [0 - 100]
float sim; //!< percent similarity [0 - 100]
float stp; //!< percent stop codon [0 - 100]
std::vector<long int> deltas; //!< delta encoded alignment informaiton
DeltaAlignment_t ( )
{
clear ( );
}
void clear ( )
{
sR = eR = sQ = eQ = 0;
idy = sim = stp = 0;
deltas.clear ( );
}
};
//===================================================== DeltaRecord_t ==========
struct DeltaRecord_t
//!< A delta record representing the alignments between two sequences
{
std::string idR; //!< reference contig ID
std::string idQ; //!< query contig ID
unsigned long int lenR; //!< length of the reference contig
unsigned long int lenQ; //!< length of the query contig
std::vector<DeltaAlignment_t> aligns; //!< alignments between the two seqs
DeltaRecord_t ( )
{
clear ( );
}
void clear ( )
{
idR.erase ( );
idQ.erase ( );
lenR = lenQ = 0;
aligns.clear ( );
}
};
//===================================================== DeltaReader_t ==========
//! \brief Delta encoded file reader class
//!
//! Handles the input of delta encoded alignment information for various MUMmer
//! utilities. Very basic functionality, can be expanded as necessary...
//!
//! \see DeltaRecord_t
//==============================================================================
class DeltaReader_t {
private:
std::string delta_path_m; //!< the name of the delta input file
std::ifstream delta_stream_m; //!< the delta file input stream
std::string data_type_m; //!< the type of alignment data
std::string reference_path_m; //!< the name of the reference file
std::string query_path_m; //!< the name of the query file
DeltaRecord_t record_m; //!< the current delta information record
bool is_record_m; //!< there is a valid record in record_m
bool is_open_m; //!< delta stream is open
//--------------------------------------------------- readNextAlignment ------
//! \brief Reads in a delta encoded alignment
//!
//! \param align read info into this structure
//! \param read_deltas read delta information yes/no
//! \pre delta_stream is positioned at the beginning of an alignment header
//! \return void
//!
void readNextAlignment (DeltaAlignment_t & align, const bool read_deltas);
//--------------------------------------------------- readNextRecord ---------
//! \brief Reads in the next delta record from the delta file
//!
//! \param read_deltas read delta information yes/no
//! \pre delta file must be open
//! \return true on success, false on EOF
//!
bool readNextRecord (const bool read_deltas);
//--------------------------------------------------- checkStream ------------
//! \brief Check stream status and abort program if an error has occured
//!
//! \return void
//!
void checkStream ( )
{
if ( !delta_stream_m.good ( ) )
{
std::cerr << "ERROR: Could not parse delta file, "
<< delta_path_m << std::endl;
exit (-1);
}
}
public:
//--------------------------------------------------- DeltaReader_t ----------
//! \brief Default constructor
//!
//! \return void
//!
DeltaReader_t ( )
{
is_record_m = false;
is_open_m = false;
}
//--------------------------------------------------- ~DeltaReader_t ---------
//! \brief Default destructor
//!
//! \return void
//!
~DeltaReader_t ( )
{
close ( );
}
//--------------------------------------------------- open -------------------
//! \brief Opens a delta file by path
//!
//! \param delta file to open
//! \return void
//!
void open (const std::string & delta_path);
//--------------------------------------------------- close ------------------
//! \brief Closes any open delta file and resets class members
//!
//! \return void
//!
void close ( )
{
delta_path_m.erase ( );
delta_stream_m.close ( );
data_type_m.erase ( );
reference_path_m.erase ( );
query_path_m.erase ( );
record_m.clear ( );
is_record_m = false;
is_open_m = false;
}
//--------------------------------------------------- readNext --------------
//! \brief Reads in the next delta record from the delta file
//!
//! \param read_deltas read delta information yes/no
//! \pre delta file must be open
//! \return true on success, false on EOF
//!
inline bool readNext (bool getdeltas = true)
{
return readNextRecord (getdeltas);
}
//--------------------------------------------------- readNextHeadersOnly ----
//! \brief Reads in the next delta record from the delta file
//!
//! Only reads the alignment header information, does not read in the delta
//! information and leaves each alignment structure's delta vector empty.
//!
//! \pre delta file must be open
//! \return true on success, false on EOF
//!
inline bool readNextHeadersOnly ( )
{
return readNextRecord (false);
}
//--------------------------------------------------- getRecord --------------
//! \brief Returns a reference to the current delta record
//!
//! \pre readNext( ) was successfully called in advance
//! \return true on success, false on failure or end of file
//!
const DeltaRecord_t & getRecord ( ) const
{
assert (is_record_m);
return record_m;
}
//--------------------------------------------------- getDeltaPath -----------
//! \brief Get the path of the current delta file
//!
//! \pre delta file is open
//! \return the path of the delta file
//!
const std::string & getDeltaPath ( ) const
{
assert (is_open_m);
return delta_path_m;
}
//--------------------------------------------------- getDataType ------------
//! \brief Get the data type of the current delta file
//!
//! \pre delta file is open
//! \return the data type of the current file
//!
const std::string & getDataType ( ) const
{
assert (is_open_m);
return data_type_m;
}
//--------------------------------------------------- getReferencePath -------
//! \brief Get the path of the MUMmer reference file
//!
//! \pre delta file is open
//! \return the path of the MUMmer reference file
//!
const std::string & getReferencePath ( ) const
{
assert (is_open_m);
return reference_path_m;
}
//--------------------------------------------------- getQueryPath -----------
//! \brief Get the path of the MUMmer query file
//!
//! \pre delta file is open
//! \return the path of the MUMmer query file
//!
const std::string & getQueryPath ( ) const
{
assert (is_open_m);
return query_path_m;
}
};
//===================================================== SNP_t ==================
struct DeltaEdgelet_t;
struct DeltaEdge_t;
struct DeltaNode_t;
struct SNP_t
//!< A single nuc/aa poly
{
long int buff;
char cQ, cR;
long int pQ, pR;
int conQ, conR;
std::string ctxQ, ctxR;
DeltaEdgelet_t * lp;
DeltaEdge_t * ep;
SNP_t ( )
{
cQ = cR = 0;
buff = pQ = pR = 0;
conQ = conR = 0;
};
};
//===================================================== DeltaEdgelet_t =========
struct DeltaEdgelet_t
//!< A piece of a delta graph edge, a single alignment
{
unsigned char isGOOD : 1; //!< meets the requirements
unsigned char isQLIS : 1; //!< is part of the query's LIS
unsigned char isRLIS : 1; //!< is part of the reference's LIS
unsigned char isGLIS : 1; //!< is part of the reference/query LIS
unsigned char dirR : 1; //!< reference match direction
unsigned char dirQ : 1; //!< query match direction
DeltaEdge_t * edge;
DeltaEdgelet_t * next; //!< stores chaining info
float idy, sim, stp; //!< percent identity [0 - 1]
unsigned long int idyc, simc, stpc; //!< idy, sim, stp counts
unsigned long int loQ, hiQ, loR, hiR; //!< alignment bounds
int frmQ, frmR; //!< reading frame
std::string delta; //!< delta information
std::vector<SNP_t *> snps; //!< snps for this edgelet
DeltaEdgelet_t ( )
{
next = NULL;
isGOOD = true;
isQLIS = isRLIS = isGLIS = false;
dirR = dirQ = FORWARD_DIR;
idy = sim = stp = 0;
idyc = simc = stpc = 0;
loQ = hiQ = loR = hiR = 0;
frmQ = frmR = 1;
}
~DeltaEdgelet_t ( )
{
std::vector<SNP_t *>::iterator i;
for ( i = snps . begin( ); i != snps . end( ); ++ i )
delete (*i);
}
};
//===================================================== DeltaEdge_t ============
struct DeltaEdge_t
//!< A delta graph edge, alignments between a single reference and query
{
DeltaNode_t * refnode; //!< the adjacent reference node
DeltaNode_t * qrynode; //!< the adjacent query node
std::vector<DeltaEdgelet_t *> edgelets; //!< the set of individual alignments
std::vector<DeltaEdgelet_t *> chains; //!< stores GLIS chains
DeltaEdge_t ( )
{
refnode = qrynode = NULL;
}
~DeltaEdge_t ( )
{
std::vector<DeltaEdgelet_t *>::iterator i;
for ( i = edgelets . begin( ); i != edgelets . end( ); ++ i )
delete (*i);
}
void build (const DeltaRecord_t & rec);
};
//===================================================== DeltaNode_t ============
struct DeltaNode_t
//!< A delta graph node, contains the sequence information
{
const std::string * id; //!< the id of the sequence
char * seq; //!< the DNA sequence
unsigned long int len; //!< the length of the sequence
std::vector<DeltaEdge_t *> edges; //!< the set of related edges
std::vector<DeltaEdgelet_t *> chains; //!< stores RLIS/QLIS chains
DeltaNode_t ( )
{
id = NULL;
seq = NULL;
len = 0;
}
~DeltaNode_t ( )
{
free (seq);
// DeltaGraph_t will take care of destructing the edges
}
};
//===================================================== DeltaGraph_t ===========
//! \brief A graph of sequences (nodes) and their alignments (edges)
//!
//! A bipartite graph with two partite sets, R and Q, where R is the set of
//! reference sequences and Q is the set of query sequences. These nodes are
//! named "DeltaNode_t". We connect a node in R to a node in Q if an alignment
//! is present between the two sequences. The group of all alignments between
//! the two is named "DeltaEdge_t" and a single alignment between the two is
//! named a "DeltaEdgelet_t". Alignment coordinates reference the forward
//! strand and are stored lo before hi.
//!
//==============================================================================
class DeltaGraph_t
{
public:
std::map<std::string, DeltaNode_t> refnodes;
//!< the reference graph nodes, 1 per aligned sequence
std::map<std::string, DeltaNode_t> qrynodes;
//!< the query graph nodes, 1 per aligned sequence
std::string refpath; //!< path of the reference FastA file
std::string qrypath; //!< path of the query FastA file
AlignmentType_t datatype; //!< alignment data type
DeltaGraph_t ( )
{
datatype = NULL_DATA;
}
~DeltaGraph_t ( )
{
clear( );
}
//--------------------------------------------------- build ------------------
//! \brief Build a new graph from a deltafile
//!
//! Does not populate:
//! node->seq
//! edgelet->frmR/frmQ
//! edgelet->snps
//!
//! \param deltapath The path of the deltafile to read
//! \param getdeltas Read the delta-encoded gap positions? yes/no
//! \return void
//!
void build (const std::string & deltapath, bool getdeltas = true);
//--------------------------------------------------- clean ------------------
//! \brief Clean the graph of all edgelets where isGOOD = false
//!
//! Removes all edgelets from the graph where isGOOD = false. Afterwhich, all
//! now empty edges or nodes are also removed.
//!
//! \return void
//!
void clean ( );
//--------------------------------------------------- clear ------------------
//! \brief Clear the graph of all nodes, edges and edgelets
//!
//! \return void
//!
void clear ( )
{
//-- Make sure the edges only get destructed once
std::map<std::string, DeltaNode_t>::iterator i;
std::vector<DeltaEdge_t *>::iterator j;
for ( i = refnodes . begin( ); i != refnodes . end( ); ++ i )
for ( j = i -> second . edges . begin( );
j != i -> second . edges . end( ); ++ j )
delete (*j);
refnodes.clear( );
qrynodes.clear( );
refpath.erase( );
qrypath.erase( );
datatype = NULL_DATA;
}
//--------------------------------------------------- getNodeCount -----------
//! \brief Counts and returns the number of graph nodes (sequences)
//!
//! \return The number of graph nodes
//!
long int getNodeCount ( );
//--------------------------------------------------- getEdgeCount -----------
//! \brief Counts and returns the number of graph edges
//!
//! \return void
//!
long int getEdgeCount ( );
//--------------------------------------------------- getEdgeletCount --------
//! \brief Counts and returns the number of graph edgelets (alignments)
//!
//! \return void
//!
long int getEdgeletCount ( );
//--------------------------------------------------- flagGLIS ---------------
//! \brief Flag edgelets in the global LIS and unflag those who are not
//!
//! Runs a length*identity weigthed LIS to determine the longest, mutually
//! consistent set of alignments between all pairs of sequences. This
//! essentially constructs the global alignment between all sequence pairs.
//!
//! Sets isGLIS flag for good and unsets isGOOD flag for bad.
//!
//! \param epsilon Keep repeat alignments within epsilon % of the best align
//! \return void
//!
void flagGLIS (float epsilon = -1);
//--------------------------------------------------- flagQLIS ---------------
//! \brief Flag edgelets in the query LIS and unflag those who are not
//!
//! Runs a length*identity weighted LIS to determine the longest, QUERY
//! consistent set of alignments for all query sequences. This effectively
//! identifies the "best" alignments for all positions of each query.
//!
//! Sets isQLIS flag for good and unsets isGOOD flag for bad.
//!
//! \param epsilon Keep repeat alignments within epsilon % of the best align
//! \param maxolap Only allow alignments to overlap by maxolap percent [0-100]
//! \return void
//!
void flagQLIS (float epsilon = -1, float maxolap = 100.0);
//--------------------------------------------------- flagRLIS ---------------
//! \brief Flag edgelets in the reference LIS and unflag those who are not
//!
//! Runs a length*identity weighted LIS to determine the longest, REFERENCE
//! consistent set of alignments for all reference sequences. This effectively
//! identifies the "best" alignments for all positions of each reference.
//!
//! Sets isRLIS flag for good and unsets isGOOD flag for bad.
//!
//! \param epsilon Keep repeat alignments within epsilon % of the best align
//! \param maxolap Only allow alignments to overlap by maxolap percent [0-100]
//! \return void
//!
void flagRLIS (float epsilon = -1, float maxolap = 100.0);
//--------------------------------------------------- flagScore --------------
//! \brief Flag edgelets with scores below a score threshold
//!
//! Unsets isGOOD for bad.
//!
//! \param minlen Flag edgelets if less than minlen in length
//! \param minidy Flag edgelets if less than minidy identity [0-100]
//! \return void
//!
void flagScore (long int minlen, float minidy);
//--------------------------------------------------- flagUNIQ ---------------
//! \brief Flag edgelets with uniqueness below a certain threshold
//!
//! Unsets isGOOD for bad.
//!
//! \param minuniq Flag edgelets if less that minuniq percent [0-100] unique
//! \return void
//!
void flagUNIQ (float minuniq);
//--------------------------------------------------- loadSequences ----------
//! \brief Load the sequence information into the DeltaNodes
//!
//! \return void
//!
void loadSequences ( );
//--------------------------------------------------- outputDelta ------------
//! \brief Outputs the contents of the graph as a deltafile
//!
//! \param out The output stream to write to
//! \return The output stream
//!
std::ostream & outputDelta (std::ostream & out);
};
#endif // #ifndef __DELTA_HH