[go: up one dir, main page]

File: Allele.h

package info (click to toggle)
freebayes 1.3.9-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 6,984 kB
  • sloc: cpp: 125,778; ansic: 4,581; sh: 1,084; python: 672; asm: 271; javascript: 94; lisp: 85; makefile: 37; perl: 27
file content (396 lines) | stat: -rw-r--r-- 15,646 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
#ifndef FREEBAYES_ALLELE_H
#define FREEBAYES_ALLELE_H

#include <cstdio>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <list>
#include <map>
#include <limits>
#include <sstream>
#include <assert.h>
#include "Utility.h"
#include <smithwaterman/convert.h>

//#ifdef HAVE_BAMTOOLS
//#include "api/BamAlignment.h"
//using namespace BamTools;
//#endif

using namespace std;

class Allele;

// Allele recycling allocator
// without we spend 30% of our runtime deleting Allele instances

class AlleleFreeList {

public:
    AlleleFreeList()
        : _p(NULL)
        , _size(0)
        , _allocs(0)
        , _min_size(0)
        , _max_size(0)
        , _tick_allocs(1000000)  // attempt realloc every million allocs
    { }

    ~AlleleFreeList();
    void Purge();
    void Resize(int new_size);
    void* NewAllele();
    void Recycle(void* mem);

private:
    Allele* _p;
    int _size;    // number of alleles on list
    int _allocs;  // allocation counter
    int _min_size; // min size within some previous number of calls to new
    int _max_size;
    int _tick_allocs; // GC cycle length

};


// a structure describing an allele

enum AlleleType {
    ALLELE_GENOTYPE = 1,
    ALLELE_REFERENCE = 2,
    ALLELE_MNP = 4,
    ALLELE_SNP = 8,
    ALLELE_INSERTION = 16,
    ALLELE_DELETION = 32,
    ALLELE_COMPLEX = 64,
    ALLELE_NULL = 128
};

// used in making allele type filter vectors
//const int numberOfPossibleAlleleTypes = 7;

enum AlleleStrand {
    STRAND_FORWARD,
    STRAND_REVERSE
};

typedef long int Position;

class Allele {

    friend class AlleleFreeList;

    friend string stringForAllele(const Allele &a);
    friend string stringForAlleles(vector<Allele> &av);

    friend bool operator<(const Allele &a, const Allele &b);
    friend bool operator==(const Allele &a, const Allele &b);
    friend bool operator!=(const Allele &a, const Allele &b);

    friend ostream &operator<<(ostream &out, vector<Allele> &a);
    friend ostream &operator<<(ostream &out, vector<Allele*> &a);
    friend ostream &operator<<(ostream &out, list<Allele*> &a);

    friend ostream &operator<<(ostream &out, Allele &a);
    friend ostream &operator<<(ostream &out, Allele* &a);

    friend string tojson(vector<Allele*> &alleles, long int &position);
    friend string tojson(vector<Allele*> &alleles);
    friend string tojson(Allele &allele, long int &position);
    friend string tojson(Allele* &allele);

public:

    AlleleType type;        // type of the allele, enumerated above
    string referenceName;   // reference name, for sanity checking
    string referenceSequence; // reference sequence or "" (in case of insertions)
    string alternateSequence; // alternate sequence or "" (in case of deletions and reference alleles)
    string sequencingTechnology; // the technology used to generate this allele
    long int position;      // position 0-based against reference
    long int* currentReferencePosition; // pointer to the current reference position (which may be updated during the life of this allele)
    char* currentReferenceBase;  // pointer to current reference base
    unsigned int length;    // and event length (deletion implies 0, snp implies 1, insertion >1)
    unsigned int referenceLength; // length of the event relative to the reference
    long int repeatRightBoundary;  // if this allele is an indel, and if it is embedded in a tandem repeat 
    // TODO cleanup
    int basesLeft;  // these are the "updated" versions of the above
    int basesRight;
    AlleleStrand strand;          // strand, true = +, false = -
    string sampleID;        // representative sample ID
    string readGroupID;     // read group membership
    string readID;          // id of the read which the allele is drawn from
    vector<short> baseQualities;
    long double quality;          // base quality score associated with this allele, updated every position in the case of reference alleles
    long double lnquality;  // log version of above
    string currentBase;       // current base, meant to be updated every position
    short mapQuality;       // map quality for the originating read
    long double lnmapQuality;       // map quality for the originating read
    double readMismatchRate; // per-base mismatch rate for the read
    double readIndelRate;  // only considering gaps
    double readSNPRate;    // only considering snps/mnps
    bool isProperPair;    // if the allele is supported by a properly paired read
    bool isPaired;  // if the allele is supported by a read that is part of a pair
    bool isMateMapped;  // if the mate in the pair is mapped
    bool genotypeAllele;    // if this is an abstract 'genotype' allele
    bool processed; // flag to mark if we've presented this allele for analysis
    string cigar; // a cigar representation of the allele
    vector<Allele>* alignmentAlleles;
    long int alignmentStart;
    long int alignmentEnd;

    // default constructor, for converting alignments into allele observations
    Allele(AlleleType t, 
           string& refname,
           long int pos, 
           long int* crefpos,
           char* crefbase,
           unsigned int len,
           long int rrbound,
           int bleft,
           int bright,
           string alt,
           string& sampleid,
           string& readid,
           string& readgroupid,
           string& sqtech,
           bool strnd, 
           long double qual,
           string qstr, 
           short mapqual,
           bool ispair,
           bool ismm,
           bool isproppair,
           string cigarstr,
           vector<Allele>* ra,
           long int bas,
           long int bae)
        : type(t)
        , referenceName(refname)
        , position(pos)
        , currentReferencePosition(crefpos)
        , currentReferenceBase(crefbase)
        , length(len)
        , repeatRightBoundary(rrbound)
        , basesLeft(bleft)
        , basesRight(bright)
        , currentBase(alt)
        , alternateSequence(alt)
        , sampleID(sampleid)
        , readID(readid)
        , readGroupID(readgroupid)
        , sequencingTechnology(sqtech)
        , strand(strnd ? STRAND_FORWARD : STRAND_REVERSE)
        , quality((qual == -1) ? averageQuality(qstr) : qual) // passing -1 as quality triggers this calculation
        , lnquality(phred2ln((qual == -1) ? averageQuality(qstr) : qual))
        , mapQuality(mapqual) 
        , lnmapQuality(phred2ln(mapqual))
        , isProperPair(isproppair)
        , isPaired(ispair)
        , isMateMapped(ismm)
        , genotypeAllele(false)
        , processed(false)
        , readMismatchRate(0)
        , readIndelRate(0)
        , readSNPRate(0)
        , cigar(cigarstr)
        , alignmentAlleles(ra)
        , alignmentStart(bas)
        , alignmentEnd(bae)
    {

        baseQualities.resize(qstr.size()); // cache qualities
        transform(qstr.begin(), qstr.end(), baseQualities.begin(), qualityChar2ShortInt);
        referenceLength = referenceLengthFromCigar();

    }

    // for constructing genotype alleles
    Allele(AlleleType t,
	   string alt,
	   unsigned int len,
	   unsigned int reflen,
	   string cigarStr,
	   long int pos=0,
	   long int rrbound=0,
	   bool gallele=true) 
        : type(t)
        , alternateSequence(alt)
        , length(len)
        , referenceLength(reflen)
        , repeatRightBoundary(rrbound)
        , quality(0)
        , lnquality(1)
        , position(pos)
        , genotypeAllele(true)
        , readMismatchRate(0)
        , readIndelRate(0)
        , readSNPRate(0)
        , cigar(cigarStr)
        , alignmentAlleles(NULL)
        , processed(false)
    {
        currentBase = base();
        baseQualities.assign(alternateSequence.size(), 0);
        referenceLength = referenceLengthFromCigar();
    }

    bool equivalent(Allele &a);  // heuristic 'equivalency' between two alleles, which depends on their type
    string typeStr(void) const; // return a string representation of the allele type, for output
    bool isReference(void) const; // true if type == ALLELE_REFERENCE
    bool isSNP(void) const; // true if type == ALLELE_SNP
    bool isInsertion(void) const; // true if type == ALLELE_INSERTION
    bool isDeletion(void) const; // true if type == ALLELE_DELETION
    bool isMNP(void) const; // true if type == ALLELE_MNP
    bool isComplex(void) const; // true if type == ALLELE_COMPLEX
    bool isNull(void) const; // true if type == ALLELE_NULL
    int referenceOffset(void) const;
    const short currentQuality(void) const;  // for getting the quality of a given position in multi-bp alleles
    const long double lncurrentQuality(void) const;
    const int subquality(int startpos, int len) const;
    const long double lnsubquality(int startpos, int len) const;
    const int subquality(const Allele &a) const;
    const long double lnsubquality(const Allele &a) const;
    //const int basesLeft(void) const; // returns the bases left within the read of the current position within the allele
    //const int basesRight(void) const; // returns the bases right within the read of the current position within the allele
    bool sameSample(Allele &other);  // if the other allele has the same sample as this one
    void update(int haplotypeLength = 1); // for reference alleles, updates currentBase and quality
    void setQuality(void); // sets 'current quality' for alleles
    // TODO update this to reflect different insertions (e.g. IATGC instead of I4)
    const string base(void) const;  // the 'current' base of the allele or a string describing the allele, e.g. I10 or D2
                                    //  this is used to update cached data in the allele prior to presenting the allele for analysis
                                    //  for the current base, just use allele.currentBase

    string tojson(void);
    unsigned int getLengthOnReference(void);
    int referenceLengthFromCigar(void);

    string readSeq(void);
    string read5p(void);
    string read3p(void);
    string read5pNonNull(void);
    string read3pNonNull(void);

    // the number of bases from the 5p edge of the allele until the end or the next null allele
    int read5pNonNullBases(void);
    // the number of bases from the 3p edge of the allele until the end or the next null allele
    int read3pNonNullBases(void);

    // wish list...
    //string readRefRelativeSubstr(long int start, long int end);
    //string readRefStartLenSubstr(long int start, int bp);

    vector<Allele*> extend(int pos, int haplotypeLength);
    void squash(void);
    void subtract(int subtractFromRefStart,
            int subtractFromRefEnd,
            string& substart,
            string& subend,
            vector<pair<int, string> >& cigarstart,
            vector<pair<int, string> >& cigarend,
            vector<short>& qsubstart,
            vector<short>& qsubend);

    void add(string& addToStart,
            string& addToEnd,
            vector<pair<int, string> >& cigarStart,
            vector<pair<int, string> >& cigarEnd,
            vector<short>& qaddToStart,
            vector<short>& qaddToEnd);


    void subtractFromStart(int bp, string& seq, vector<pair<int, string> >& cig, vector<short>& quals);
    void subtractFromEnd(int bp, string& seq, vector<pair<int, string> >& cig, vector<short>& quals);
    void addToStart(string& seq, vector<pair<int, string> >& cig, vector<short>& quals);
    void addToEnd(string& seq, vector<pair<int, string> >& cig, vector<short>& quals);

    void mergeAllele(const Allele& allele, AlleleType newType);

    void updateTypeAndLengthFromCigar(void);
    int bpLeft(void); // how many bases are in the read to the left of the allele
    int bpRight(void); // how many bases are in the read to the left of the allele


};

// for sorting pairs of alleles and ints
class AllelePairIntCompare {
public:
    bool operator()(const pair<Allele, int>& a, const pair<Allele, int>& b) {
        return a.second > b.second;
    }
};

class AllelePositionCompare {
public:
    bool operator()(const Allele& a, const Allele& b) {
        return a.position < b.position;
    }
};


void updateAllelesCachedData(vector<Allele*>& alleles);

map<string, vector<Allele*> > groupAllelesBySample(list<Allele*>& alleles);
void groupAllelesBySample(list<Allele*>& alleles, map<string, vector<Allele*> >& groups);

int allowedAlleleTypes(vector<AlleleType>& allowedEnumeratedTypes);
void filterAlleles(list<Allele*>& alleles, int allowedTypes);
int countAlleles(map<string, vector<Allele*> >& sampleGroups);
int baseCount(vector<Allele*>& alleles, string base, AlleleStrand strand);
pair<pair<int, int>, pair<int, int> >
baseCount(vector<Allele*>& alleles, string refbase, string altbase);
int countAllelesWithBase(vector<Allele*>& alleles, string base);

bool areHomozygous(vector<Allele*>& alleles);

map<Allele, int> countAlleles(vector<Allele*>& alleles);
map<string, int> countAllelesString(vector<Allele*>& alleles);
map<string, int> countAllelesString(vector<Allele>& alleles);
map<Allele, int> countAlleles(vector<Allele>& alleles);
map<Allele, int> countAlleles(list<Allele*>& alleles);

vector<Allele> uniqueAlleles(list<Allele*>& alleles);

bool allelesSameType(Allele* &a, Allele* &b);
bool allelesEquivalent(Allele* &a, Allele* &b);
bool allelesSameSample(Allele* &a, Allele* &b);
bool allelesSameType(Allele &a, Allele &b);
bool allelesEquivalent(Allele &a, Allele &b);
bool allelesSameSample(Allele &a, Allele &b);
bool allelesEqual(Allele &a, Allele &b);

void groupAlleles(map<string, vector<Allele*> >& sampleGroups, map<string, vector<Allele*> >& alleleGroups);
void homogenizeAlleles(map<string, vector<Allele*> >& alleleGroups, string& refseq, Allele& refallele);
void resetProcessedFlag(map<string, vector<Allele*> >& alleleGroups);

vector<Allele> alleleUnion(vector<Allele>& a1, vector<Allele>& a2);

// XXX cleanup
// is there a way to template these?  difficult as the syntax for pointer-based comparisons is different than non-pointer
vector<vector<Allele*> >  groupAlleles(list<Allele*> &alleles, bool (*fncompare)(Allele* &a, Allele* &b));
vector<vector<Allele*> >  groupAlleles(list<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b));
vector<vector<Allele*> > groupAlleles(vector<Allele*> &alleles, bool (*fncompare)(Allele &a, Allele &b));
vector<vector<Allele*> > groupAlleles(vector<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b));
vector<vector<Allele*> > groupAlleles(map<string, vector<Allele*> > &alleles, bool (*fncompare)(Allele &a, Allele &b));
vector<vector<Allele> > groupAlleles_copy(vector<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b));
vector<vector<Allele> > groupAlleles_copy(list<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b));
vector<vector<Allele> > groupAlleles_copy(vector<Allele> &alleles);
vector<Allele> genotypeAllelesFromAlleleGroups(vector<vector<Allele> > &groups);
vector<Allele> genotypeAllelesFromAlleleGroups(vector<vector<Allele*> > &groups);
vector<Allele> genotypeAllelesFromAlleles(vector<Allele> &alleles);
vector<Allele> genotypeAllelesFromAlleles(vector<Allele*> &alleles);
Allele genotypeAllele(Allele& a);
Allele genotypeAllele(AlleleType type, string alt = "", unsigned int length = 0, string cigar = "", unsigned int reflen = 0, long int position = 0, long int rrbound = 0);

bool isEmptyAllele(const Allele& allele);
bool isDividedIndel(const Allele& allele);
bool isEmptyAlleleOrIsDividedIndel(const Allele& allele);
bool isUnflankedIndel(const Allele& allele);

int referenceLengthFromCigar(string& cigar);

//AlleleFreeList Allele::_freeList;

#endif