1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
|
/** @file
* @brief This file contains all functions for the mutation matrix and the
* estimation of evolutionary distances thereof.
*/
#include "model.h"
#include "global.h"
#include <gsl/gsl_randist.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
/**
* @brief Sum some mutation count specified by `summands`. Intended to be used
* through the `model_sum` macro.
*
* @param MM - The mutation matrix.
* @param summands - The mutations to add.
* @returns The sum of mutations.
*/
static size_t model_sum_types(const model *MM, const int summands[]) {
size_t total = 0;
for (int i = 0; summands[i] != MUTCOUNTS; ++i) {
total += MM->counts[summands[i]];
}
return total;
}
#define model_sum(MM, ...) \
model_sum_types((MM), (int[]){__VA_ARGS__, MUTCOUNTS})
/**
* @brief Average two mutation matrices.
*
* @param MM - One matrix
* @param NN - Second matrix
* @returns The average (sum) of two mutation matrices.
*/
model model_average(const model *MM, const model *NN) {
model ret = *MM;
for (int i = 0; i != MUTCOUNTS; ++i) {
ret.counts[i] += NN->counts[i];
}
ret.seq_len += NN->seq_len;
return ret;
}
/**
* @brief Compute the total number of nucleotides in the pairwise alignment.
*
* @param MM - The mutation matrix.
* @returns The length of the alignment.
*/
size_t model_total(const model *MM) {
size_t total = 0;
for (size_t i = 0; i < MUTCOUNTS; ++i) {
total += MM->counts[i];
}
return total;
}
/**
* @brief Compute the coverage of an alignment.
*
* @param MM - The mutation matrix.
* @returns The relative coverage
*/
double model_coverage(const model *MM) {
size_t covered = model_total(MM);
size_t actual = MM->seq_len;
return (double)covered / (double)actual;
}
/**
* @brief Estimate the uncorrected distance of a pairwise alignment.
*
* @param MM - The mutation matrix.
* @returns The uncorrected substitution rate.
*/
double estimate_RAW(const model *MM) {
size_t nucl = model_total(MM);
size_t SNPs = model_sum(MM, AtoC, AtoG, AtoT, CtoA, CtoG, CtoT, GtoA, GtoC,
GtoT, TtoA, TtoC, TtoG);
// Insignificant results. All abort the fail train.
if (nucl <= 3) {
return NAN;
}
return (double)SNPs / (double)nucl;
}
/**
* @brief Compute the Jukes-Cantor distance.
*
* @param MM - The mutation matrix.
* @returns The corrected JC distance.
*/
double estimate_JC(const model *MM) {
double dist = estimate_RAW(MM);
dist = -0.75 * log(1.0 - (4.0 / 3.0) * dist); // jukes cantor
// fix negative zero
return dist <= 0.0 ? 0.0 : dist;
}
/** @brief computes the evolutionary distance using K80.
*
* @param MM - The mutation matrix.
* @returns The corrected Kimura distance.
*/
double estimate_KIMURA(const model *MM) {
size_t nucl = model_total(MM);
size_t transitions = model_sum(MM, AtoG, GtoA, CtoT, TtoC);
size_t transversions =
model_sum(MM, AtoC, CtoA, AtoT, TtoA, GtoC, CtoG, GtoT, TtoG);
double P = (double)transitions / (double)nucl;
double Q = (double)transversions / (double)nucl;
double tmp = 1.0 - 2.0 * P - Q;
double dist = -0.25 * log((1.0 - 2.0 * Q) * tmp * tmp);
// fix negative zero
return dist <= 0.0 ? 0.0 : dist;
}
/** @brief computes the evolutionary distance using LogDet.
*
* The LogDet distance between sequence X and and sequence Y
* is given as
*
* -(1 / K) * (log(det(Fxy)) - 0.5 * log(det(Fxx * Fyy)))
*
* Where K is the number of character states, Fxy is the site-pattern
* frequency matrix, and diagonal matrices Fxx and Fyy give the
* frequencies of the different character states in sequences X and Y.
*
* Each i,j-th entry in Fxy is the proportion of homologous sites
* where sequences X and Y have character states i and j, respectively.
*
* For our purposes, X is the Subject (From) sequence and Y is the
* Query (To) sequence and matrix Fxy looks like
*
* To A C G T
* From
* A ( )
* C ( )
* G ( )
* T ( )
*
* @param MM - The mutation matrix.
* @returns The LogDet distance.
*/
double estimate_LOGDET(const model *MM) {
double nucl = (double)model_total(MM);
double P[MUTCOUNTS];
for (int i = 0; i < MUTCOUNTS; i++) {
P[i] = MM->counts[i] / nucl;
}
double logDetFxxFyy =
// log determinant of diagonal matrix of row sums
log(model_sum(MM, AtoA, AtoC, AtoG, AtoT) / nucl) +
log(model_sum(MM, CtoA, CtoC, CtoG, CtoT) / nucl) +
log(model_sum(MM, GtoA, GtoC, GtoG, GtoT) / nucl) +
log(model_sum(MM, TtoA, TtoC, TtoG, TtoT) / nucl) +
// log determinant of diagonal matrix of column sums
log(model_sum(MM, AtoA, CtoA, GtoA, TtoA) / nucl) +
log(model_sum(MM, AtoC, CtoC, GtoC, TtoC) / nucl) +
log(model_sum(MM, AtoG, CtoG, GtoG, TtoG) / nucl) +
log(model_sum(MM, AtoT, CtoT, GtoT, TtoT) / nucl);
// determinant of the site-pattern frequency matrix
double detFxy =
P[AtoA] * P[CtoC] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) -
P[AtoA] * P[CtoG] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) +
P[AtoA] * P[CtoT] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) -
P[AtoC] * P[CtoA] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) +
P[AtoC] * P[CtoG] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) -
P[AtoC] * P[CtoT] * (P[GtoA] * P[TtoG] - P[TtoA] * P[GtoG]) +
P[AtoG] * P[CtoA] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) -
P[AtoG] * P[CtoC] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) +
P[AtoG] * P[CtoT] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]) -
P[AtoT] * P[CtoA] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) +
P[AtoT] * P[CtoC] * (P[GtoA] * P[TtoG] - P[TtoA] * P[GtoG]) -
P[AtoT] * P[CtoG] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]);
double dist = -0.25 * (log(detFxy) - 0.5 * logDetFxxFyy);
// fix negative zero
return dist <= 0.0 ? 0.0 : dist;
}
/** @brief Bootstrap a mutation matrix.
*
* The classical bootstrapping process, as described by Felsenstein, resamples
* all nucleotides of a MSA. As andi only computes a pairwise alignment, this
* process boils down to a simple multinomial distribution. We just have to
* resample the elements of the mutation matrix. See Klötzl & Haubold (2016)
* for details. http://www.mdpi.com/2075-1729/6/1/11/htm
*
* @param datum - The original mutation matrix.
* @returns A bootstrapped mutation matrix.
*/
model model_bootstrap(model datum) {
size_t nucl = model_total(&datum);
double p[MUTCOUNTS];
for (size_t i = 0; i < MUTCOUNTS; ++i) {
p[i] = datum.counts[i] / (double)nucl;
}
gsl_ran_multinomial(RNG, MUTCOUNTS, nucl, p, datum.counts);
return datum;
}
/**
* @brief Given an anchor, classify nucleotides.
*
* For anchors we already know that the nucleotides of the subject and the query
* are equal. Thus only one sequence has to be analysed. Most models don't
* actually care about the individual nucleotides as long as they are equal in
* the two sequences. For these models, we just assume equal distribution.
*
* @param MM - The mutation matrix
* @param S - The subject
* @param len - The anchor length
*/
void model_count_equal(model *MM, const char *S, size_t len) {
if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) {
size_t fourth = len / 4;
MM->counts[AtoA] += fourth;
MM->counts[CtoC] += fourth;
MM->counts[GtoG] += fourth;
MM->counts[TtoT] += fourth + (len & 3);
return;
}
// Fall-back algorithm for future models. Note, as this works on a
// per-character basis it is slow.
size_t local_counts[4] = {0};
for (; len--;) {
char s = *S++;
// ';!#' are all smaller than 'A'
if (s < 'A') {
// Technically, s can only be ';' at this point.
continue;
}
// The four canonical nucleotides can be uniquely identified by the bits
// 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed.
local_counts[(s >> 1) & 3]++;
}
MM->counts[AtoA] += local_counts[0];
MM->counts[CtoC] += local_counts[1];
MM->counts[GtoG] += local_counts[3];
MM->counts[TtoT] += local_counts[2];
}
/** @brief Convert a nucleotide to a 2bit representation.
*
* We want to map characters:
* A → 0
* C → 1
* G → 2
* T → 3
* The trick used below is that the three lower bits of the
* characters are unique. Thus, they can be used to compute the mapping
* above. The mapping itself is done via tricky bitwise operations.
*
* @param c - input nucleotide
* @returns 2bit representation.
*/
char nucl2bit(unsigned char c) {
c &= 6;
c ^= c >> 1;
return c >> 1;
}
/**
* @brief Count the substitutions and add them to the mutation matrix.
*
* @param MM - The mutation matrix.
* @param S - The subject
* @param Q - The query
* @param len - The length of the alignment
*/
void model_count(model *MM, const char *S, const char *Q, size_t len) {
size_t local_counts[MUTCOUNTS] = {0};
for (size_t i = 0; i < len; S++, Q++, i++) {
char s = *S;
char q = *Q;
// Skip special characters.
if (s < 'A' || q < 'A') {
continue;
}
// Pick the correct two bits representing s and q.
unsigned char foo = nucl2bit(s);
unsigned char bar = nucl2bit(q);
/*
* Finally, we want to map the indices to the correct mutation. This is
* done by utilising the mutation types in model.h.
*/
unsigned int index = (foo << 2) + bar;
local_counts[index]++;
}
for (int i = 0; i != MUTCOUNTS; ++i) {
MM->counts[i] += local_counts[i];
}
}
|