[go: up one dir, main page]

File: model.c

package info (click to toggle)
andi 0.14-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 988 kB
  • sloc: ansic: 2,296; sh: 426; cpp: 99; makefile: 76; awk: 51
file content (326 lines) | stat: -rw-r--r-- 9,323 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
/** @file
 * @brief This file contains all functions for the mutation matrix and the
 * estimation of evolutionary distances thereof.
 */

#include "model.h"
#include "global.h"
#include <gsl/gsl_randist.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/**
 * @brief Sum some mutation count specified by `summands`. Intended to be used
 * through the `model_sum` macro.
 *
 * @param MM - The mutation matrix.
 * @param summands - The mutations to add.
 * @returns The sum of mutations.
 */
static size_t model_sum_types(const model *MM, const int summands[]) {
	size_t total = 0;
	for (int i = 0; summands[i] != MUTCOUNTS; ++i) {
		total += MM->counts[summands[i]];
	}
	return total;
}

#define model_sum(MM, ...)                                                     \
	model_sum_types((MM), (int[]){__VA_ARGS__, MUTCOUNTS})

/**
 * @brief Average two mutation matrices.
 *
 * @param MM - One matrix
 * @param NN - Second matrix
 * @returns The average (sum) of two mutation matrices.
 */
model model_average(const model *MM, const model *NN) {
	model ret = *MM;
	for (int i = 0; i != MUTCOUNTS; ++i) {
		ret.counts[i] += NN->counts[i];
	}
	ret.seq_len += NN->seq_len;
	return ret;
}

/**
 * @brief Compute the total number of nucleotides in the pairwise alignment.
 *
 * @param MM - The mutation matrix.
 * @returns The length of the alignment.
 */
size_t model_total(const model *MM) {
	size_t total = 0;
	for (size_t i = 0; i < MUTCOUNTS; ++i) {
		total += MM->counts[i];
	}
	return total;
}

/**
 * @brief Compute the coverage of an alignment.
 *
 * @param MM - The mutation matrix.
 * @returns The relative coverage
 */
double model_coverage(const model *MM) {
	size_t covered = model_total(MM);
	size_t actual = MM->seq_len;

	return (double)covered / (double)actual;
}

/**
 * @brief Estimate the uncorrected distance of a pairwise alignment.
 *
 * @param MM - The mutation matrix.
 * @returns The uncorrected substitution rate.
 */
double estimate_RAW(const model *MM) {
	size_t nucl = model_total(MM);
	size_t SNPs = model_sum(MM, AtoC, AtoG, AtoT, CtoA, CtoG, CtoT, GtoA, GtoC,
							GtoT, TtoA, TtoC, TtoG);

	// Insignificant results. All abort the fail train.
	if (nucl <= 3) {
		return NAN;
	}

	return (double)SNPs / (double)nucl;
}

/**
 * @brief Compute the Jukes-Cantor distance.
 *
 * @param MM - The mutation matrix.
 * @returns The corrected JC distance.
 */
double estimate_JC(const model *MM) {
	double dist = estimate_RAW(MM);
	dist = -0.75 * log(1.0 - (4.0 / 3.0) * dist); // jukes cantor

	// fix negative zero
	return dist <= 0.0 ? 0.0 : dist;
}

/** @brief computes the evolutionary distance using K80.
 *
 * @param MM - The mutation matrix.
 * @returns The corrected Kimura distance.
 */
double estimate_KIMURA(const model *MM) {
	size_t nucl = model_total(MM);
	size_t transitions = model_sum(MM, AtoG, GtoA, CtoT, TtoC);
	size_t transversions =
		model_sum(MM, AtoC, CtoA, AtoT, TtoA, GtoC, CtoG, GtoT, TtoG);

	double P = (double)transitions / (double)nucl;
	double Q = (double)transversions / (double)nucl;

	double tmp = 1.0 - 2.0 * P - Q;
	double dist = -0.25 * log((1.0 - 2.0 * Q) * tmp * tmp);

	// fix negative zero
	return dist <= 0.0 ? 0.0 : dist;
}

/** @brief computes the evolutionary distance using LogDet.
 *
 * The LogDet distance between sequence X and and sequence Y
 * is given as
 *
 * -(1 / K) * (log(det(Fxy)) - 0.5 * log(det(Fxx * Fyy)))
 *
 * Where K is the number of character states, Fxy is the site-pattern
 * frequency matrix, and diagonal matrices Fxx and Fyy give the
 * frequencies of the different character states in sequences X and Y.
 *
 * Each i,j-th entry in Fxy is the proportion of homologous sites
 * where sequences X and Y have character states i and j, respectively.
 *
 * For our purposes, X is the Subject (From) sequence and Y is the
 * Query (To) sequence and matrix Fxy looks like
 *
 *  To   A  C  G  T
 * From
 *  A  (            )
 *  C  (            )
 *  G  (            )
 *  T  (            )
 *
 * @param MM - The mutation matrix.
 * @returns The LogDet distance.
 */
double estimate_LOGDET(const model *MM) {

	double nucl = (double)model_total(MM);
	double P[MUTCOUNTS];
	for (int i = 0; i < MUTCOUNTS; i++) {
		P[i] = MM->counts[i] / nucl;
	}

	double logDetFxxFyy =
		// log determinant of diagonal matrix of row sums
		log(model_sum(MM, AtoA, AtoC, AtoG, AtoT) / nucl) +
		log(model_sum(MM, CtoA, CtoC, CtoG, CtoT) / nucl) +
		log(model_sum(MM, GtoA, GtoC, GtoG, GtoT) / nucl) +
		log(model_sum(MM, TtoA, TtoC, TtoG, TtoT) / nucl) +
		// log determinant of diagonal matrix of column sums
		log(model_sum(MM, AtoA, CtoA, GtoA, TtoA) / nucl) +
		log(model_sum(MM, AtoC, CtoC, GtoC, TtoC) / nucl) +
		log(model_sum(MM, AtoG, CtoG, GtoG, TtoG) / nucl) +
		log(model_sum(MM, AtoT, CtoT, GtoT, TtoT) / nucl);

	// determinant of the site-pattern frequency matrix
	double detFxy =
		P[AtoA] * P[CtoC] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) -
		P[AtoA] * P[CtoG] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) +
		P[AtoA] * P[CtoT] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) -

		P[AtoC] * P[CtoA] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) +
		P[AtoC] * P[CtoG] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) -
		P[AtoC] * P[CtoT] * (P[GtoA] * P[TtoG] - P[TtoA] * P[GtoG]) +

		P[AtoG] * P[CtoA] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) -
		P[AtoG] * P[CtoC] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) +
		P[AtoG] * P[CtoT] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]) -

		P[AtoT] * P[CtoA] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) +
		P[AtoT] * P[CtoC] * (P[GtoA] * P[TtoG] - P[TtoA] * P[GtoG]) -
		P[AtoT] * P[CtoG] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]);

	double dist = -0.25 * (log(detFxy) - 0.5 * logDetFxxFyy);

	// fix negative zero
	return dist <= 0.0 ? 0.0 : dist;
}

/** @brief Bootstrap a mutation matrix.
 *
 * The classical bootstrapping process, as described by Felsenstein, resamples
 * all nucleotides of a MSA. As andi only computes a pairwise alignment, this
 * process boils down to a simple multinomial distribution. We just have to
 * resample the elements of the mutation matrix. See Klötzl & Haubold (2016)
 * for details. http://www.mdpi.com/2075-1729/6/1/11/htm
 *
 * @param datum - The original mutation matrix.
 * @returns A bootstrapped mutation matrix.
 */
model model_bootstrap(model datum) {
	size_t nucl = model_total(&datum);
	double p[MUTCOUNTS];
	for (size_t i = 0; i < MUTCOUNTS; ++i) {
		p[i] = datum.counts[i] / (double)nucl;
	}

	gsl_ran_multinomial(RNG, MUTCOUNTS, nucl, p, datum.counts);

	return datum;
}

/**
 * @brief Given an anchor, classify nucleotides.
 *
 * For anchors we already know that the nucleotides of the subject and the query
 * are equal. Thus only one sequence has to be analysed. Most models don't
 * actually care about the individual nucleotides as long as they are equal in
 * the two sequences. For these models, we just assume equal distribution.
 *
 * @param MM - The mutation matrix
 * @param S - The subject
 * @param len - The anchor length
 */
void model_count_equal(model *MM, const char *S, size_t len) {
	if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) {
		size_t fourth = len / 4;
		MM->counts[AtoA] += fourth;
		MM->counts[CtoC] += fourth;
		MM->counts[GtoG] += fourth;
		MM->counts[TtoT] += fourth + (len & 3);
		return;
	}

	// Fall-back algorithm for future models. Note, as this works on a
	// per-character basis it is slow.

	size_t local_counts[4] = {0};

	for (; len--;) {
		char s = *S++;

		// ';!#' are all smaller than 'A'
		if (s < 'A') {
			// Technically, s can only be ';' at this point.
			continue;
		}

		// The four canonical nucleotides can be uniquely identified by the bits
		// 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed.
		local_counts[(s >> 1) & 3]++;
	}

	MM->counts[AtoA] += local_counts[0];
	MM->counts[CtoC] += local_counts[1];
	MM->counts[GtoG] += local_counts[3];
	MM->counts[TtoT] += local_counts[2];
}

/** @brief Convert a nucleotide to a 2bit representation.
 *
 * We want to map characters:
 *  A → 0
 *  C → 1
 *  G → 2
 *  T → 3
 * The trick used below is that the three lower bits of the
 * characters are unique. Thus, they can be used to compute the mapping
 * above. The mapping itself is done via tricky bitwise operations.
 *
 * @param c - input nucleotide
 * @returns 2bit representation.
 */
char nucl2bit(unsigned char c) {
	c &= 6;
	c ^= c >> 1;
	return c >> 1;
}

/**
 * @brief Count the substitutions and add them to the mutation matrix.
 *
 * @param MM - The mutation matrix.
 * @param S - The subject
 * @param Q - The query
 * @param len - The length of the alignment
 */
void model_count(model *MM, const char *S, const char *Q, size_t len) {
	size_t local_counts[MUTCOUNTS] = {0};

	for (size_t i = 0; i < len; S++, Q++, i++) {
		char s = *S;
		char q = *Q;

		// Skip special characters.
		if (s < 'A' || q < 'A') {
			continue;
		}

		// Pick the correct two bits representing s and q.
		unsigned char foo = nucl2bit(s);
		unsigned char bar = nucl2bit(q);

		/*
		 * Finally, we want to map the indices to the correct mutation. This is
		 * done by utilising the mutation types in model.h.
		 */
		unsigned int index = (foo << 2) + bar;

		local_counts[index]++;
	}

	for (int i = 0; i != MUTCOUNTS; ++i) {
		MM->counts[i] += local_counts[i];
	}
}