1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
|
/**
* @file
* @brief This file contains various distance methods.
*/
#include "process.h"
#include "esa.h"
#include "global.h"
#include "io.h"
#include "model.h"
#include "sequence.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef _OPENMP
#include <omp.h>
#endif
int calculate_bootstrap(const struct model *M, const seq_t *sequences,
size_t n);
typedef _Bool bool;
#define false 0
#define true !false
/**
* @brief This structure captures properties of an anchor.
*/
struct anchor {
/** The position on the subject. */
size_t pos_S;
/** The position on the query. */
size_t pos_Q;
/** The length of the exact match. */
size_t length;
};
/**
* @brief This is a structure of assorted variables needed for anchor finding.
*/
struct context {
const esa_s *C;
const char *query;
size_t query_length;
size_t threshold;
};
/**
* @brief Compute the length of the longest common prefix of two strings.
*
* @param S - One string.
* @param Q - Another string.
* @param remaining - The length of one of the strings.
* @returns the length of the lcp.
*/
static inline size_t lcp(const char *S, const char *Q, size_t remaining) {
size_t length = 0;
while (length < remaining && S[length] == Q[length]) {
length++;
}
return length;
}
/**
* @brief Check whether the last anchor can be extended by a lucky anchor.
*
* Anchors are defined to be unique and of a minimum length. The uniqueness
* requires us to search throw the suffix array for a second appearance of the
* anchor. However, if a left anchor is already unique, we could be sloppy and
* drop the uniqueness criterion for the second anchor. This way we can skip the
* lookup and just compare characters directly. However, for a lucky anchor the
* match still has to be longer than the threshold.
*
* @param ctx - Matching context of various variables.
* @param last_match - The last anchor.
* @param this_match - Input/Output variable for the current match.
* @returns true iff the current match is a lucky anchor.
*/
static inline bool lucky_anchor(const struct context *ctx,
const struct anchor *last_match,
struct anchor *this_match) {
size_t advance = this_match->pos_Q - last_match->pos_Q;
size_t gap = this_match->pos_Q - last_match->pos_Q - last_match->length;
size_t try_pos_S = last_match->pos_S + advance;
if (try_pos_S >= (size_t)ctx->C->len || gap > ctx->threshold) {
return false;
}
this_match->pos_S = try_pos_S;
this_match->length =
lcp(ctx->query + this_match->pos_Q, ctx->C->S + try_pos_S,
ctx->query_length - this_match->pos_Q);
return this_match->length >= ctx->threshold;
}
/**
* @brief Check for a new anchor.
*
* Given the current context and starting position check if the new match is an
* anchor. The latter requires uniqueness and a certain minimum length.
*
* @param ctx - Matching context of various variables.
* @param last_match - (unused)
* @param this_match - Input/Output variable for the current match.
* @returns true iff an anchor was found.
*/
static inline bool anchor(const struct context *ctx,
const struct anchor *last_match,
struct anchor *this_match) {
lcp_inter_t inter = get_match_cached(ctx->C, ctx->query + this_match->pos_Q,
ctx->query_length - this_match->pos_Q);
this_match->pos_S = ctx->C->SA[inter.i];
this_match->length = inter.l <= 0 ? 0 : inter.l;
return inter.i == inter.j && this_match->length >= ctx->threshold;
}
/**
* @brief Divergence estimation using the anchor technique.
*
* The dist_anchor() function estimates the divergence between two
* DNA sequences. The subject is given as an ESA, whereas the query
* is a simple string. This function then looks for *anchors* -- long
* substrings that exist in both sequences. Then it manually checks for
* mutations between those anchors.
*
* @param C - The enhanced suffix array of the subject.
* @param query - The actual query string.
* @param query_length - The length of the query string. Needed for speed
* reasons.
* @param threshold - Minimal length for an anchor.
* @returns A matrix with estimates of base substitutions.
*/
model dist_anchor(const esa_s *C, const char *query, size_t query_length,
size_t threshold) {
struct model ret = {.seq_len = query_length, .counts = {0}};
struct anchor this_match = {0};
struct anchor last_match = {0};
bool last_was_right_anchor = false;
size_t border = C->len / 2;
struct context ctx = {C, query, query_length, threshold};
// Iterate over the complete query.
while (this_match.pos_Q < query_length) {
// Check for lucky anchors and fall back to normal strategy.
if (lucky_anchor(&ctx, &last_match, &this_match) ||
anchor(&ctx, &last_match, &this_match)) {
// We have reached a new anchor.
size_t end_S = last_match.pos_S + last_match.length;
size_t end_Q = last_match.pos_Q + last_match.length;
// Check if this can be a right anchor to the last one.
if (this_match.pos_S > end_S &&
this_match.pos_Q - end_Q == this_match.pos_S - end_S &&
(this_match.pos_S < border) == (last_match.pos_S < border)) {
// classify nucleotides in the left qanchor
model_count_equal(&ret, query + last_match.pos_Q,
last_match.length);
// Count the SNPs in between.
model_count(&ret, C->S + end_S, query + end_Q,
this_match.pos_Q - end_Q);
last_was_right_anchor = true;
} else {
if (last_was_right_anchor) {
// If the last was a right anchor, but with the current one,
// we cannot extend, then add its length.
model_count_equal(&ret, query + last_match.pos_Q,
last_match.length);
} else if (last_match.length >= threshold * 2) {
// The last anchor wasn't neither a left or right anchor.
// But, it was as long as an anchor pair. So still count it.
model_count_equal(&ret, query + last_match.pos_Q,
last_match.length);
}
last_was_right_anchor = false;
}
// Cache values for later
last_match = this_match;
}
// Advance
this_match.pos_Q += this_match.length + 1;
}
// Very special case: The sequences are identical
if (last_match.length >= query_length) {
model_count_equal(&ret, query, query_length);
return ret;
}
// We might miss a few nucleotides if the last anchor was also a right
// anchor. The logic is the same as a few lines above.
if (last_was_right_anchor) {
model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
} else if (last_match.length >= threshold * 2) {
model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
}
return ret;
}
/*
* Include distMatrix and distMatrixLM.
*/
#define FAST
#include "dist_hack.h"
#undef FAST
#include "dist_hack.h"
/**
* @brief Calculates and prints the distance matrix
* @param sequences - An array of pointers to the sequences.
* @param n - The number of sequences.
*/
void calculate_distances(seq_t *sequences, size_t n) {
struct model *M = NULL;
// The maximum number of sequences is near 457'845'052.
size_t intermediate = SIZE_MAX / sizeof(*M) / n;
if (intermediate < n) {
size_t root = (size_t)sqrt(SIZE_MAX / sizeof(*M));
err(1, "Comparison is limited to %zu sequences (%zu given).", root, n);
}
M = malloc(n * n * sizeof(*M));
if (!M) {
err(errno, "Could not allocate enough memory for the comparison "
"matrix. Try using --join or --low-memory.");
}
// compute the distances
if (FLAGS & F_LOW_MEMORY) {
distMatrixLM(M, sequences, n);
} else {
distMatrix(M, sequences, n);
}
// print the results
print_distances(M, sequences, n, 1);
// print additional information.
if (FLAGS & F_VERBOSE) {
print_coverages(M, n);
}
// create new bootstrapped distance matrices
if (BOOTSTRAP) {
int res = calculate_bootstrap(M, sequences, n);
if (res) {
soft_errx("Bootstrapping failed.");
}
}
free(M);
}
/** Yet another hack. */
#define B(X, Y) (B[(X)*n + (Y)])
/** @brief Computes a bootstrap from _pairwise_ alignments.
*
* Doing bootstrapping for alignments with only two sequences is easy. It boils
* down to a simple multi-nomial process over the substitution matrix.
*
* @param M - the initial distance matrix
* @param sequences - a list of the sequences, containing their lengths
* @param n - the number of sequences
*
* The number of bootstrapped distance matrices to print is implicitly
* passed via the global `BOOTSTRAP` variable.
*
* @returns 0 iff successful.
*/
int calculate_bootstrap(const struct model *M, const seq_t *sequences,
size_t n) {
if (!M || !sequences || !n) {
return 1;
}
// B is the new bootstrap matrix
struct model *B = malloc(n * n * sizeof(*B));
CHECK_MALLOC(B);
// Compute a number of new distance matrices
while (BOOTSTRAP--) {
for (size_t i = 0; i < n; i++) {
for (size_t j = i; j < n; j++) {
if (i == j) {
B(i, j) = (struct model){.seq_len = 1.0, .counts = {1.0}};
continue;
}
// Bootstrapping should only be used with averaged distances.
model datum = model_average(&M(i, j), &M(j, i));
datum = model_bootstrap(datum);
B(j, i) = B(i, j) = datum;
}
}
print_distances(B, sequences, n, 0);
}
free(B);
return 0;
}
|