1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
|
#ifndef TARANTOOL_BOX_TUPLE_BLOOM_H_INCLUDED
#define TARANTOOL_BOX_TUPLE_BLOOM_H_INCLUDED
/*
* Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* 1. Redistributions of source code must retain the above
* copyright notice, this list of conditions and the
* following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
* THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "salad/bloom.h"
#if defined(__cplusplus)
extern "C" {
#endif /* defined(__cplusplus) */
struct tuple;
struct key_def;
/**
* Tuple bloom filter.
*
* Consists of a set of bloom filters, one per each partial key.
* When a key is checked to be hashed in the bloom, all its
* partial keys are checked as well, which lowers the probability
* of false positive results.
*/
struct tuple_bloom {
/**
* If the following flag is set, this is a legacy
* bloom filter that stores hashes only for full keys
* (see tuple_bloom_decode_legacy).
*/
bool is_legacy;
/** Number of key parts. */
uint32_t part_count;
/** Array of bloom filters, one per each partial key. */
struct bloom parts[0];
};
/**
* Array of tuple hashes.
*/
struct tuple_hash_array {
/** Number of hashes stored in the array. */
uint32_t count;
/** Capacity of the array of hashes. */
uint32_t capacity;
/** Array of hashes. */
uint32_t *values;
};
/**
* Tuple bloom filter builder.
*
* Construction of a bloom filter proceeds as follows.
*
* First, tuples of the target set are added to a builder object.
* For further calculations to be correct, tuples MUST be added
* in the order defined by the provided key definition. The builder
* object stores hashes of all added tuples for each partial key,
* e.g. for tuple (1, 2, 3) it stores hashes of (1), (1, 2), and
* (1, 2, 3) in separate arrays. It doesn't store the same hash
* multiple times in the same array (that's what the order is
* required for), thus it knows how many unique encounters of each
* partial key are there.
*
* Once all tuples have been hashed, the builder can be used to
* create a bloom filter having the given false positive rate
* for all lookups, both by full and by partial key. Since when
* checking a tuple against a bloom filter, we check not only the
* full key bloom, but also all partial key blooms, the actual
* FPR of checking keys consisting of i parts will be equal to
* the multiplication of FPRs of individual bloom filters storing
* hashes of parts <= i. This allows us to use smaller FPR for
* partial bloom filters and hence reduce the bloom filter size.
* For more details, see tuple_bloom_new() implementation.
*/
struct tuple_bloom_builder {
/** Number of key parts. */
uint32_t part_count;
/** Hash arrays, one per each partial key. */
struct tuple_hash_array parts[0];
};
/**
* Create a new tuple bloom filter builder.
* @param part_count - number of key parts
* @return bloom filter builder on success or NULL on OOM
*/
struct tuple_bloom_builder *
tuple_bloom_builder_new(uint32_t part_count);
/**
* Destroy a tuple bloom filter builder.
* @param builder - bloom filter builder to delete
*/
void
tuple_bloom_builder_delete(struct tuple_bloom_builder *builder);
/**
* Add a tuple hash to a tuple bloom filter builder.
* @param builder - bloom filter builder
* @param tuple - tuple to add
* @param key_def - key definition
* @param multikey_idx - multikey index hint
* @return 0 on success, -1 on OOM
*/
int
tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
struct tuple *tuple, struct key_def *key_def,
int multikey_idx);
/**
* Add a key hash to a tuple bloom filter builder.
* @param builder - bloom filter builder
* @param key - key to add
* @param part_count - number of parts in the key
* @param key_def - key definition
* @return 0 on success, -1 on OOM
*/
int
tuple_bloom_builder_add_key(struct tuple_bloom_builder *builder,
const char *key, uint32_t part_count,
struct key_def *key_def);
/**
* Create a new tuple bloom filter.
* @param builder - bloom filter builder
* @param fpr - desired false positive rate
* @return bloom filter on success or NULL on OOM
*/
struct tuple_bloom *
tuple_bloom_new(struct tuple_bloom_builder *builder, double fpr);
/**
* Delete a tuple bloom filter.
* @param bloom - bloom filter to delete
*/
void
tuple_bloom_delete(struct tuple_bloom *bloom);
/**
* Check if a tuple was stored in a tuple bloom filter.
* @param bloom - bloom filter
* @param tuple - tuple to check
* @param key_def - key definition
* @param multikey_idx - multikey index hint
* @return true if the tuple may have been stored in the bloom,
* false if the tuple is definitely not in the bloom
*/
bool
tuple_bloom_maybe_has(const struct tuple_bloom *bloom, struct tuple *tuple,
struct key_def *key_def, int multikey_idx);
/**
* Check if a tuple matching a key was stored in a tuple bloom filter.
* @param bloom - bloom filter
* @param key - key to check
* @param part_count - number of parts in the key
* @param key_def - key definition
* @return true if there may be a tuple matching the key stored in
* the bloom, false if there is definitely no such tuple
*/
bool
tuple_bloom_maybe_has_key(const struct tuple_bloom *bloom,
const char *key, uint32_t part_count,
struct key_def *key_def);
/**
* Return the size of a tuple bloom filter when encoded.
* @param bloom - bloom filter
* @return size of the bloom filter, in bytes
*/
size_t
tuple_bloom_size(const struct tuple_bloom *bloom);
/**
* Encode a tuple bloom filter in MsgPack.
* @param bloom - bloom filter
* @param buf - buffer where to store the bloom filter
* @return pointer to the first byte following encoded data
*/
char *
tuple_bloom_encode(const struct tuple_bloom *bloom, char *buf);
/**
* Decode a tuple bloom filter from MsgPack.
* @param data - pointer to buffer storing encoded bloom filter;
* on success it is advanced by the number of decoded bytes
* @return the decoded bloom on success or NULL on OOM
*/
struct tuple_bloom *
tuple_bloom_decode(const char **data);
/**
* Decode a legacy bloom filter from MsgPack.
* @param data - pointer to buffer storing encoded bloom filter;
* on success it is advanced by the number of decoded bytes
* @return the decoded bloom on success or NULL on OOM
*
* We used to store only full key bloom filters. This function
* decodes such a bloom filter from MsgPack and initializes a
* tuple_bloom object accordingly.
*/
struct tuple_bloom *
tuple_bloom_decode_legacy(const char **data);
#if defined(__cplusplus)
} /* extern "C" */
#endif /* defined(__cplusplus) */
#endif /* TARANTOOL_BOX_TUPLE_BLOOM_H_INCLUDED */
|