[go: up one dir, main page]

File: tuple_bloom.h

package info (click to toggle)
tarantool 2.6.0-1.4
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 85,412 kB
  • sloc: ansic: 513,775; cpp: 69,493; sh: 25,650; python: 19,190; perl: 14,973; makefile: 4,178; yacc: 1,329; sql: 1,074; pascal: 620; ruby: 190; awk: 18; lisp: 7
file content (239 lines) | stat: -rw-r--r-- 7,822 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#ifndef TARANTOOL_BOX_TUPLE_BLOOM_H_INCLUDED
#define TARANTOOL_BOX_TUPLE_BLOOM_H_INCLUDED
/*
 * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1. Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the
 *    following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "salad/bloom.h"

#if defined(__cplusplus)
extern "C" {
#endif /* defined(__cplusplus) */

struct tuple;
struct key_def;

/**
 * Tuple bloom filter.
 *
 * Consists of a set of bloom filters, one per each partial key.
 * When a key is checked to be hashed in the bloom, all its
 * partial keys are checked as well, which lowers the probability
 * of false positive results.
 */
struct tuple_bloom {
	/**
	 * If the following flag is set, this is a legacy
	 * bloom filter that stores hashes only for full keys
	 * (see tuple_bloom_decode_legacy).
	 */
	bool is_legacy;
	/** Number of key parts. */
	uint32_t part_count;
	/** Array of bloom filters, one per each partial key. */
	struct bloom parts[0];
};

/**
 * Array of tuple hashes.
 */
struct tuple_hash_array {
	/** Number of hashes stored in the array. */
	uint32_t count;
	/** Capacity of the array of hashes. */
	uint32_t capacity;
	/** Array of hashes. */
	uint32_t *values;
};

/**
 * Tuple bloom filter builder.
 *
 * Construction of a bloom filter proceeds as follows.
 *
 * First, tuples of the target set are added to a builder object.
 * For further calculations to be correct, tuples MUST be added
 * in the order defined by the provided key definition. The builder
 * object stores hashes of all added tuples for each partial key,
 * e.g. for tuple (1, 2, 3) it stores hashes of (1), (1, 2), and
 * (1, 2, 3) in separate arrays. It doesn't store the same hash
 * multiple times in the same array (that's what the order is
 * required for), thus it knows how many unique encounters of each
 * partial key are there.
 *
 * Once all tuples have been hashed, the builder can be used to
 * create a bloom filter having the given false positive rate
 * for all lookups, both by full and by partial key. Since when
 * checking a tuple against a bloom filter, we check not only the
 * full key bloom, but also all partial key blooms, the actual
 * FPR of checking keys consisting of i parts will be equal to
 * the multiplication of FPRs of individual bloom filters storing
 * hashes of parts <= i. This allows us to use smaller FPR for
 * partial bloom filters and hence reduce the bloom filter size.
 * For more details, see tuple_bloom_new() implementation.
 */
struct tuple_bloom_builder {
	/** Number of key parts. */
	uint32_t part_count;
	/** Hash arrays, one per each partial key. */
	struct tuple_hash_array parts[0];
};

/**
 * Create a new tuple bloom filter builder.
 * @param part_count - number of key parts
 * @return bloom filter builder on success or NULL on OOM
 */
struct tuple_bloom_builder *
tuple_bloom_builder_new(uint32_t part_count);

/**
 * Destroy a tuple bloom filter builder.
 * @param builder - bloom filter builder to delete
 */
void
tuple_bloom_builder_delete(struct tuple_bloom_builder *builder);

/**
 * Add a tuple hash to a tuple bloom filter builder.
 * @param builder - bloom filter builder
 * @param tuple - tuple to add
 * @param key_def - key definition
 * @param multikey_idx - multikey index hint
 * @return 0 on success, -1 on OOM
 */
int
tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
			struct tuple *tuple, struct key_def *key_def,
			int multikey_idx);

/**
 * Add a key hash to a tuple bloom filter builder.
 * @param builder - bloom filter builder
 * @param key - key to add
 * @param part_count - number of parts in the key
 * @param key_def - key definition
 * @return 0 on success, -1 on OOM
 */
int
tuple_bloom_builder_add_key(struct tuple_bloom_builder *builder,
			    const char *key, uint32_t part_count,
			    struct key_def *key_def);

/**
 * Create a new tuple bloom filter.
 * @param builder - bloom filter builder
 * @param fpr - desired false positive rate
 * @return bloom filter on success or NULL on OOM
 */
struct tuple_bloom *
tuple_bloom_new(struct tuple_bloom_builder *builder, double fpr);

/**
 * Delete a tuple bloom filter.
 * @param bloom - bloom filter to delete
 */
void
tuple_bloom_delete(struct tuple_bloom *bloom);

/**
 * Check if a tuple was stored in a tuple bloom filter.
 * @param bloom - bloom filter
 * @param tuple - tuple to check
 * @param key_def - key definition
 * @param multikey_idx - multikey index hint
 * @return true if the tuple may have been stored in the bloom,
 *  false if the tuple is definitely not in the bloom
 */
bool
tuple_bloom_maybe_has(const struct tuple_bloom *bloom, struct tuple *tuple,
		      struct key_def *key_def, int multikey_idx);

/**
 * Check if a tuple matching a key was stored in a tuple bloom filter.
 * @param bloom - bloom filter
 * @param key - key to check
 * @param part_count - number of parts in the key
 * @param key_def - key definition
 * @return true if there may be a tuple matching the key stored in
 *  the bloom, false if there is definitely no such tuple
 */
bool
tuple_bloom_maybe_has_key(const struct tuple_bloom *bloom,
			  const char *key, uint32_t part_count,
			  struct key_def *key_def);

/**
 * Return the size of a tuple bloom filter when encoded.
 * @param bloom - bloom filter
 * @return size of the bloom filter, in bytes
 */
size_t
tuple_bloom_size(const struct tuple_bloom *bloom);

/**
 * Encode a tuple bloom filter in MsgPack.
 * @param bloom - bloom filter
 * @param buf - buffer where to store the bloom filter
 * @return pointer to the first byte following encoded data
 */
char *
tuple_bloom_encode(const struct tuple_bloom *bloom, char *buf);

/**
 * Decode a tuple bloom filter from MsgPack.
 * @param data - pointer to buffer storing encoded bloom filter;
 *  on success it is advanced by the number of decoded bytes
 * @return the decoded bloom on success or NULL on OOM
 */
struct tuple_bloom *
tuple_bloom_decode(const char **data);

/**
 * Decode a legacy bloom filter from MsgPack.
 * @param data - pointer to buffer storing encoded bloom filter;
 *  on success it is advanced by the number of decoded bytes
 * @return the decoded bloom on success or NULL on OOM
 *
 * We used to store only full key bloom filters. This function
 * decodes such a bloom filter from MsgPack and initializes a
 * tuple_bloom object accordingly.
 */
struct tuple_bloom *
tuple_bloom_decode_legacy(const char **data);

#if defined(__cplusplus)
} /* extern "C" */
#endif /* defined(__cplusplus) */

#endif /* TARANTOOL_BOX_TUPLE_BLOOM_H_INCLUDED */