-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathbloom_filter.hpp
159 lines (139 loc) · 4.14 KB
/
bloom_filter.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/**
* MALVA - genotyping by Mapping-free ALternate-allele detection of known VAriants
* Copyright (C) 2019 Giulia Bernardini, Luca Denti, Marco Previtali
*
* This file is part of MALVA.
*
* MALVA is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MALVA is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with MALVA; see the file LICENSE. If not, see
* <https://www.gnu.org/licenses/>.
**/
#ifndef _BLOOM_FILTER_HPP
#define _BLOOM_FILTER_HPP
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>
#include <limits>
#include <string>
#include <sdsl/bit_vectors.hpp>
#include "xxhash.h"
#include "kmc_file.h"
using namespace std;
using namespace sdsl;
/**
 * Nucleotide complement lookup table, indexed by 7-bit ASCII code.
 *
 * 'A'<->'T', 'C'<->'G' and 'N'->'N'; the lower-case letters a, c, g, t, n
 * map to the same UPPER-case complements. Every other code maps to 0.
 */
static const char RCN[128] = {
    0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   //   0
    0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   //  10
    0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   //  20
    0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   //  30
    0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   //  40
    0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   //  50
    0,   0,   0, 0,   0,   'T', 0,   'G', 0,   0,   //  60  (A=65, C=67)
    0,   'C', 0, 0,   0,   0,   0,   0,   'N', 0,   //  70  (G=71, N=78)
    0,   0,   0, 0,   'A', 0,   0,   0,   0,   0,   //  80  (T=84)
    0,   0,   0, 0,   0,   0,   0,   'T', 0,   'G', //  90  (a=97, c=99)
    0,   0,   0, 'C', 0,   0,   0,   0,   0,   0,   // 100  (g=103)
    'N', 0,   0, 0,   0,   0,   'A', 0,   0,   0,   // 110  (n=110, t=116)
    0,   0,   0, 0,   0,   0,   0,   0              // 120
};
/**
 * Single-hash Bloom filter over canonical k-mers, with optional 16-bit
 * counters attached to the set bits.
 *
 * The filter starts in write mode: add_key() sets bits and test_key() reads
 * them. switch_mode() moves it to read mode: it builds rank support over the
 * bit vector and allocates one 16-bit counter per set bit. increment() and
 * get_count() then reach the counter of a key through the rank of its bit.
 *
 * Keys are NUL-terminated strings. An upper-case A/C/G/T/N k-mer and its
 * reverse complement are hashed identically (see _canonical()).
 *
 * Instances can be neither copied nor assigned.
 */
class BF
{
private:
  /**
   * Complement of a single nucleotide, looked up in RCN.
   * Bytes outside the table (including negative `char` values) yield 0
   * instead of reading out of bounds.
   */
  static char _compl(const char &c)
  {
    const unsigned char idx = static_cast<unsigned char>(c);
    return idx < sizeof(RCN) ? RCN[idx] : 0;
  }

  /**
   * Writes into ckmer the lexicographically smaller of kmer and its
   * reverse complement.
   *
   * @param kmer  NUL-terminated input of length k.
   * @param ckmer output buffer with room for at least k + 1 bytes.
   * @param k     length of kmer.
   */
  void _canonical(const char *kmer, char *ckmer, const int &k) const
  {
    // Build the reverse complement in place: copy, complement, reverse.
    std::strcpy(ckmer, kmer);
    std::transform(ckmer, ckmer + k, ckmer, _compl);
    std::reverse(ckmer, ckmer + k);
    // Keep the original k-mer when it sorts first.
    if (std::strcmp(kmer, ckmer) < 0)
      std::memmove(ckmer, kmer, k);
  }

  /**
   * 64-bit XXH3 hash of the canonical form of kmer.
   *
   * @param kmer NUL-terminated k-mer.
   */
  uint64_t _get_hash(const char *kmer) const
  {
    const size_t k = std::strlen(kmer);
    // Short k-mers use a fixed stack buffer; longer inputs fall back to the
    // heap rather than relying on a non-standard, unbounded
    // variable-length array.
    char stack_buf[256];
    std::string heap_buf;
    char *ckmer = stack_buf;
    if (k >= sizeof(stack_buf))
    {
      heap_buf.assign(k + 1, '\0');
      ckmer = &heap_buf[0];
    }
    _canonical(kmer, ckmer, static_cast<int>(k));
    return XXH3_64bits(ckmer, k);
  }

public:
  /** Creates an empty (zero-bit) filter in write mode. */
  BF() : _mode(false), _size(0), _bf(0, 0) {}

  /** Creates a filter of `size` bits, all cleared, in write mode. */
  BF(const size_t size) : _mode(false), _size(size), _bf(size, 0) {}

  ~BF() {}

  /** Sets the bit associated with kmer. No-op on a zero-sized filter. */
  void add_key(const char *kmer)
  {
    if (_size == 0)
      return; // avoid modulo by zero
    const uint64_t hash = _get_hash(kmer);
    _bf[hash % _size] = 1;
  }

  /**
   * Returns true when the bit associated with kmer is set.
   * Always false on a zero-sized filter.
   */
  bool test_key(const char *kmer) const
  {
    if (_size == 0)
      return false; // avoid modulo by zero
    const uint64_t hash = _get_hash(kmer);
    return _bf[hash % _size];
  }

  /**
   * Switches to read mode: builds the rank structure over the current bits
   * and allocates one zeroed 16-bit counter per set bit.
   */
  void switch_mode()
  {
    _mode = true;
    _brank = rank_support_v<1>(&_bf);
    _counts = int_vector<16>(_brank(_size), 0, 16);
  }

  /**
   * Adds `counter` to the counter of kmer, if its bit is set.
   * The stored value saturates at 65535 instead of wrapping around.
   *
   * @return false when the filter is still in write mode (nothing is done),
   *         true otherwise, whether or not the bit of kmer is set.
   */
  bool increment(const char *kmer, const uint32 counter)
  {
    if (!_mode)
      return false;
    if (_size == 0)
      return true; // no bits, hence no counter to update
    const uint64_t hash = _get_hash(kmer);
    const size_t bf_idx = hash % _size;
    if (_bf[bf_idx])
    {
      const size_t cnts_idx = _brank(bf_idx);
      const uint16_t current = _counts[cnts_idx];
      // Sum in 64 bits so neither the 16-bit slot nor the 32-bit increment
      // can overflow before clamping.
      const uint64_t sum = static_cast<uint64_t>(current) + counter;
      const uint64_t cap = std::numeric_limits<uint16_t>::max();
      _counts[cnts_idx] = static_cast<uint16_t>(sum < cap ? sum : cap);
    }
    return true;
  }

  /**
   * Returns the counter of kmer, or 0 when the filter is in write mode,
   * is zero-sized, or the bit of kmer is not set.
   */
  uint16_t get_count(const char *kmer) const
  {
    if (_mode && _size != 0)
    {
      const uint64_t hash = _get_hash(kmer);
      const size_t bf_idx = hash % _size;
      if (_bf[bf_idx])
        return _counts[_brank(bf_idx)];
    }
    return 0;
  }

  /**
   * Serializes the filter to stream (usage: `bf >> stream`).
   * Writes the mode flag and the size as raw bytes, then the bit vector
   * and the counters.
   */
  std::ostream &operator>>(std::ostream &stream)
  {
    stream.write(reinterpret_cast<const char *>(&_mode), sizeof(bool));
    stream.write(reinterpret_cast<const char *>(&_size), sizeof(size_t));
    _bf.serialize(stream);
    // We don't serialize _brank since loading it will crash.
    // TODO: this needs some further investigation.
    _counts.serialize(stream);
    return stream;
  }

  /**
   * Loads the filter from stream (usage: `bf << stream`), reading the fields
   * in the order operator>> writes them. The rank structure is rebuilt from
   * the loaded bit vector rather than read from the stream.
   */
  std::istream &operator<<(std::istream &stream)
  {
    stream.read(reinterpret_cast<char *>(&_mode), sizeof(bool));
    stream.read(reinterpret_cast<char *>(&_size), sizeof(size_t));
    _bf.load(stream);
    _brank = rank_support_v<1>(&_bf);
    _counts.load(stream);
    return stream;
  }

private:
  // Copying is disabled: _brank holds a pointer to this object's _bf.
  BF(const BF &other) = delete;
  const BF &operator=(const BF &other) = delete;
  const BF &operator=(const BF &&other) = delete;

  bool _mode;               // false = write, true = read
  size_t _size;             // number of bits in _bf
  bit_vector _bf;           // the Bloom filter bits
  rank_support_v<1> _brank; // rank structure over _bf (valid in read mode)
  int_vector<16> _counts;   // one 16-bit counter per set bit of _bf
};
#endif