Skip to content

Commit 80ae11c

Browse files
authored
Merge pull request #77 from github/sc-20250730-sim-hash
`SimHash` encapsulation
2 parents f6bc74b + 728814c commit 80ae11c

File tree

2 files changed

+29
-9
lines changed

2 files changed

+29
-9
lines changed

crates/geo_filters/src/diff_count.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ mod sim_hash;
1818

1919
use bitvec::*;
2020
pub use config::{GeoDiffConfig13, GeoDiffConfig7};
21-
pub use sim_hash::{SimHash, SIM_BUCKETS, SIM_BUCKET_SIZE};
21+
pub use sim_hash::SimHash;
2222

2323
/// Diff count filter with a relative error standard deviation of ~0.125.
2424
pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>;

crates/geo_filters/src/diff_count/sim_hash.rs

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@ use crate::Diff;
1212
use super::BitVec;
1313

1414
// TODO migrate these const values to be defined in configuration
15-
// The current values are only really appropriate for smaller
16-
// configurations
15+
// The current values are only really appropriate for the smaller
16+
// diff configuration.
1717

1818
/// Number of bits covered by each SimHash bucket.
19-
pub const SIM_BUCKET_SIZE: usize = 6;
19+
const SIM_BUCKET_SIZE: usize = 6;
2020
/// Number of consecutive SimHash buckets used for searching.
21-
pub const SIM_BUCKETS: usize = 20;
21+
const SIM_BUCKETS: usize = 20;
2222

2323
pub type BucketId = usize;
2424

@@ -77,7 +77,7 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
7777
/// The first argument in the tuple is the bucket id of the `SimHash` which can be used
7878
/// to select a certain subset of `SimHashes`. SimHashes are returned in decreasing order
7979
/// of bucket ids, since that's their natural construction order.
80-
pub fn sim_hashes(&self) -> impl Iterator<Item = (BucketId, SimHash)> + '_ {
80+
pub fn sim_hashes(&self) -> impl ExactSizeIterator<Item = (BucketId, SimHash)> + '_ {
8181
SimHashIterator::new(self)
8282
}
8383

@@ -89,15 +89,29 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
8989
.map(|(_, sim_hash)| sim_hash)
9090
}
9191

92+
/// Get the `SimHash`es for this filter for the purpose of performing a search.
93+
///
94+
/// Returns an iterator of the `SimHash`es and a number representing the minimum number
95+
/// of matches required to consider this filter a match to a given filter, given
96+
/// the expected diff size.
97+
///
98+
/// The geo filter can be used to do an "exact" search by setting `expected_diff_size` to zero.
99+
/// In this case, all the buckets must match. Similarly, small differences can be found by
100+
/// requiring at least `SIM_BUCKETS - expected_diff_size` buckets to match. For larger differences
101+
/// at least `SIM_BUCKETS / 2` buckets have to match.
92102
pub fn sim_hashes_search(
93103
&self,
94104
expected_diff_size: usize,
95-
) -> impl Iterator<Item = SimHash> + '_ {
105+
) -> (impl Iterator<Item = SimHash> + '_, usize) {
96106
let range = self.sim_hash_range(expected_diff_size);
97-
self.sim_hashes()
107+
let sim_hash_iter = self.sim_hashes();
108+
let n = range.len().min(sim_hash_iter.len());
109+
let min_matches = n.saturating_sub(expected_diff_size).max(SIM_BUCKETS / 2);
110+
let filtered_iter = sim_hash_iter
98111
.skip_while(move |(bucket_id, _)| *bucket_id >= range.end)
99112
.take_while(move |(bucket_id, _)| *bucket_id >= range.start)
100-
.map(|(_, sim_hash)| sim_hash)
113+
.map(|(_, sim_hash)| sim_hash);
114+
(filtered_iter, min_matches)
101115
}
102116
}
103117

@@ -152,8 +166,14 @@ impl<C: GeoConfig<Diff>> Iterator for SimHashIterator<'_, C> {
152166
SimHash::new(self.prev_bucket_id, self.sim_hash[bucket]),
153167
))
154168
}
169+
170+
fn size_hint(&self) -> (usize, Option<usize>) {
171+
(self.prev_bucket_id, Some(self.prev_bucket_id))
172+
}
155173
}
156174

175+
impl<C: GeoConfig<Diff>> ExactSizeIterator for SimHashIterator<'_, C> {}
176+
157177
impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
158178
/// n specifies the desired zero-based index of the most significant one.
159179
/// The zero-based index of the desired one bit is returned.

0 commit comments

Comments
 (0)