@@ -12,13 +12,13 @@ use crate::Diff;
1212use super :: BitVec ;
1313
1414// TODO migrate these const values to be defined in configuration
15- // The current values are only really appropriate for smaller
16- // configurations
15+ // The current values are only really appropriate for the smaller
16+ // diff configuration.
1717
// NOTE(review): both constants are narrowed from `pub` to private in this change;
// any use from outside this module would stop compiling — confirm there is none.
1818/// Number of bits covered by each SimHash bucket.
19- pub const SIM_BUCKET_SIZE : usize = 6 ;
19+ const SIM_BUCKET_SIZE : usize = 6 ;
2020/// Number of consecutive SimHash buckets used for searching.
// Half of this value (`SIM_BUCKETS / 2`) is also used by `sim_hashes_search` as the
// lower bound for the minimum number of matches it reports.
21- pub const SIM_BUCKETS : usize = 20 ;
21+ const SIM_BUCKETS : usize = 20 ;
2222
// Identifier of a SimHash bucket; a plain alias for `usize`.
2323pub type BucketId = usize ;
2424
@@ -77,7 +77,7 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
7777 /// The first argument in the tuple is the bucket id of the `SimHash` which can be used
7878 /// to select a certain subset of `SimHashes`. SimHashes are returned in decreasing order
7979 /// of bucket ids, since that's their natural construction order.
// NOTE(review): the return type changes from `impl Iterator` to `impl ExactSizeIterator`,
// so callers can additionally ask for `len()`. `ExactSizeIterator` has `Iterator` as a
// supertrait, so existing iterator-only callers keep compiling.
80- pub fn sim_hashes ( & self ) -> impl Iterator < Item = ( BucketId , SimHash ) > + ' _ {
80+ pub fn sim_hashes ( & self ) -> impl ExactSizeIterator < Item = ( BucketId , SimHash ) > + ' _ {
// The returned iterator is a `SimHashIterator` borrowing `self`.
8181 SimHashIterator :: new ( self )
8282 }
8383
@@ -89,15 +89,29 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
8989 . map ( |( _, sim_hash) | sim_hash)
9090 }
9191
92+ /// Get the `SimHash`es of this filter that are relevant for a search, plus a match threshold.
93+ ///
94+ /// Returns a tuple: an iterator over the `SimHash`es whose bucket ids fall into the range
95+ /// selected by `sim_hash_range(expected_diff_size)`, and the minimum number of matches
96+ /// required to consider this filter a match to a given filter, given the expected diff size.
97+ ///
98+ /// The geo_filter can be used to do an "exact" search by setting `expected_diff_size` to zero.
99+ /// In this case, all the buckets must match. Similarly, small differences can be found by
100+ /// requiring `expected_diff_size` fewer matches than there are buckets. The required count
101+ /// never drops below `SIM_BUCKETS / 2`, which is the value that applies to larger differences.
92102 pub fn sim_hashes_search (
93103 & self ,
94104 expected_diff_size : usize ,
95- ) -> impl Iterator < Item = SimHash > + ' _ {
105+ ) -> ( impl Iterator < Item = SimHash > + ' _ , usize ) {
96106 let range = self . sim_hash_range ( expected_diff_size) ;
97- self . sim_hashes ( )
107+ let sim_hash_iter = self . sim_hashes ( ) ;
// `n` is the smaller of the range length and the total number of SimHashes that
// `sim_hashes()` will yield; `len()` is taken before the range filtering below.
108+ let n = range. len ( ) . min ( sim_hash_iter. len ( ) ) ;
// Tolerate up to `expected_diff_size` mismatches (saturating at zero), but never
// require fewer than `SIM_BUCKETS / 2` matches.
// NOTE(review): because of the `max`, the threshold can exceed `n` when fewer than
// `SIM_BUCKETS / 2` SimHashes are available — confirm callers expect "cannot match" then.
109+ let min_matches = n. saturating_sub ( expected_diff_size) . max ( SIM_BUCKETS / 2 ) ;
110+ let filtered_iter = sim_hash_iter
// `sim_hashes` documents a decreasing bucket-id order, so first skip every id at or
// above `range.end`, then keep ids until one drops below `range.start`.
98111 . skip_while ( move |( bucket_id, _) | * bucket_id >= range. end )
99112 . take_while ( move |( bucket_id, _) | * bucket_id >= range. start )
// Drop the bucket id; only the `SimHash` itself is returned to the caller.
100- . map ( |( _, sim_hash) | sim_hash)
113+ . map ( |( _, sim_hash) | sim_hash) ;
114+ ( filtered_iter, min_matches)
101115 }
102116}
103117
@@ -152,8 +166,14 @@ impl<C: GeoConfig<Diff>> Iterator for SimHashIterator<'_, C> {
152166 SimHash :: new ( self . prev_bucket_id , self . sim_hash [ bucket] ) ,
153167 ) )
154168 }
169+
// Reports `prev_bucket_id` as both the lower and the upper bound of the remaining length.
// NOTE(review): this is only exact if `next` yields exactly `prev_bucket_id` more items
// from this state; `next` is not fully visible here — confirm, since the
// `ExactSizeIterator` impl for this type relies on the hint being exact.
170+ fn size_hint ( & self ) -> ( usize , Option < usize > ) {
171+ ( self . prev_bucket_id , Some ( self . prev_bucket_id ) )
172+ }
155173}
156174
// Marker impl with an empty body: the default `ExactSizeIterator::len` is derived from
// `size_hint`, so the lower and upper bounds reported there must be equal and exact.
175+ impl < C : GeoConfig < Diff > > ExactSizeIterator for SimHashIterator < ' _ , C > { }
176+
157177impl < C : GeoConfig < Diff > > GeoDiffCount < ' _ , C > {
158178 /// n specifies the desired zero-based index of the most significant one.
159179 /// The zero-based index of the desired one bit is returned.
0 commit comments