@@ -4,8 +4,10 @@ use axum::{
44 extract:: { Query , State } ,
55 response:: sse:: { Event , Sse } ,
66} ;
7+ use chrono:: { Datelike , Utc } ;
78use futures_util:: stream:: Stream ;
89use serde_json:: json;
10+ use std:: collections:: HashSet ;
911use std:: convert:: Infallible ;
1012use std:: sync:: Arc ;
1113use tokio:: sync:: broadcast:: error:: RecvError ;
@@ -87,8 +89,16 @@ async fn fetch_relevant_knowledge(
8789 selected
8890}
8991
92+ /// Selects knowledge entries using a multi-tier allocation strategy to prevent
93+ /// popularity bias (rich-get-richer death spiral where 93%+ entries never get retrieved).
94+ ///
95+ /// Tier allocation for `limit=10`:
96+ /// - Tier 1 (40%): Project-relevant entries sorted by confidence
97+ /// - Tier 2 (20%): Recent entries (recency boost for new knowledge)
98+ /// - Tier 3 (20%): Proven entries with highest usage_count
99+ /// - Tier 4 (remaining): Exploration — never-seen entries rotated daily
90100fn select_relevant_knowledge (
91- mut entries : Vec < GlobalKnowledge > ,
101+ entries : Vec < GlobalKnowledge > ,
92102 project : & str ,
93103 limit : usize ,
94104) -> Vec < GlobalKnowledge > {
@@ -97,37 +107,88 @@ fn select_relevant_knowledge(
97107 }
98108
99109 let normalized_project = project. trim ( ) . to_ascii_lowercase ( ) ;
100- entries. sort_by ( |a, b| {
110+ let mut selected: Vec < GlobalKnowledge > = Vec :: with_capacity ( limit) ;
111+ let mut used_ids: HashSet < String > = HashSet :: new ( ) ;
112+
113+ // Tier 1: Project-relevant entries (up to 40% of slots)
114+ let project_slots = ( limit * 2 / 5 ) . max ( 1 ) ;
115+ let mut project_entries: Vec < _ > = entries
116+ . iter ( )
117+ . filter ( |k| {
118+ k. source_projects
119+ . iter ( )
120+ . any ( |p| p. trim ( ) . to_ascii_lowercase ( ) == normalized_project)
121+ } )
122+ . cloned ( )
123+ . collect ( ) ;
124+ project_entries. sort_by ( |a, b| b. confidence . total_cmp ( & a. confidence ) ) ;
125+ for entry in project_entries. into_iter ( ) . take ( project_slots) {
126+ used_ids. insert ( entry. id . clone ( ) ) ;
127+ selected. push ( entry) ;
128+ }
129+
130+ // Tier 2: Recent entries (up to 20% of slots) — recency boost for new knowledge
131+ let recency_slots = ( limit / 5 ) . max ( 1 ) ;
132+ let mut recent_entries: Vec < _ > = entries
133+ . iter ( )
134+ . filter ( |k| !used_ids. contains ( & k. id ) )
135+ . cloned ( )
136+ . collect ( ) ;
137+ recent_entries. sort_by ( |a, b| b. created_at . cmp ( & a. created_at ) ) ;
138+ for entry in recent_entries. into_iter ( ) . take ( recency_slots) {
139+ used_ids. insert ( entry. id . clone ( ) ) ;
140+ selected. push ( entry) ;
141+ }
142+
143+ // Tier 3: High-value proven entries (up to 20% of slots) — usage_count matters here
144+ let proven_slots = ( limit / 5 ) . max ( 1 ) ;
145+ let mut proven_entries: Vec < _ > = entries
146+ . iter ( )
147+ . filter ( |k| !used_ids. contains ( & k. id ) && k. usage_count > 0 )
148+ . cloned ( )
149+ . collect ( ) ;
150+ proven_entries. sort_by ( |a, b| {
101151 b. usage_count
102152 . cmp ( & a. usage_count )
103153 . then_with ( || b. confidence . total_cmp ( & a. confidence ) )
104- . then_with ( || a. title . cmp ( & b. title ) )
105154 } ) ;
106-
107- let mut selected = Vec :: with_capacity ( limit) ;
108-
109- for entry in & entries {
110- if entry. source_projects . iter ( ) . any ( |source| {
111- let normalized_source = source. trim ( ) . to_ascii_lowercase ( ) ;
112- normalized_source == normalized_project
113- } ) {
114- selected. push ( entry. clone ( ) ) ;
115- if selected. len ( ) == limit {
116- return selected;
117- }
118- }
155+ for entry in proven_entries. into_iter ( ) . take ( proven_slots) {
156+ used_ids. insert ( entry. id . clone ( ) ) ;
157+ selected. push ( entry) ;
119158 }
120159
121- for entry in entries {
122- if selected. iter ( ) . any ( |picked| picked. id == entry. id ) {
123- continue ;
160+ // Tier 4: Exploration — never-seen entries (remaining slots)
161+ // Breaks the death spiral: entries with usage_count=0 get a chance.
162+ // Deterministic daily rotation ensures different entries surface each day.
163+ let remaining = limit. saturating_sub ( selected. len ( ) ) ;
164+ if remaining > 0 {
165+ let mut unseen: Vec < _ > = entries
166+ . iter ( )
167+ . filter ( |k| !used_ids. contains ( & k. id ) && k. usage_count == 0 )
168+ . cloned ( )
169+ . collect ( ) ;
170+ if !unseen. is_empty ( ) {
171+ let day_seed = u32:: try_from ( Utc :: now ( ) . num_days_from_ce ( ) ) . unwrap_or ( 0 ) as usize ;
172+ let rotate_by = day_seed % unseen. len ( ) ;
173+ unseen. rotate_left ( rotate_by) ;
124174 }
125- selected. push ( entry) ;
126- if selected. len ( ) == limit {
127- break ;
175+ for entry in unseen. into_iter ( ) . take ( remaining) {
176+ selected. push ( entry) ;
128177 }
129178 }
130179
180+ // If we still have room (fewer unseen than remaining), backfill from any unused
181+ let still_remaining = limit. saturating_sub ( selected. len ( ) ) ;
182+ if still_remaining > 0 {
183+ let final_used: HashSet < _ > = selected. iter ( ) . map ( |k| k. id . as_str ( ) ) . collect ( ) ;
184+ let backfill: Vec < _ > = entries
185+ . into_iter ( )
186+ . filter ( |k| !final_used. contains ( k. id . as_str ( ) ) )
187+ . take ( still_remaining)
188+ . collect ( ) ;
189+ selected. extend ( backfill) ;
190+ }
191+
131192 selected
132193}
133194
@@ -183,6 +244,24 @@ mod tests {
183244 title : & str ,
184245 source_projects : Vec < & str > ,
185246 usage_count : i64 ,
247+ ) -> GlobalKnowledge {
248+ sample_knowledge_full (
249+ id,
250+ title,
251+ source_projects,
252+ usage_count,
253+ 0.5 ,
254+ "2026-01-01T00:00:00Z" ,
255+ )
256+ }
257+
258+ fn sample_knowledge_full (
259+ id : & str ,
260+ title : & str ,
261+ source_projects : Vec < & str > ,
262+ usage_count : i64 ,
263+ confidence : f64 ,
264+ created_at : & str ,
186265 ) -> GlobalKnowledge {
187266 GlobalKnowledge :: new (
188267 id. to_owned ( ) ,
@@ -193,17 +272,17 @@ mod tests {
193272 vec ! [ ] ,
194273 source_projects. into_iter ( ) . map ( str:: to_owned) . collect ( ) ,
195274 vec ! [ ] ,
196- 0.5 ,
275+ confidence ,
197276 usage_count,
198277 None ,
199- "2026-01-01T00:00:00Z" . to_owned ( ) ,
200- "2026-01-01T00:00:00Z" . to_owned ( ) ,
278+ created_at . to_owned ( ) ,
279+ created_at . to_owned ( ) ,
201280 None ,
202281 )
203282 }
204283
205284 #[ test]
206- fn select_relevant_knowledge_prioritizes_project_matches_then_usage ( ) {
285+ fn select_relevant_knowledge_prioritizes_project_matches ( ) {
207286 let entries = vec ! [
208287 sample_knowledge( "global-100" , "global high" , vec![ ] , 100 ) ,
209288 sample_knowledge( "global-90" , "global medium" , vec![ ] , 90 ) ,
@@ -214,9 +293,123 @@ mod tests {
214293 let selected = select_relevant_knowledge ( entries, "demo" , 3 ) ;
215294
216295 assert_eq ! ( selected. len( ) , 3 ) ;
217- assert_eq ! ( selected[ 0 ] . id, "project-10" ) ;
218- assert_eq ! ( selected[ 1 ] . id, "project-1" ) ;
219- assert_eq ! ( selected[ 2 ] . id, "global-100" ) ;
296+ // Tier 1 has only limit*2/5 = 1 slot for limit=3, so one of the 2 project entries takes it; the other can only arrive via a later tier.
297+ assert ! ( selected. iter( ) . any( |k| k. id == "project-10" ) ) ;
298+ }
299+
#[test]
fn empty_input_returns_empty() {
    // With no candidates at all the result must be empty, whatever the limit is.
    let no_entries: Vec<GlobalKnowledge> = Vec::new();
    let picked = select_relevant_knowledge(no_entries, "demo", 10);
    assert!(picked.is_empty());
}
305+
#[test]
fn zero_limit_returns_empty() {
    // A limit of zero must yield nothing even though a candidate is available.
    let candidates = vec![sample_knowledge("a", "a", vec![], 0)];
    assert!(select_relevant_knowledge(candidates, "demo", 0).is_empty());
}
312+
#[test]
fn tier4_exploration_surfaces_unseen_entries() {
    // All entries have usage_count=0 and none are project-specific, so every slot
    // not taken by the recency tier has to be filled by the exploration tier.
    let entries: Vec<_> = (0..20)
        .map(|i| sample_knowledge(&format!("k-{i}"), &format!("knowledge {i}"), vec![], 0))
        .collect();

    let selected = select_relevant_knowledge(entries, "demo", 10);

    assert_eq!(selected.len(), 10);

    // Which entries surface depends on the day-based rotation, so assert properties
    // that hold for any rotation offset: only never-used entries, and no entry twice.
    assert!(
        selected.iter().all(|k| k.usage_count == 0),
        "exploration must only surface never-used entries"
    );
    let unique: std::collections::HashSet<_> = selected.iter().map(|k| k.id.as_str()).collect();
    assert_eq!(
        unique.len(),
        selected.len(),
        "rotated exploration window overlapped an earlier tier"
    );
}
325+
#[test]
fn no_duplicate_ids_in_selection() {
    // One candidate per tier: project match, most recent, most used, never used.
    let project_entry =
        sample_knowledge_full("a", "proj entry", vec!["demo"], 5, 0.9, "2026-03-15T00:00:00Z");
    let recent_entry = sample_knowledge_full("b", "recent", vec![], 0, 0.5, "2026-03-14T00:00:00Z");
    let proven_entry = sample_knowledge_full("c", "proven", vec![], 10, 0.7, "2026-01-01T00:00:00Z");
    let unseen_entry = sample_knowledge_full("d", "unseen", vec![], 0, 0.3, "2026-01-01T00:00:00Z");

    let selected = select_relevant_knowledge(
        vec![project_entry, recent_entry, proven_entry, unseen_entry],
        "demo",
        4,
    );

    // Every id must survive de-duplication, i.e. no tier re-picked an earlier entry.
    let ids: Vec<&str> = selected.iter().map(|k| k.id.as_str()).collect();
    let distinct: std::collections::HashSet<&str> = ids.iter().copied().collect();
    assert_eq!(
        ids.len(),
        distinct.len(),
        "duplicate IDs in selection: {ids:?}"
    );
}
352+
#[test]
fn multi_tier_allocation_with_limit_10() {
    // Every fixture shares one creation date, so recency never separates the groups.
    const CREATED_AT: &str = "2026-01-01T00:00:00Z";

    let mut entries = Vec::new();
    // 3 never-used project entries with confidence 0.9, 0.8, 0.7
    for i in 0..3 {
        entries.push(sample_knowledge_full(
            &format!("proj-{i}"),
            &format!("proj {i}"),
            vec!["demo"],
            0,
            0.9 - (i as f64 * 0.1),
            CREATED_AT,
        ));
    }
    // 3 proven entries with usage 50, 40, 30
    for i in 0..3 {
        entries.push(sample_knowledge_full(
            &format!("proven-{i}"),
            &format!("proven {i}"),
            vec![],
            50 - (i as i64 * 10),
            0.8,
            CREATED_AT,
        ));
    }
    // 10 unseen entries
    for i in 0..10 {
        entries.push(sample_knowledge_full(
            &format!("unseen-{i}"),
            &format!("unseen {i}"),
            vec![],
            0,
            0.5,
            CREATED_AT,
        ));
    }

    let selected = select_relevant_knowledge(entries, "demo", 10);

    assert_eq!(selected.len(), 10);

    let count_with_prefix =
        |prefix: &str| selected.iter().filter(|k| k.id.starts_with(prefix)).count();
    let project_count = count_with_prefix("proj-");
    let proven_count = count_with_prefix("proven-");
    let unseen_count = count_with_prefix("unseen-");

    // Tier 1 has 10 * 2 / 5 = 4 slots and only 3 project entries exist, so all 3 are in.
    assert_eq!(project_count, 3, "should include every project entry");
    // Tier 3: proven entries should appear
    assert!(proven_count >= 1, "should include proven entries");
    // Tier 4: unseen entries should fill remaining slots
    assert!(unseen_count >= 1, "should include exploration entries");
}
221414}
222415
0 commit comments