Skip to content

Commit 301857a

Browse files
feat(http): diversified knowledge retrieval with multi-tier allocation
Replace single-pass usage-sorted selection with a 4-tier allocation strategy to prevent popularity bias (93%+ of entries were never retrieved). Tiers: project-relevant (40%), recent (20%), proven high-usage (20%), and daily-rotated exploration of unseen entries (remaining slots). Includes 6 new tests. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
1 parent a0a70cd commit 301857a

1 file changed

Lines changed: 222 additions & 29 deletions

File tree

crates/http/src/handlers/context.rs

Lines changed: 222 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ use axum::{
44
extract::{Query, State},
55
response::sse::{Event, Sse},
66
};
7+
use chrono::{Datelike, Utc};
78
use futures_util::stream::Stream;
89
use serde_json::json;
10+
use std::collections::HashSet;
911
use std::convert::Infallible;
1012
use std::sync::Arc;
1113
use tokio::sync::broadcast::error::RecvError;
@@ -87,8 +89,16 @@ async fn fetch_relevant_knowledge(
8789
selected
8890
}
8991

92+
/// Selects knowledge entries using a multi-tier allocation strategy to prevent
93+
/// popularity bias (rich-get-richer death spiral where 93%+ entries never get retrieved).
94+
///
95+
/// Tier allocation for `limit=10` (tiers 1–3 are each floored at one slot, so very small limits deviate from these shares):
96+
/// - Tier 1 (40%): Project-relevant entries sorted by confidence
97+
/// - Tier 2 (20%): Recent entries (recency boost for new knowledge)
98+
/// - Tier 3 (20%): Proven entries with highest usage_count
99+
/// - Tier 4 (remaining): Exploration — never-seen entries rotated daily
90100
fn select_relevant_knowledge(
91-
mut entries: Vec<GlobalKnowledge>,
101+
entries: Vec<GlobalKnowledge>,
92102
project: &str,
93103
limit: usize,
94104
) -> Vec<GlobalKnowledge> {
@@ -97,37 +107,88 @@ fn select_relevant_knowledge(
97107
}
98108

99109
let normalized_project = project.trim().to_ascii_lowercase();
100-
entries.sort_by(|a, b| {
110+
let mut selected: Vec<GlobalKnowledge> = Vec::with_capacity(limit);
111+
let mut used_ids: HashSet<String> = HashSet::new();
112+
113+
// Tier 1: Project-relevant entries (up to 40% of slots)
114+
let project_slots = (limit * 2 / 5).max(1);
115+
let mut project_entries: Vec<_> = entries
116+
.iter()
117+
.filter(|k| {
118+
k.source_projects
119+
.iter()
120+
.any(|p| p.trim().to_ascii_lowercase() == normalized_project)
121+
})
122+
.cloned()
123+
.collect();
124+
project_entries.sort_by(|a, b| b.confidence.total_cmp(&a.confidence));
125+
for entry in project_entries.into_iter().take(project_slots) {
126+
used_ids.insert(entry.id.clone());
127+
selected.push(entry);
128+
}
129+
130+
// Tier 2: Recent entries (up to 20% of slots) — recency boost for new knowledge
131+
let recency_slots = (limit / 5).max(1);
132+
let mut recent_entries: Vec<_> = entries
133+
.iter()
134+
.filter(|k| !used_ids.contains(&k.id))
135+
.cloned()
136+
.collect();
137+
recent_entries.sort_by(|a, b| b.created_at.cmp(&a.created_at));
138+
for entry in recent_entries.into_iter().take(recency_slots) {
139+
used_ids.insert(entry.id.clone());
140+
selected.push(entry);
141+
}
142+
143+
// Tier 3: High-value proven entries (up to 20% of slots) — usage_count matters here
144+
let proven_slots = (limit / 5).max(1);
145+
let mut proven_entries: Vec<_> = entries
146+
.iter()
147+
.filter(|k| !used_ids.contains(&k.id) && k.usage_count > 0)
148+
.cloned()
149+
.collect();
150+
proven_entries.sort_by(|a, b| {
101151
b.usage_count
102152
.cmp(&a.usage_count)
103153
.then_with(|| b.confidence.total_cmp(&a.confidence))
104-
.then_with(|| a.title.cmp(&b.title))
105154
});
106-
107-
let mut selected = Vec::with_capacity(limit);
108-
109-
for entry in &entries {
110-
if entry.source_projects.iter().any(|source| {
111-
let normalized_source = source.trim().to_ascii_lowercase();
112-
normalized_source == normalized_project
113-
}) {
114-
selected.push(entry.clone());
115-
if selected.len() == limit {
116-
return selected;
117-
}
118-
}
155+
for entry in proven_entries.into_iter().take(proven_slots) {
156+
used_ids.insert(entry.id.clone());
157+
selected.push(entry);
119158
}
120159

121-
for entry in entries {
122-
if selected.iter().any(|picked| picked.id == entry.id) {
123-
continue;
160+
// Tier 4: Exploration — never-seen entries (remaining slots)
161+
// Breaks the death spiral: entries with usage_count=0 get a chance.
162+
// Deterministic daily rotation ensures different entries surface each day.
163+
let remaining = limit.saturating_sub(selected.len());
164+
if remaining > 0 {
165+
let mut unseen: Vec<_> = entries
166+
.iter()
167+
.filter(|k| !used_ids.contains(&k.id) && k.usage_count == 0)
168+
.cloned()
169+
.collect();
170+
if !unseen.is_empty() {
171+
let day_seed = u32::try_from(Utc::now().num_days_from_ce()).unwrap_or(0) as usize;
172+
let rotate_by = day_seed % unseen.len();
173+
unseen.rotate_left(rotate_by);
124174
}
125-
selected.push(entry);
126-
if selected.len() == limit {
127-
break;
175+
for entry in unseen.into_iter().take(remaining) {
176+
selected.push(entry);
128177
}
129178
}
130179

180+
// If we still have room (fewer unseen than remaining), backfill from any unused
181+
let still_remaining = limit.saturating_sub(selected.len());
182+
if still_remaining > 0 {
183+
let final_used: HashSet<_> = selected.iter().map(|k| k.id.as_str()).collect();
184+
let backfill: Vec<_> = entries
185+
.into_iter()
186+
.filter(|k| !final_used.contains(k.id.as_str()))
187+
.take(still_remaining)
188+
.collect();
189+
selected.extend(backfill);
190+
}
191+
131192
selected
132193
}
133194

@@ -183,6 +244,24 @@ mod tests {
183244
title: &str,
184245
source_projects: Vec<&str>,
185246
usage_count: i64,
247+
) -> GlobalKnowledge {
248+
sample_knowledge_full(
249+
id,
250+
title,
251+
source_projects,
252+
usage_count,
253+
0.5,
254+
"2026-01-01T00:00:00Z",
255+
)
256+
}
257+
258+
fn sample_knowledge_full(
259+
id: &str,
260+
title: &str,
261+
source_projects: Vec<&str>,
262+
usage_count: i64,
263+
confidence: f64,
264+
created_at: &str,
186265
) -> GlobalKnowledge {
187266
GlobalKnowledge::new(
188267
id.to_owned(),
@@ -193,17 +272,17 @@ mod tests {
193272
vec![],
194273
source_projects.into_iter().map(str::to_owned).collect(),
195274
vec![],
196-
0.5,
275+
confidence,
197276
usage_count,
198277
None,
199-
"2026-01-01T00:00:00Z".to_owned(),
200-
"2026-01-01T00:00:00Z".to_owned(),
278+
created_at.to_owned(),
279+
created_at.to_owned(),
201280
None,
202281
)
203282
}
204283

205284
#[test]
206-
fn select_relevant_knowledge_prioritizes_project_matches_then_usage() {
285+
fn select_relevant_knowledge_prioritizes_project_matches() {
207286
let entries = vec![
208287
sample_knowledge("global-100", "global high", vec![], 100),
209288
sample_knowledge("global-90", "global medium", vec![], 90),
@@ -214,9 +293,123 @@ mod tests {
214293
let selected = select_relevant_knowledge(entries, "demo", 3);
215294

216295
assert_eq!(selected.len(), 3);
217-
assert_eq!(selected[0].id, "project-10");
218-
assert_eq!(selected[1].id, "project-1");
219-
assert_eq!(selected[2].id, "global-100");
296+
// Tier 1 has a single slot for limit=3 (limit*2/5 = 1), even though there are 2 project entries; project-10 must still be selected.
297+
assert!(selected.iter().any(|k| k.id == "project-10"));
298+
}
299+
300+
#[test]
301+
fn empty_input_returns_empty() {
302+
let selected = select_relevant_knowledge(vec![], "demo", 10);
303+
assert!(selected.is_empty());
304+
}
305+
306+
#[test]
307+
fn zero_limit_returns_empty() {
308+
let entries = vec![sample_knowledge("a", "a", vec![], 0)];
309+
let selected = select_relevant_knowledge(entries, "demo", 0);
310+
assert!(selected.is_empty());
311+
}
312+
313+
#[test]
314+
fn tier4_exploration_surfaces_unseen_entries() {
315+
// All entries have usage_count=0, none are project-specific
316+
let entries: Vec<_> = (0..20)
317+
.map(|i| sample_knowledge(&format!("k-{i}"), &format!("knowledge {i}"), vec![], 0))
318+
.collect();
319+
320+
let selected = select_relevant_knowledge(entries, "demo", 10);
321+
322+
assert_eq!(selected.len(), 10);
323+
// Which entries surface depends on the daily rotation, so only the count is asserted here.
324+
}
325+
326+
#[test]
327+
fn no_duplicate_ids_in_selection() {
328+
let entries = vec![
329+
sample_knowledge_full(
330+
"a",
331+
"proj entry",
332+
vec!["demo"],
333+
5,
334+
0.9,
335+
"2026-03-15T00:00:00Z",
336+
),
337+
sample_knowledge_full("b", "recent", vec![], 0, 0.5, "2026-03-14T00:00:00Z"),
338+
sample_knowledge_full("c", "proven", vec![], 10, 0.7, "2026-01-01T00:00:00Z"),
339+
sample_knowledge_full("d", "unseen", vec![], 0, 0.3, "2026-01-01T00:00:00Z"),
340+
];
341+
342+
let selected = select_relevant_knowledge(entries, "demo", 4);
343+
344+
let ids: Vec<_> = selected.iter().map(|k| k.id.as_str()).collect();
345+
let unique: std::collections::HashSet<_> = ids.iter().collect();
346+
assert_eq!(
347+
ids.len(),
348+
unique.len(),
349+
"duplicate IDs in selection: {ids:?}"
350+
);
351+
}
352+
353+
#[test]
354+
fn multi_tier_allocation_with_limit_10() {
355+
let mut entries = Vec::new();
356+
// 3 project entries
357+
for i in 0..3 {
358+
entries.push(sample_knowledge_full(
359+
&format!("proj-{i}"),
360+
&format!("proj {i}"),
361+
vec!["demo"],
362+
0,
363+
0.9 - (i as f64 * 0.1),
364+
"2026-01-01T00:00:00Z",
365+
));
366+
}
367+
// 3 proven entries (high usage)
368+
for i in 0..3 {
369+
entries.push(sample_knowledge_full(
370+
&format!("proven-{i}"),
371+
&format!("proven {i}"),
372+
vec![],
373+
50 - (i as i64 * 10),
374+
0.8,
375+
"2026-01-01T00:00:00Z",
376+
));
377+
}
378+
// 10 unseen entries
379+
for i in 0..10 {
380+
entries.push(sample_knowledge_full(
381+
&format!("unseen-{i}"),
382+
&format!("unseen {i}"),
383+
vec![],
384+
0,
385+
0.5,
386+
"2026-01-01T00:00:00Z",
387+
));
388+
}
389+
390+
let selected = select_relevant_knowledge(entries, "demo", 10);
391+
392+
assert_eq!(selected.len(), 10);
393+
394+
let project_count = selected
395+
.iter()
396+
.filter(|k| k.id.starts_with("proj-"))
397+
.count();
398+
let proven_count = selected
399+
.iter()
400+
.filter(|k| k.id.starts_with("proven-"))
401+
.count();
402+
let unseen_count = selected
403+
.iter()
404+
.filter(|k| k.id.starts_with("unseen-"))
405+
.count();
406+
407+
// Tier 1: 4 project slots, but only 3 available
408+
assert!(project_count >= 1, "should include project entries");
409+
// Tier 3: proven entries should appear
410+
assert!(proven_count >= 1, "should include proven entries");
411+
// Tier 4: unseen entries should fill remaining slots
412+
assert!(unseen_count >= 1, "should include exploration entries");
220413
}
221414
}
222415

0 commit comments

Comments
 (0)