@@ -66,10 +66,15 @@ void InstanceMgr::init() {
6666 for (auto & it : ETCD_KEYS_PREFIX_MAP) {
6767 etcd_client_->get_prefix (it.second , &instances_);
6868 }
69- // create ttft predictor for each instance
70- for (auto & pair : instances_) {
71- ttft_predictors_.insert_or_assign (
72- pair.first , TtftPredictor (pair.second .ttft_profiling_data ));
69+ // create ttft predictor and request metrics for each instance
70+ {
71+ std::lock_guard<std::mutex> ttft_predictor_lock (ttft_predictor_mutex_);
72+ std::lock_guard<std::mutex> request_metrics_lock (request_metrics_mutex_);
73+ for (auto & pair : instances_) {
74+ ttft_predictors_.insert_or_assign (
75+ pair.first , TtftPredictor (pair.second .ttft_profiling_data ));
76+ request_metrics_.insert_or_assign (pair.first , RequestMetrics ());
77+ }
7378 }
7479 LOG (INFO) << " Load instance info from etcd:" << instances_.size ();
7580 std::vector<std::string> channel_creat_fail_insts;
@@ -99,7 +104,13 @@ void InstanceMgr::init() {
99104 }
100105 for (auto & name : channel_creat_fail_insts) {
101106 instances_.erase (name);
102- ttft_predictors_.erase (name);
107+ {
108+ std::lock_guard<std::mutex> ttft_predictor_lock (ttft_predictor_mutex_);
109+ std::lock_guard<std::mutex> request_metrics_lock (
110+ request_metrics_mutex_);
111+ ttft_predictors_.erase (name);
112+ request_metrics_.erase (name);
113+ }
103114 }
104115 }
105116 {
@@ -340,9 +351,18 @@ void InstanceMgr::update_instance_metainfo(const etcd::Response& response,
340351 continue ;
341352 }
342353
343- // create ttft predictor for instance
344- ttft_predictors_.emplace (
345- iter.first , TtftPredictor (iter.second .ttft_profiling_data ));
354+ {
355+ std::lock_guard<std::mutex> ttft_predictor_lock (
356+ ttft_predictor_mutex_);
357+ std::lock_guard<std::mutex> request_metrics_lock (
358+ request_metrics_mutex_);
359+ // create ttft predictor for instance
360+ ttft_predictors_.emplace (
361+ iter.first , TtftPredictor (iter.second .ttft_profiling_data ));
362+
363+ // create request metrics for instance
364+ request_metrics_.emplace (iter.first , RequestMetrics ());
365+ }
346366
347367 instances_.insert (std::make_pair (iter.first , std::move (iter.second )));
348368
@@ -395,8 +415,15 @@ void InstanceMgr::update_instance_metainfo(const etcd::Response& response,
395415 }
396416
397417 instances_.erase (iter);
398- ttft_predictors_.erase (iter);
399418 cached_channels_.erase (iter);
419+ {
420+ std::lock_guard<std::mutex> ttft_predictor_lock (
421+ ttft_predictor_mutex_);
422+ std::lock_guard<std::mutex> request_metrics_lock (
423+ request_metrics_mutex_);
424+ ttft_predictors_.erase (iter);
425+ request_metrics_.erase (iter);
426+ }
400427 {
401428 std::lock_guard<std::mutex> lock (update_mutex_);
402429 updated_metrics_.erase (iter);
@@ -450,4 +477,74 @@ void InstanceMgr::update_load_metrics(const etcd::Response& response,
450477 });
451478}
452479
480+ void InstanceMgr::update_latency_metrics (
481+ const std::string& instance_name,
482+ const proto::LatencyMetrics& latency_metrics) {
483+ std::lock_guard<std::mutex> lock (latency_metrics_mutex_);
484+
485+ latency_metrics_.insert_or_assign (
486+ instance_name,
487+ LatencyMetrics (latency_metrics.recent_max_ttft (),
488+ latency_metrics.recent_max_tbt ()));
489+ }
490+
491+ void InstanceMgr::update_request_metrics (std::shared_ptr<Request> request,
492+ RequestAction action) {
493+ std::lock_guard<std::mutex> lock (request_metrics_mutex_);
494+
495+ auto prefill_it = request_metrics_.find (request->routing .prefill_name );
496+ if (prefill_it == request_metrics_.end ()) {
497+ LOG (ERROR) << " Failed to find instance request metrics, instance name : "
498+ << request->routing .prefill_name ;
499+ return ;
500+ }
501+
502+ auto decode_it = request_metrics_.find (request->routing .decode_name );
503+ if (decode_it == request_metrics_.end ()) {
504+ LOG (ERROR) << " Failed to find instance request metrics, instance name : "
505+ << request->routing .decode_name ;
506+ return ;
507+ }
508+
509+ int64_t token_length = request->token_ids .size ();
510+ switch (action) {
511+ case RequestAction::SCHEDULE:
512+ // update the request metrics for prefill and decode instances when
513+ // request is scheduled
514+ prefill_it->second .prefill_request_num += 1 ;
515+ prefill_it->second .prefill_token_num += token_length;
516+ prefill_it->second .estimated_prefill_time += request->estimated_ttft ;
517+
518+ decode_it->second .decode_request_num += 1 ;
519+ decode_it->second .decode_token_num += token_length;
520+ break ;
521+ case RequestAction::FINISH_PREFILL:
522+ // only update the request metrics for prefill instance when request
523+ // finishes the prefill phase
524+ prefill_it->second .prefill_request_num -= 1 ;
525+ prefill_it->second .prefill_token_num -= token_length;
526+ prefill_it->second .estimated_prefill_time -= request->estimated_ttft ;
527+ break ;
528+ case RequestAction::FINISH_DECODE:
529+ // update the request metrics for decode instance when request finishes
530+ // the decode phase
531+ decode_it->second .decode_request_num -= 1 ;
532+ decode_it->second .decode_token_num -= token_length;
533+ break ;
534+ case RequestAction::CANCEL:
535+ // update the request metrics for prefill and decode instances when
536+ // request is cancelled
537+ prefill_it->second .prefill_request_num -= 1 ;
538+ prefill_it->second .prefill_token_num -= token_length;
539+ prefill_it->second .estimated_prefill_time -= request->estimated_ttft ;
540+
541+ decode_it->second .decode_request_num -= 1 ;
542+ decode_it->second .decode_token_num -= token_length;
543+ break ;
544+ default :
545+ LOG (ERROR) << " Unknown RequestAction: " << static_cast <int32_t >(action);
546+ break ;
547+ }
548+ }
549+
453550} // namespace xllm_service
0 commit comments