Skip to content

Commit 7cf4c23

Browse files
authored
feat: support SLO aware scheduling policy. (#20)
1 parent 40c17f0 commit 7cf4c23

File tree

17 files changed

+533
-101
lines changed

17 files changed

+533
-101
lines changed

xllm_service/common/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ cc_library(
1212
macros.h
1313
slice.h
1414
threadpool.h
15-
ttft_predictor.h
15+
time_predictor.h
1616
types.h
1717
utils.h
1818
hash_util.h
@@ -23,7 +23,7 @@ cc_library(
2323
global_gflags.cpp
2424
json_reader.cpp
2525
threadpool.cpp
26-
ttft_predictor.cpp
26+
time_predictor.cpp
2727
utils.cpp
2828
hash_util.cpp
2929
xllm/uuid.cpp

xllm_service/common/global_gflags.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ limitations under the License.
1515

1616
#include "common/global_gflags.h"
1717

18+
#include "brpc/reloadable_flags.h"
19+
1820
DEFINE_string(server_host,
1921
"",
2022
"Server listen address, may be IPV4/IPV6/UDS."
@@ -83,9 +85,21 @@ DEFINE_int32(detect_disconnected_instance_interval,
8385
"The interval that server detect the disconnected instance.");
8486

8587
DEFINE_int32(block_size,
86-
16,
87-
"Number of slots per kv cache block. Default is 16.");
88+
128,
89+
"Number of slots per kv cache block. Default is 128.");
8890

8991
DEFINE_string(tokenizer_path, "", "tokenizer config path.");
9092

9193
DEFINE_bool(enable_request_trace, false, "Whether to enable request trace");
94+
95+
DEFINE_int32(target_ttft,
96+
1000,
97+
"Target Time to First Token (TTFT), in milliseconds.");
98+
99+
BRPC_VALIDATE_GFLAG(target_ttft, brpc::NonNegativeInteger);
100+
101+
DEFINE_int32(target_tpot,
102+
50,
103+
"Target Time Per Output Token (TPOT), in milliseconds.");
104+
105+
BRPC_VALIDATE_GFLAG(target_tpot, brpc::NonNegativeInteger);

xllm_service/common/global_gflags.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,7 @@ DECLARE_int32(block_size);
6060
DECLARE_string(tokenizer_path);
6161

6262
DECLARE_bool(enable_request_trace);
63+
64+
DECLARE_int32(target_ttft);
65+
66+
DECLARE_int32(target_tpot);
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://github.com/jd-opensource/xllm-service/blob/main/LICENSE
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include "time_predictor.h"
17+
18+
static constexpr int32_t kDegree = 2;
19+
20+
namespace xllm_service {
21+
22+
// Builds the TTFT and TPOT models by least-squares fitting the supplied
// profiling samples.
//
// ttft_profiling_data: (x, ttft) samples, where x is the quantity later passed
//   to predict_ttft() as `length`. A polynomial of degree kDegree in x is
//   fitted; coefficients are stored lowest order first. When the vector is
//   empty the TTFT model is the constant 0.
// tpot_profiling_data: (avg_length, batch_size, tpot) samples. The linear model
//   tpot ~ c0 + c1 * batch_size + c2 * batch_size * (avg_length - 1)
//   is fitted. When the vector is empty all three coefficients are 0, so
//   predict_tpot() returns 0.
TimePredictor::TimePredictor(
    const std::vector<std::pair<int32_t, double>>& ttft_profiling_data,
    const std::vector<std::tuple<int32_t, int32_t, double>>&
        tpot_profiling_data) {
  if (!ttft_profiling_data.empty()) {
    // construct Vandermonde matrix: row i is [1, x_i, x_i^2, ...]
    const Eigen::Index m =
        static_cast<Eigen::Index>(ttft_profiling_data.size());
    const Eigen::Index n = kDegree + 1;
    Eigen::MatrixXd matrix(m, n);
    for (Eigen::Index i = 0; i < m; ++i) {
      // Running product instead of std::pow: the basis values are then built
      // exactly the way predict_ttft() evaluates them.
      const double x = ttft_profiling_data[i].first;
      double power = 1.0;
      for (Eigen::Index j = 0; j < n; ++j) {
        matrix(i, j) = power;
        power *= x;
      }
    }

    // construct target vector
    Eigen::VectorXd target(m);
    for (Eigen::Index i = 0; i < m; ++i) {
      target(i) = ttft_profiling_data[i].second;
    }

    // get coefficients (least-squares solution of matrix * c = target)
    ttft_coefficients_ = matrix.colPivHouseholderQr().solve(target);
  } else {
    ttft_coefficients_ = Eigen::VectorXd::Zero(1);
  }

  if (!tpot_profiling_data.empty()) {
    const Eigen::Index m =
        static_cast<Eigen::Index>(tpot_profiling_data.size());
    const Eigen::Index n = kDegree + 1;
    Eigen::MatrixXd matrix(m, n);
    for (Eigen::Index i = 0; i < m; ++i) {
      const int32_t avg_length = std::get<0>(tpot_profiling_data[i]);
      const int32_t batch_size = std::get<1>(tpot_profiling_data[i]);

      matrix(i, 0) = 1.0;  // the index 0 is always for constant
      matrix(i, 1) = batch_size;
      // Multiply in double so large batch_size * length cannot overflow int32.
      matrix(i, 2) = static_cast<double>(batch_size) * (avg_length - 1);
    }

    // construct target vector
    Eigen::VectorXd target(m);
    for (Eigen::Index i = 0; i < m; ++i) {
      target(i) = std::get<2>(tpot_profiling_data[i]);
    }

    // get coefficients (least-squares solution of matrix * c = target)
    tpot_coefficients_ = matrix.colPivHouseholderQr().solve(target);
  } else {
    // Must initialise the TPOT coefficients here. Assigning to
    // ttft_coefficients_ instead would discard the TTFT fit above and leave
    // tpot_coefficients_ empty, which predict_tpot() indexes at 0..2.
    tpot_coefficients_ = Eigen::VectorXd::Zero(3);
  }
}
74+
75+
double TimePredictor::predict_ttft(int32_t length) {
76+
double result = 0.0;
77+
double power = 1.0;
78+
for (int32_t i = 0; i < ttft_coefficients_.size(); ++i) {
79+
result += ttft_coefficients_(i) * power;
80+
power *= length;
81+
}
82+
83+
return result;
84+
}
85+
86+
double TimePredictor::predict_tpot(int32_t total_length, int32_t batch_size) {
87+
double result = 0.0;
88+
result = tpot_coefficients_(0) + tpot_coefficients_(1) * batch_size +
89+
tpot_coefficients_(2) * total_length;
90+
return result;
91+
}
92+
93+
} // namespace xllm_service
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://github.com/jd-opensource/xllm-service/blob/main/LICENSE
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#pragma once
17+
18+
#include <Eigen/Dense>
19+
20+
namespace xllm_service {
21+
22+
// Predictor for predicting TTFT and TPOT
23+
class TimePredictor final {
24+
public:
25+
TimePredictor(
26+
const std::vector<std::pair<int32_t, double>>& ttft_profiling_data,
27+
const std::vector<std::tuple<int32_t, int32_t, double>>&
28+
tpot_profiling_data);
29+
~TimePredictor() = default;
30+
31+
double predict_ttft(int32_t length);
32+
33+
double predict_tpot(int32_t total_length, int32_t batch_size);
34+
35+
private:
36+
Eigen::VectorXd ttft_coefficients_;
37+
Eigen::VectorXd tpot_coefficients_;
38+
};
39+
40+
} // namespace xllm_service

xllm_service/common/ttft_predictor.cpp

Lines changed: 0 additions & 59 deletions
This file was deleted.

xllm_service/common/types.h

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ enum class InstanceType : int8_t {
7474
PREFILL = 1,
7575
// decode instance
7676
DECODE = 2,
77+
// mix instance
78+
MIX = 3,
7779
};
7880

7981
struct LoadMetrics {
@@ -114,6 +116,8 @@ struct LoadMetrics {
114116

115117
// Record the latency monitoring metrics of the instance over the recent period
116118
struct LatencyMetrics {
119+
LatencyMetrics() : recent_max_ttft(0), recent_max_tbt(0) {}
120+
117121
LatencyMetrics(const int64_t& recent_max_ttft, const int64_t& recent_max_tbt)
118122
: recent_max_ttft(recent_max_ttft), recent_max_tbt(recent_max_tbt) {}
119123

@@ -125,8 +129,9 @@ struct LatencyMetrics {
125129
enum class RequestAction : int32_t {
126130
SCHEDULE = 0,
127131
FINISH_PREFILL = 1,
128-
FINISH_DECODE = 2,
129-
CANCEL = 3,
132+
GENERATE = 2,
133+
FINISH_DECODE = 3,
134+
CANCEL = 4,
130135
};
131136

132137
// Record the request metrics of the instance
@@ -172,13 +177,19 @@ struct InstanceMetaInfo {
172177
std::vector<uint64_t> v_cache_ids;
173178
int32_t dp_size;
174179
// ttft profiling data
175-
std::vector<std::pair<int32_t, int64_t>> ttft_profiling_data;
180+
std::vector<std::pair<int32_t, double>> ttft_profiling_data;
181+
// tpot profiling data
182+
std::vector<std::tuple<int32_t, int32_t, double>> tpot_profiling_data;
176183

177184
// latest heartbeat timestamp
178185
uint64_t latest_timestamp = 0;
179186

180187
uint64_t instance_index = -1;
181188

189+
// For a MIX-type instance, the type it is currently acting as;
190+
// only used when the SLO-aware scheduling policy is enabled.
191+
InstanceType current_type = InstanceType::PREFILL;
192+
182193
nlohmann::json serialize_to_json() const {
183194
nlohmann::json json_val;
184195
json_val["name"] = name;
@@ -190,6 +201,7 @@ struct InstanceMetaInfo {
190201
json_val["v_cache_ids"] = v_cache_ids;
191202
json_val["dp_size"] = dp_size;
192203
json_val["ttft_profiling_data"] = ttft_profiling_data;
204+
json_val["tpot_profiling_data"] = tpot_profiling_data;
193205
return json_val;
194206
}
195207

@@ -230,6 +242,12 @@ struct InstanceMetaInfo {
230242
}
231243
}
232244

245+
for (const auto& item : json_value.at("tpot_profiling_data")) {
246+
if (item.is_array() && item.size() == 3) {
247+
tpot_profiling_data.emplace_back(item[0], item[1], item[2]);
248+
}
249+
}
250+
233251
set_init_timestamp();
234252
} catch (const std::exception& e) {
235253
LOG(ERROR) << "json str:" << json_str

xllm_service/proto/xllm_rpc_service.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ enum InstanceType {
2121
DEFAULT = 0;
2222
PREFILL = 1;
2323
DECODE = 2;
24+
MIX = 3;
2425
}
2526

2627
message WorkerKVAddr {

xllm_service/request/request.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ struct Request {
4949
// instance routing
5050
Routing routing;
5151

52+
// the number of generated tokens
53+
int64_t num_generated_tokens = 0;
54+
5255
// the estimated TTFT obtained from the TTFT predictor
5356
int64_t estimated_ttft = 0;
5457

xllm_service/rpc_service/client.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ ErrorCode XllmRpcClient::register_instance(const InstanceMetaInfo& metainfo) {
8888
req.set_type(proto::InstanceType::PREFILL);
8989
} else if (metainfo.type == InstanceType::DECODE) {
9090
req.set_type(proto::InstanceType::DECODE);
91+
} else if (metainfo.type == InstanceType::MIX) {
92+
req.set_type(proto::InstanceType::MIX);
9193
} else {
9294
req.set_type(proto::InstanceType::DEFAULT);
9395
}

0 commit comments

Comments
 (0)