Skip to content

Commit 40c17f0

Browse files
authored
feat: add offline field in request. (#19)
1 parent e98847e commit 40c17f0

File tree

11 files changed

+60
-0
lines changed

11 files changed

+60
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@
2121
# rust
2222
Cargo.lock
2323

24+
/local

xllm_service/proto/xllm/chat.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ message ChatRequest {
113113
repeated int32 token_ids = 26;
114114

115115
Routing routing = 27;
116+
117+
optional bool offline = 28;
116118
}
117119

118120
message ChatLogProbData {

xllm_service/proto/xllm/completion.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ message CompletionRequest {
9090
repeated int32 token_ids = 26;
9191

9292
Routing routing = 27;
93+
94+
optional bool offline = 28;
9395
}
9496

9597
message LogProbs {

xllm_service/proto/xllm_rpc_service.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ service XllmRpcService {
144144
rpc GetInstanceInfo(InstanceID) returns (InstanceMetaInfo) {}
145145
rpc Heartbeat(HeartbeatRequest) returns (Status) {}
146146
rpc GetStaticDecodeList(InstanceID) returns (InstanceIDs) {}
147+
rpc GetStaticPrefillList(InstanceID) returns (InstanceIDs) {}
147148
rpc GetConfig(Empty) returns (ServiceConfig) {}
148149

149150
// xllm service receives responses from decode instances directly in disagg pd mode.

xllm_service/request/request.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ struct Request {
3535
// whether to return usage
3636
bool include_usage = false;
3737

38+
bool offline = false;
39+
3840
// input prompt
3941
std::string prompt;
4042

xllm_service/rpc_service/service.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,11 @@ std::vector<std::string> XllmRpcServiceImpl::get_static_decode_list(
4949
return scheduler_->get_static_decode_list(instance_name);
5050
}
5151

52+
std::vector<std::string> XllmRpcServiceImpl::get_static_prefill_list(
53+
const std::string& instance_name) {
54+
return scheduler_->get_static_prefill_list(instance_name);
55+
}
56+
5257
bool XllmRpcServiceImpl::handle_generation(
5358
const llm::RequestOutput& request_output) {
5459
return scheduler_->handle_generation(request_output);
@@ -126,6 +131,19 @@ void XllmRpcService::GetStaticDecodeList(
126131
}
127132
}
128133

134+
void XllmRpcService::GetStaticPrefillList(
135+
google::protobuf::RpcController* cntl_base,
136+
const proto::InstanceID* req,
137+
proto::InstanceIDs* resp,
138+
google::protobuf::Closure* done) {
139+
brpc::ClosureGuard done_guard(done);
140+
std::vector<std::string> prefill_list =
141+
xllm_rpc_service_impl_->get_static_prefill_list(req->name());
142+
for (auto& p : prefill_list) {
143+
*(resp->mutable_names()->Add()) = std::move(p);
144+
}
145+
}
146+
129147
void XllmRpcService::Generations(google::protobuf::RpcController* cntl_base,
130148
const proto::DisaggStreamGenerations* req,
131149
proto::StatusSet* resp,

xllm_service/rpc_service/service.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ class XllmRpcServiceImpl final {
5252
std::vector<std::string> get_static_decode_list(
5353
const std::string& prefill_name);
5454

55+
std::vector<std::string> get_static_prefill_list(
56+
const std::string& decode_name);
57+
5558
public:
5659
// handle generations from prefill/decode instance
5760
bool handle_generation(const llm::RequestOutput& request_output);
@@ -103,6 +106,11 @@ class XllmRpcService : public proto::XllmRpcService {
103106
proto::InstanceIDs* resp,
104107
google::protobuf::Closure* done) override;
105108

109+
virtual void GetStaticPrefillList(google::protobuf::RpcController* cntl_base,
110+
const proto::InstanceID* req,
111+
proto::InstanceIDs* resp,
112+
google::protobuf::Closure* done) override;
113+
106114
// xllm service receive response from decode instance directly in disagg pd
107115
// mode. This can eliminate the cost brought by forwarding through prefill.
108116
virtual void Generations(google::protobuf::RpcController* cntl_base,

xllm_service/scheduler/managers/instance_mgr.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,21 @@ std::vector<std::string> InstanceMgr::get_static_decode_list(
169169
return decode_list;
170170
}
171171

172+
// TODO: refactor later. Currently ignores instance_name and returns every PREFILL and DEFAULT instance.
173+
std::vector<std::string> InstanceMgr::get_static_prefill_list(
174+
const std::string& instance_name) {
175+
std::vector<std::string> prefill_list;
176+
std::shared_lock<std::shared_mutex> lock(inst_mutex_);
177+
for (auto& inst : instances_) {
178+
if (inst.second.type == InstanceType::PREFILL ||
179+
inst.second.type == InstanceType::DEFAULT) {
180+
prefill_list.emplace_back(inst.second.name);
181+
}
182+
}
183+
184+
return prefill_list;
185+
}
186+
172187
void InstanceMgr::get_load_metrics(LoadBalanceInfos* infos) {
173188
std::shared_lock<std::shared_mutex> inst_lock(inst_mutex_);
174189
std::shared_lock<std::shared_mutex> metric_lock(load_metric_mutex_);

xllm_service/scheduler/managers/instance_mgr.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ class InstanceMgr final {
4848
std::vector<std::string> get_static_decode_list(
4949
const std::string& instance_name);
5050

51+
std::vector<std::string> get_static_prefill_list(
52+
const std::string& instance_name);
53+
5154
void get_load_metrics(LoadBalanceInfos* infos);
5255

5356
std::shared_ptr<brpc::Channel> get_channel(const std::string& instance_name);

xllm_service/scheduler/scheduler.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,11 @@ std::vector<std::string> Scheduler::get_static_decode_list(
152152
return instance_mgr_->get_static_decode_list(instance_name);
153153
}
154154

155+
std::vector<std::string> Scheduler::get_static_prefill_list(
156+
const std::string& instance_name) {
157+
return instance_mgr_->get_static_prefill_list(instance_name);
158+
}
159+
155160
Tokenizer* Scheduler::get_tls_tokenizer() {
156161
thread_local std::unique_ptr<Tokenizer> tls_tokenizer(tokenizer_->clone());
157162
return tls_tokenizer.get();

0 commit comments

Comments
 (0)