Skip to content

Commit

Permalink
lighthouse, manager: support multiple quorum rooms
Browse files Browse the repository at this point in the history
  • Loading branch information
d4l3k committed Dec 19, 2024
1 parent 6d6e9a4 commit 1982445
Show file tree
Hide file tree
Showing 7 changed files with 288 additions and 206 deletions.
16 changes: 12 additions & 4 deletions proto/torchft.proto
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ message Quorum {
}

// Quorum request carrying the requesting member and the room (quorum channel)
// it wants to join.
//
// NOTE(review): this span is a rendered diff hunk with the +/- markers
// stripped. The first `requester` line below is the pre-change field; the
// comment block, `room_id`, and `requester = 2` are its replacement. Only the
// replacement lines belong in the resulting schema.
//
// NOTE(review): the change moves `requester` from field number 1 to 2 and
// assigns 1 to a string. Field numbers, not names, identify fields on the
// wire, so a peer built from the previous schema would misinterpret field 1.
// Confirm every sender and receiver of this message is upgraded together, or
// keep `requester = 1` and give `room_id` a fresh number instead.
//
// NOTE(review): if this file is proto3, an unset room_id is indistinguishable
// from "" on the receiving side — confirm how the server treats an empty room.
message LighthouseQuorumRequest {
QuorumMember requester = 1;
// room_id is the specific quorum channel to use. All workers/replicas
// participating in the quorum must specify the same channel.
// Multiple channels can be active simultaneously.
string room_id = 1;
QuorumMember requester = 2;
}

message LighthouseQuorumResponse {
Expand All @@ -69,9 +73,13 @@ service LighthouseService {
}

// Quorum request carrying the caller's rank, step, checkpoint server address,
// and the room (quorum channel) to use.
//
// NOTE(review): this span is a rendered diff hunk with the +/- markers
// stripped. The first three fields below (numbers 1-3) are the pre-change
// definitions; the comment block and the fields numbered 1-4 after it are
// their replacement. Only the replacement lines belong in the resulting
// schema.
//
// NOTE(review): the change shifts `rank`, `step`, and
// `checkpoint_server_addr` up by one field number and reuses 1 for a string
// (it was an int64). Field numbers identify fields on the wire, so a peer
// built from the previous schema would read these values into the wrong
// fields. Confirm all callers and servers are upgraded together, or leave
// 1-3 as they were and add `room_id = 4`.
message ManagerQuorumRequest {
int64 rank = 1;
int64 step = 2;
string checkpoint_server_addr = 3;
// room_id is the specific quorum channel to use. All workers/replicas
// participating in the quorum must specify the same channel.
// Multiple channels can be active simultaneously.
string room_id = 1;
int64 rank = 2;
int64 step = 3;
string checkpoint_server_addr = 4;
}

message ManagerQuorumResponse {
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,14 @@ impl ManagerClient {
fn quorum(
&mut self,
py: Python<'_>,
room_id: String,
rank: i64,
step: i64,
checkpoint_server_addr: String,
) -> PyResult<(i64, i64, i64, String, String, i64, Option<i64>, i64, bool)> {
py.allow_threads(move || {
let mut request = tonic::Request::new(ManagerQuorumRequest {
room_id: room_id,
rank: rank,
step: step,
checkpoint_server_addr: checkpoint_server_addr,
Expand Down
Loading

0 comments on commit 1982445

Please sign in to comment.