Skip to content

Commit

Permalink
old EC metadata shouldn't terminate cluster-wide rebalance
Browse files Browse the repository at this point in the history
* the sequence:
  - run and terminate EC workload
  - decommission a node
  - and join another one
* during rebalance, receive an (old) EC metafile that has an invalid checksum
* drop it, warn, and keep going

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Jul 8, 2023
1 parent b68f5a3 commit 4f7ae74
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 3 deletions.
6 changes: 4 additions & 2 deletions ais/test/maintain_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,13 @@ func TestMaintenanceMD(t *testing.T) {
api.WaitForXactionIC(baseParams, args)
})

tlog.Logf("Decommission %s\n", dcmTarget.StringEx())
cmd := tools.GetRestoreCmd(dcmTarget)
msg := &apc.ActValRmNode{DaemonID: dcmTarget.ID(), SkipRebalance: true, KeepInitialConfig: true}
_, err := api.DecommissionNode(baseParams, msg)
tassert.CheckFatal(t, err)

_, err = tools.WaitForClusterState(proxyURL, "target decommission", smap.Version, smap.CountActivePs(),
_, err = tools.WaitForClusterState(proxyURL, "target decommissioned", smap.Version, smap.CountActivePs(),
smap.CountTargets()-1)
tassert.CheckFatal(t, err)

Expand Down Expand Up @@ -210,11 +211,12 @@ func TestMaintenanceDecommissionRebalance(t *testing.T) {
tassert.CheckFatal(t, err)
}

tlog.Logf("Decommission %s\n", dcmTarget.StringEx())
cmd := tools.GetRestoreCmd(dcmTarget)
msg := &apc.ActValRmNode{DaemonID: dcmTarget.ID(), RmUserData: true, KeepInitialConfig: true}
rebID, err := api.DecommissionNode(baseParams, msg)
tassert.CheckError(t, err)
_, err = tools.WaitForClusterState(proxyURL, "target decommission",
_, err = tools.WaitForClusterState(proxyURL, "target decommissioned",
smap.Version, origActiveProxyCount, origTargetCount-1, dcmTarget.ID())
tassert.CheckFatal(t, err)

Expand Down
2 changes: 2 additions & 0 deletions ais/test/target_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/NVIDIA/aistore/tools"
"github.com/NVIDIA/aistore/tools/readers"
"github.com/NVIDIA/aistore/tools/tassert"
"github.com/NVIDIA/aistore/tools/tlog"
)

func TestPutObjectNoDaemonID(t *testing.T) {
Expand Down Expand Up @@ -55,6 +56,7 @@ func TestDeleteInvalidDaemonID(t *testing.T) {
SkipRebalance: true,
KeepInitialConfig: true,
}
tlog.Logf("Decommission invalid node %s (expecting to fail)\n", val.DaemonID)
if _, err := api.DecommissionNode(tools.BaseAPIParams(), val); err == nil {
t.Errorf("Error is nil, expected NotFound error on a delete of a non-existing target")
}
Expand Down
3 changes: 2 additions & 1 deletion reb/recv.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,9 @@ func (reb *Reb) recvECData(hdr transport.ObjHdr, unpacker *cos.ByteUnpack, reade
err := reb.receiveMD(req, hdr)
if err != nil {
nlog.Errorf("failed to receive MD for %s: %v", hdr.Cname(), err)
nlog.Errorf("Warning: (g%d, %s) ignoring, proceeding anyway...", req.rebID, reb.t) // TODO: revisit
}
return err
return nil
}
if err := reb.receiveCT(req, hdr, reader); err != nil {
nlog.Errorf("failed to receive CT for %s: %v", hdr.Cname(), err)
Expand Down

0 comments on commit 4f7ae74

Please sign in to comment.