From 0b618bc11bf9fd5b462ab265b9954bbfa5776fe8 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 3 Oct 2023 16:53:06 +0200 Subject: [PATCH 01/23] WIP --- fullrt.go | 167 ++++++++++++++++++ internal/coord/coordt/coretypes.go | 5 + internal/coord/cplutil/cpl.go | 3 +- internal/coord/query.go | 1 + internal/coord/query/crawl.go | 272 +++++++++++++++++++++++++++++ internal/coord/query/crawl_test.go | 99 +++++++++++ internal/coord/query/pool.go | 2 + internal/coord/query/query.go | 1 + internal/coord/routing.go | 2 +- internal/coord/routing/explore.go | 9 +- 10 files changed, 551 insertions(+), 10 deletions(-) create mode 100644 fullrt.go create mode 100644 internal/coord/query/crawl.go create mode 100644 internal/coord/query/crawl_test.go diff --git a/fullrt.go b/fullrt.go new file mode 100644 index 0000000..4e219d4 --- /dev/null +++ b/fullrt.go @@ -0,0 +1,167 @@ +package zikade + +import ( + "context" + "fmt" + + "github.com/ipfs/go-cid" + record "github.com/libp2p/go-libp2p-record" + "github.com/libp2p/go-libp2p/core/network" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/core/peerstore" + "github.com/libp2p/go-libp2p/core/routing" + "go.opentelemetry.io/otel/attribute" + otel "go.opentelemetry.io/otel/trace" + + "github.com/plprobelab/zikade/internal/coord/coordt" + "github.com/plprobelab/zikade/kadt" + "github.com/plprobelab/zikade/pb" +) + +type FullRT struct { + *DHT +} + +var _ routing.Routing = (*FullRT)(nil) + +func (f *FullRT) FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error) { + ctx, span := f.tele.Tracer.Start(ctx, "DHT.FindPeer") + defer span.End() + + // First check locally. If we are or were recently connected to the peer, + // return the addresses from our peerstore unless the information doesn't + // contain any. + switch f.host.Network().Connectedness(id) { + case network.Connected, network.CanConnect: + addrInfo := f.host.Peerstore().PeerInfo(id) + if addrInfo.ID != "" && len(addrInfo.Addrs) > 0 { + return addrInfo, nil + } + default: + // we're not connected or were recently connected + } + + var foundPeer peer.ID + fn := func(ctx context.Context, visited kadt.PeerID, msg *pb.Message, stats coordt.QueryStats) error { + if peer.ID(visited) == id { + foundPeer = peer.ID(visited) + return coordt.ErrSkipRemaining + } + return nil + } + + _, _, err := f.kad.QueryClosest(ctx, kadt.PeerID(id).Key(), fn, 20) + if err != nil { + return peer.AddrInfo{}, fmt.Errorf("failed to run query: %w", err) + } + + if foundPeer == "" { + return peer.AddrInfo{}, fmt.Errorf("peer record not found") + } + + return f.host.Peerstore().PeerInfo(foundPeer), nil +} + +func (f *FullRT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { + ctx, span := f.tele.Tracer.Start(ctx, "DHT.Provide", otel.WithAttributes(attribute.String("cid", c.String()))) + defer span.End() + + // verify if this DHT supports provider records by checking if a "providers" + // backend is registered. 
+ b, found := f.backends[namespaceProviders] + if !found { + return routing.ErrNotSupported + } + + // verify that it's "defined" CID (not empty) + if !c.Defined() { + return fmt.Errorf("invalid cid: undefined") + } + + // store ourselves as one provider for that CID + _, err := b.Store(ctx, string(c.Hash()), peer.AddrInfo{ID: f.host.ID()}) + if err != nil { + return fmt.Errorf("storing own provider record: %w", err) + } + + // if broadcast is "false" we won't query the DHT + if !brdcst { + return nil + } + + // construct message + addrInfo := peer.AddrInfo{ + ID: f.host.ID(), + Addrs: f.host.Addrs(), + } + + msg := &pb.Message{ + Type: pb.Message_ADD_PROVIDER, + Key: c.Hash(), + ProviderPeers: []*pb.Message_Peer{ + pb.FromAddrInfo(addrInfo), + }, + } + + // finally, find the closest peers to the target key. + return f.kad.BroadcastRecord(ctx, msg) +} + +// PutValue satisfies the [routing.Routing] interface and will add the given +// value to the k-closest nodes to keyStr. The parameter keyStr should have the +// format `/$namespace/$binary_id`. Namespace examples are `pk` or `ipns`. To +// identify the closest peers to keyStr, that complete string will be SHA256 +// hashed. +func (f *FullRT) PutValue(ctx context.Context, keyStr string, value []byte, opts ...routing.Option) error { + ctx, span := f.tele.Tracer.Start(ctx, "DHT.PutValue") + defer span.End() + + // first parse the routing options + rOpt := routing.Options{} // routing config + if err := rOpt.Apply(opts...); err != nil { + return fmt.Errorf("apply routing options: %w", err) + } + + // then always store the given value locally + if err := f.putValueLocal(ctx, keyStr, value); err != nil { + return fmt.Errorf("put value locally: %w", err) + } + + // if the routing system should operate in offline mode, stop here + if rOpt.Offline { + return nil + } + + // construct Kademlia-key. Yes, we hash the complete key string which + // includes the namespace prefix. + msg := &pb.Message{ + Type: pb.Message_PUT_VALUE, + Key: []byte(keyStr), + Record: record.MakePutRecord(keyStr, value), + } + + // finally, find the closest peers to the target key. + err := f.kad.BroadcastRecord(ctx, msg) + if err != nil { + return fmt.Errorf("query error: %w", err) + } + + return nil +} + +func (f *FullRT) Bootstrap(ctx context.Context) error { + ctx, span := f.tele.Tracer.Start(ctx, "DHT.Bootstrap") + defer span.End() + f.log.Info("Starting bootstrap") + + seed := make([]kadt.PeerID, len(f.cfg.BootstrapPeers)) + for i, addrInfo := range f.cfg.BootstrapPeers { + seed[i] = kadt.PeerID(addrInfo.ID) + // TODO: how to handle TTL if BootstrapPeers become dynamic and don't + // point to stable peers or consist of ephemeral peers that we have + // observed during a previous run. + f.host.Peerstore().AddAddrs(addrInfo.ID, addrInfo.Addrs, peerstore.PermanentAddrTTL) + } + + return f.kad.Bootstrap(ctx, seed) +} diff --git a/internal/coord/coordt/coretypes.go b/internal/coord/coordt/coretypes.go index 2c21aa6..640c658 100644 --- a/internal/coord/coordt/coretypes.go +++ b/internal/coord/coordt/coretypes.go @@ -69,3 +69,8 @@ type Router[K kad.Key[K], N kad.NodeID[K], M Message] interface { // closest to the target key. GetClosestNodes(ctx context.Context, to N, target K) ([]N, error) } + +// NodeIDForCplFunc is a function that given a cpl generates a [kad.NodeID] with a key that has +// a common prefix length with k of length cpl. 
+// Invariant: CommonPrefixLength(k, node.Key()) = cpl +type NodeIDForCplFunc[K kad.Key[K], N kad.NodeID[K]] func(k K, cpl int) (N, error) diff --git a/internal/coord/cplutil/cpl.go b/internal/coord/cplutil/cpl.go index ef0611c..daae603 100644 --- a/internal/coord/cplutil/cpl.go +++ b/internal/coord/cplutil/cpl.go @@ -6,7 +6,6 @@ import ( "fmt" mh "github.com/multiformats/go-multihash" - "github.com/plprobelab/zikade/kadt" ) @@ -25,7 +24,7 @@ func GenRandPeerID(k kadt.Key, cpl int) (kadt.PeerID, error) { key := keyPrefixMap[targetPrefix] id := [32 + 2]byte{mh.SHA2_256, 32} binary.BigEndian.PutUint32(id[2:], key) - return kadt.PeerID(string(id[:])), nil + return kadt.PeerID(id[:]), nil } type keybit interface { diff --git a/internal/coord/query.go b/internal/coord/query.go index 05f23cb..e565820 100644 --- a/internal/coord/query.go +++ b/internal/coord/query.go @@ -298,6 +298,7 @@ func (p *PooledQueryBehaviour) perfomNextInbound(ctx context.Context) (Behaviour cmd = &query.EventPoolNodeResponse[kadt.Key, kadt.PeerID]{ NodeID: ev.To, QueryID: ev.QueryID, + Target: ev.Target, CloserNodes: ev.CloserNodes, } case *EventGetCloserNodesFailure: diff --git a/internal/coord/query/crawl.go b/internal/coord/query/crawl.go new file mode 100644 index 0000000..0dbe3bc --- /dev/null +++ b/internal/coord/query/crawl.go @@ -0,0 +1,272 @@ +package query + +import ( + "context" + "fmt" + + "github.com/benbjohnson/clock" + "github.com/plprobelab/go-libdht/kad" + "github.com/plprobelab/go-libdht/kad/key" + "go.opentelemetry.io/otel/trace" + + "github.com/plprobelab/zikade/errs" + "github.com/plprobelab/zikade/internal/coord/coordt" + "github.com/plprobelab/zikade/tele" +) + +// CrawlConfig specifies optional configuration for a Crawl +type CrawlConfig struct { + MaxCPL int // the maximum CPL until we should crawl the peer + Concurrency int // the maximum number of concurrent peers that we may query + Clock clock.Clock // a clock that may replaced by a mock when testing +} + +// Validate checks the configuration options and returns an error if any have invalid values. +func (cfg *CrawlConfig) Validate() error { + if cfg.Clock == nil { + return &errs.ConfigurationError{ + Component: "CrawlConfig", + Err: fmt.Errorf("clock must not be nil"), + } + } + return nil +} + +// DefaultCrawlConfig returns the default configuration options for a Crawl. 
+// Options may be overridden before passing to NewCrawl +func DefaultCrawlConfig() *CrawlConfig { + return &CrawlConfig{ + MaxCPL: 16, + Concurrency: 1, + Clock: clock.New(), // use standard time + } +} + +type crawlJob[K kad.Key[K], N kad.NodeID[K]] struct { + node N + target K +} + +func (c *crawlJob[K, N]) mapKey() string { + return c.node.String() + key.HexString(c.target) +} + +type Crawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { + self N + id coordt.QueryID + + // cfg is a copy of the optional configuration supplied to the query + cfg CrawlConfig + cplFn coordt.NodeIDForCplFunc[K, N] + + todo []crawlJob[K, N] + cpls map[string]int + waiting map[string]N + success map[string]N + failed map[string]N + errors map[string]error +} + +func NewCrawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message](self N, id coordt.QueryID, cplFn coordt.NodeIDForCplFunc[K, N], seed []N, cfg *CrawlConfig) (*Crawl[K, N, M], error) { + if cfg == nil { + cfg = DefaultCrawlConfig() + } else if err := cfg.Validate(); err != nil { + return nil, err + } + + if len(seed) == 0 { + return nil, fmt.Errorf("empty seed") + } + + c := &Crawl[K, N, M]{ + self: self, + id: id, + cfg: *cfg, + cplFn: cplFn, + todo: make([]crawlJob[K, N], 0, len(seed)*cfg.MaxCPL), + cpls: map[string]int{}, + waiting: map[string]N{}, + success: map[string]N{}, + failed: map[string]N{}, + errors: map[string]error{}, + } + + for _, node := range seed { + // exclude self from closest nodes + if key.Equal(node.Key(), self.Key()) { + continue + } + + for i := 0; i < c.cfg.MaxCPL; i++ { + target, err := cplFn(node.Key(), i) + if err != nil { + return nil, fmt.Errorf("generate cpl: %w", err) + } + + job := crawlJob[K, N]{ + node: node, + target: target.Key(), + } + + c.cpls[job.mapKey()] = i + c.todo = append(c.todo, job) + } + } + + return c, nil +} + +func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlState) { + ctx, span := tele.StartSpan(ctx, "Crawl.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) + defer func() { + span.SetAttributes(tele.AttrOutEvent(out)) + span.End() + }() + + switch tev := ev.(type) { + case *EventCrawlCancel: + // TODO: ... 
+ case *EventCrawlNodeResponse[K, N]: + job := crawlJob[K, N]{ + node: tev.NodeID, + target: tev.Target, + } + + mapKey := job.mapKey() + if _, found := c.waiting[mapKey]; !found { + break + } + + delete(c.waiting, mapKey) + c.success[mapKey] = tev.NodeID + + for _, node := range tev.CloserNodes { + for i := 0; i < c.cfg.MaxCPL; i++ { + target, err := c.cplFn(node.Key(), i) + if err != nil { + // TODO: log + continue + } + + job := crawlJob[K, N]{ + node: node, + target: target.Key(), + } + + mapKey := job.mapKey() + + if _, found := c.cpls[mapKey]; found { + continue + } + + c.cpls[mapKey] = i + c.todo = append(c.todo, job) + } + } + + case *EventCrawlNodeFailure[K, N]: + job := crawlJob[K, N]{ + node: tev.NodeID, + target: tev.Target, + } + + mapKey := job.mapKey() + if _, found := c.waiting[mapKey]; !found { + break + } + + delete(c.waiting, mapKey) + c.failed[mapKey] = tev.NodeID + c.errors[mapKey] = tev.Error + + case *EventCrawlPoll: + // no event to process + default: + panic(fmt.Sprintf("unexpected event: %T", tev)) + } + + if len(c.waiting) >= c.cfg.MaxCPL*c.cfg.Concurrency { + return &StateCrawlWaitingAtCapacity{ + QueryID: c.id, + } + } + + if len(c.todo) > 0 { + + // pop next crawl job from queue + var job crawlJob[K, N] + job, c.todo = c.todo[0], c.todo[1:] + + // mark the job as waiting + c.waiting[job.mapKey()] = job.node + + return &StateCrawlFindCloser[K, N]{ + QueryID: c.id, + Target: job.target, + NodeID: job.node, + } + } + + if len(c.waiting) > 0 { + return &StateCrawlWaitingWithCapacity{ + QueryID: c.id, + } + } + + return &StateCrawlFinished{} +} + +type CrawlState interface { + crawlState() +} + +type StateCrawlIdle struct{} + +type StateCrawlFinished struct{} + +type StateCrawlWaitingAtCapacity struct { + QueryID coordt.QueryID +} +type StateCrawlWaitingWithCapacity struct { + QueryID coordt.QueryID +} + +type StateCrawlFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { + QueryID coordt.QueryID + Target K // the key that the query wants to find closer nodes for + NodeID N // the node to send the message to +} + +// crawlState() ensures that only [Crawl] states can be assigned to a CrawlState. +func (*StateCrawlFinished) crawlState() {} +func (*StateCrawlFindCloser[K, N]) crawlState() {} +func (*StateCrawlWaitingAtCapacity) crawlState() {} +func (*StateCrawlWaitingWithCapacity) crawlState() {} +func (*StateCrawlIdle) crawlState() {} + +type CrawlEvent interface { + crawlEvent() +} + +// EventCrawlPoll is an event that signals a [Crawl] that it can perform housekeeping work. +type EventCrawlPoll struct{} + +type EventCrawlCancel struct{} + +type EventCrawlNodeResponse[K kad.Key[K], N kad.NodeID[K]] struct { + NodeID N // the node the message was sent to + Target K // the key that the node was asked for + CloserNodes []N // the closer nodes sent by the node +} + +type EventCrawlNodeFailure[K kad.Key[K], N kad.NodeID[K]] struct { + NodeID N // the node the message was sent to + Target K // the key that the node was asked for + Error error // the error that caused the failure, if any +} + +// crawlEvent() ensures that only events accepted by [Crawl] can be assigned to a [CrawlEvent]. 
+func (*EventCrawlPoll) crawlEvent() {} +func (*EventCrawlCancel) crawlEvent() {} +func (*EventCrawlNodeResponse[K, N]) crawlEvent() {} +func (*EventCrawlNodeFailure[K, N]) crawlEvent() {} diff --git a/internal/coord/query/crawl_test.go b/internal/coord/query/crawl_test.go new file mode 100644 index 0000000..4d14b33 --- /dev/null +++ b/internal/coord/query/crawl_test.go @@ -0,0 +1,99 @@ +package query + +import ( + "context" + "testing" + + "github.com/benbjohnson/clock" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/plprobelab/zikade/internal/coord/coordt" + "github.com/plprobelab/zikade/internal/coord/internal/tiny" +) + +var _ coordt.StateMachine[CrawlEvent, CrawlState] = (*Crawl[tiny.Key, tiny.Node, tiny.Message])(nil) + +func TestCrawl_Advance(t *testing.T) { + ctx := context.Background() + + self := tiny.NewNode(0) + a := tiny.NewNode(0b10000100) + b := tiny.NewNode(0b11000000) + c := tiny.NewNode(0b10100000) + seed := []tiny.Node{self, a, b} + + clk := clock.NewMock() + + cfg := DefaultCrawlConfig() + cfg.Clock = clk + cfg.MaxCPL = 4 + cfg.Concurrency = 2 + + queryID := coordt.QueryID("test") + + qry, err := NewCrawl[tiny.Key, tiny.Node, tiny.Message](self, queryID, tiny.NodeWithCpl, seed, cfg) + require.NoError(t, err) + + assert.Len(t, qry.todo, 2*cfg.MaxCPL) + assert.Len(t, qry.cpls, 2*cfg.MaxCPL) + assert.Len(t, qry.waiting, 0) + assert.Len(t, qry.success, 0) + assert.Len(t, qry.failed, 0) + + reqs := make([]*StateCrawlFindCloser[tiny.Key, tiny.Node], 2*cfg.MaxCPL) + for i := 0; i < 2*cfg.MaxCPL; i++ { + state := qry.Advance(ctx, &EventCrawlPoll{}) + tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + reqs[i] = tstate + } + + assert.Len(t, qry.todo, 0) + assert.Len(t, qry.cpls, 2*cfg.MaxCPL) + assert.Len(t, qry.waiting, 2*cfg.MaxCPL) + assert.Len(t, qry.success, 0) + assert.Len(t, qry.failed, 0) + + state := qry.Advance(ctx, &EventCrawlPoll{}) + require.IsType(t, &StateCrawlWaitingAtCapacity{}, state) + + state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + NodeID: reqs[0].NodeID, + Target: reqs[0].Target, + CloserNodes: []tiny.Node{}, + }) + require.IsType(t, &StateCrawlWaitingWithCapacity{}, state) + + assert.Len(t, qry.todo, 0) + assert.Len(t, qry.cpls, 2*cfg.MaxCPL) + assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) + assert.Len(t, qry.success, 1) + assert.Len(t, qry.failed, 0) + + state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + NodeID: reqs[1].NodeID, + Target: reqs[1].Target, + CloserNodes: []tiny.Node{c}, + }) + + tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + assert.Equal(t, tstate.NodeID, c) + + assert.Len(t, qry.todo, 3) + assert.Len(t, qry.cpls, 3*cfg.MaxCPL) + assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) + assert.Len(t, qry.success, 2) + assert.Len(t, qry.failed, 0) + + for i := 2; i < len(reqs); i++ { + state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + NodeID: reqs[i].NodeID, + Target: reqs[i].Target, + CloserNodes: []tiny.Node{}, + }) + } + + require.IsType(t, &StateCrawlIdle{}, state) +} diff --git a/internal/coord/query/pool.go b/internal/coord/query/pool.go index f817a89..d6c58d4 100644 --- a/internal/coord/query/pool.go +++ b/internal/coord/query/pool.go @@ -138,6 +138,7 @@ func (p *Pool[K, N, M]) Advance(ctx context.Context, ev PoolEvent) PoolState { if qry, ok := p.queryIndex[tev.QueryID]; ok { state, terminal := p.advanceQuery(ctx, 
qry, &EventQueryNodeResponse[K, N]{ NodeID: tev.NodeID, + Target: tev.Target, CloserNodes: tev.CloserNodes, }) if terminal { @@ -396,6 +397,7 @@ type EventPoolStopQuery struct { type EventPoolNodeResponse[K kad.Key[K], N kad.NodeID[K]] struct { QueryID coordt.QueryID // the id of the query that sent the message NodeID N // the node the message was sent to + Target K // the target key that the node was asked for CloserNodes []N // the closer nodes sent by the node } diff --git a/internal/coord/query/query.go b/internal/coord/query/query.go index f010eff..7f0348d 100644 --- a/internal/coord/query/query.go +++ b/internal/coord/query/query.go @@ -432,6 +432,7 @@ type EventQueryCancel struct{} // EventQueryNodeResponse notifies a [Query] that an attempt to contact a node has received a successful response. type EventQueryNodeResponse[K kad.Key[K], N kad.NodeID[K]] struct { NodeID N // the node the message was sent to + Target K // the target key that the node was asked for CloserNodes []N // the closer nodes sent by the node } diff --git a/internal/coord/routing.go b/internal/coord/routing.go index ff9d863..d1c4529 100644 --- a/internal/coord/routing.go +++ b/internal/coord/routing.go @@ -366,7 +366,7 @@ func NewRoutingBehaviour(self kadt.PeerID, rt routing.RoutingTableCpl[kadt.Key, return nil, fmt.Errorf("explore schedule: %w", err) } - explore, err := routing.NewExplore[kadt.Key](self, rt, cplutil.GenRandPeerID, schedule, exploreCfg) + explore, err := routing.NewExplore[kadt.Key](self, rt, cplutil.GenRandPeerID[kadt.Key], schedule, exploreCfg) if err != nil { return nil, fmt.Errorf("explore: %w", err) } diff --git a/internal/coord/routing/explore.go b/internal/coord/routing/explore.go index 62f4bed..422f3b7 100644 --- a/internal/coord/routing/explore.go +++ b/internal/coord/routing/explore.go @@ -41,7 +41,7 @@ type Explore[K kad.Key[K], N kad.NodeID[K]] struct { // rt is the local routing table rt RoutingTableCpl[K, N] - cplFn NodeIDForCplFunc[K, N] + cplFn coordt.NodeIDForCplFunc[K, N] // qry is the query used by the explore process qry *query.Query[K, N, any] @@ -73,11 +73,6 @@ type Explore[K kad.Key[K], N kad.NodeID[K]] struct { cplAttributeSet atomic.Value // holds a [attribute.Set] } -// NodeIDForCplFunc is a function that given a cpl generates a [kad.NodeID] with a key that has -// a common prefix length with k of length cpl. -// Invariant: CommonPrefixLength(k, node.Key()) = cpl -type NodeIDForCplFunc[K kad.Key[K], N kad.NodeID[K]] func(k K, cpl int) (N, error) - // An ExploreSchedule provides an ordering for explorations of each cpl in a routing table. type ExploreSchedule interface { // NextCpl returns the first cpl to be explored whose due time is before or equal to the given time. 
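The NodeIDForCplFunc contract that this hunk moves into coordt is easiest to see with a concrete toy. The sketch below is illustrative only: it uses a made-up 8-bit key instead of zikade's 256-bit kadt.Key, keeps the first cpl bits of k, flips bit number cpl and zeroes the rest, so that CommonPrefixLength(k, node.Key()) == cpl holds by construction — the same invariant that cplutil.GenRandPeerID satisfies via its precomputed prefix map.

package main

import "fmt"

// toyKey is an illustrative 8-bit key; zikade's real keys are 256-bit kadt.Key values.
type toyKey uint8

// commonPrefixLength counts the number of leading bits shared by a and b.
func commonPrefixLength(a, b toyKey) int {
	for i := 0; i < 8; i++ {
		mask := toyKey(1) << (7 - i)
		if a&mask != b&mask {
			return i
		}
	}
	return 8
}

// toyNodeForCpl mirrors the shape of a NodeIDForCplFunc for the toy key space:
// it returns a key that shares exactly cpl leading bits with k by copying the
// first cpl bits, flipping bit number cpl, and zeroing the remainder.
func toyNodeForCpl(k toyKey, cpl int) (toyKey, error) {
	if cpl < 0 || cpl >= 8 {
		return 0, fmt.Errorf("cpl out of range: %d", cpl)
	}
	keepMask := toyKey(0xFF) << (8 - cpl) // keep the first cpl bits of k
	flipBit := toyKey(1) << (7 - cpl)     // differ from k at bit index cpl
	return (k & keepMask) | (^k & flipBit), nil
}

func main() {
	k := toyKey(0b10110010)
	for cpl := 0; cpl < 8; cpl++ {
		target, _ := toyNodeForCpl(k, cpl)
		fmt.Printf("cpl=%d target=%08b common=%d\n", cpl, target, commonPrefixLength(k, target))
	}
}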
@@ -165,7 +160,7 @@ func DefaultExploreConfig() *ExploreConfig { } } -func NewExplore[K kad.Key[K], N kad.NodeID[K]](self N, rt RoutingTableCpl[K, N], cplFn NodeIDForCplFunc[K, N], schedule ExploreSchedule, cfg *ExploreConfig) (*Explore[K, N], error) { +func NewExplore[K kad.Key[K], N kad.NodeID[K]](self N, rt RoutingTableCpl[K, N], cplFn coordt.NodeIDForCplFunc[K, N], schedule ExploreSchedule, cfg *ExploreConfig) (*Explore[K, N], error) { if cfg == nil { cfg = DefaultExploreConfig() } else if err := cfg.Validate(); err != nil { From 3dc3d731c9d9edb7e45cafcca87525a13861f3f3 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 5 Oct 2023 15:41:25 +0200 Subject: [PATCH 02/23] WIP --- internal/coord/{query => routing}/crawl.go | 36 ++++++------- .../coord/{query => routing}/crawl_test.go | 52 ++++++++++++++++++- 2 files changed, 69 insertions(+), 19 deletions(-) rename internal/coord/{query => routing}/crawl.go (97%) rename internal/coord/{query => routing}/crawl_test.go (62%) diff --git a/internal/coord/query/crawl.go b/internal/coord/routing/crawl.go similarity index 97% rename from internal/coord/query/crawl.go rename to internal/coord/routing/crawl.go index 0dbe3bc..59bc0f4 100644 --- a/internal/coord/query/crawl.go +++ b/internal/coord/routing/crawl.go @@ -1,4 +1,4 @@ -package query +package routing import ( "context" @@ -42,15 +42,6 @@ func DefaultCrawlConfig() *CrawlConfig { } } -type crawlJob[K kad.Key[K], N kad.NodeID[K]] struct { - node N - target K -} - -func (c *crawlJob[K, N]) mapKey() string { - return c.node.String() + key.HexString(c.target) -} - type Crawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { self N id coordt.QueryID @@ -74,10 +65,6 @@ func NewCrawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message](self N, id coordt return nil, err } - if len(seed) == 0 { - return nil, fmt.Errorf("empty seed") - } - c := &Crawl[K, N, M]{ self: self, id: id, @@ -113,6 +100,10 @@ func NewCrawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message](self N, id coordt } } + if len(seed) == 0 { + return nil, fmt.Errorf("empty seed") + } + return c, nil } @@ -148,18 +139,18 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS continue } - job := crawlJob[K, N]{ + newJob := crawlJob[K, N]{ node: node, target: target.Key(), } - mapKey := job.mapKey() + newMapKey := newJob.mapKey() - if _, found := c.cpls[mapKey]; found { + if _, found := c.cpls[newMapKey]; found { continue } - c.cpls[mapKey] = i + c.cpls[newMapKey] = i c.todo = append(c.todo, job) } } @@ -216,6 +207,15 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS return &StateCrawlFinished{} } +type crawlJob[K kad.Key[K], N kad.NodeID[K]] struct { + node N + target K +} + +func (c *crawlJob[K, N]) mapKey() string { + return c.node.String() + key.HexString(c.target) +} + type CrawlState interface { crawlState() } diff --git a/internal/coord/query/crawl_test.go b/internal/coord/routing/crawl_test.go similarity index 62% rename from internal/coord/query/crawl_test.go rename to internal/coord/routing/crawl_test.go index 4d14b33..45283b4 100644 --- a/internal/coord/query/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -1,4 +1,4 @@ -package query +package routing import ( "context" @@ -14,6 +14,40 @@ import ( var _ coordt.StateMachine[CrawlEvent, CrawlState] = (*Crawl[tiny.Key, tiny.Node, tiny.Message])(nil) +func TestNewCrawl(t *testing.T) { + self := tiny.NewNode(0) + a := tiny.NewNode(0b10000100) + b := tiny.NewNode(0b11000000) + + t.Run("initializes 
maps", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.MaxCPL = 4 + seed := []tiny.Node{a} + qry, err := NewCrawl[tiny.Key, tiny.Node, tiny.Message](self, coordt.QueryID("test"), tiny.NodeWithCpl, seed, cfg) + require.NoError(t, err) + require.NotNil(t, qry) + require.Len(t, qry.todo, 4) + require.NotNil(t, qry.waiting) + require.NotNil(t, qry.success) + require.NotNil(t, qry.failed) + require.NotNil(t, qry.errors) + }) + + t.Run("removes self from seed", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.MaxCPL = 4 + seed := []tiny.Node{self, a, b} + qry, err := NewCrawl[tiny.Key, tiny.Node, tiny.Message](self, coordt.QueryID("test"), tiny.NodeWithCpl, seed, cfg) + require.NoError(t, err) + require.NotNil(t, qry) + require.Len(t, qry.todo, cfg.MaxCPL*2) // self is not included + require.NotNil(t, qry.waiting) + require.NotNil(t, qry.success) + require.NotNil(t, qry.failed) + require.NotNil(t, qry.errors) + }) +} + func TestCrawl_Advance(t *testing.T) { ctx := context.Background() @@ -87,6 +121,15 @@ func TestCrawl_Advance(t *testing.T) { assert.Len(t, qry.success, 2) assert.Len(t, qry.failed, 0) + moreReqs := make([]*StateCrawlFindCloser[tiny.Key, tiny.Node], cfg.MaxCPL) + moreReqs[0] = tstate + for i := 1; i < cfg.MaxCPL; i++ { + state = qry.Advance(ctx, &EventCrawlPoll{}) + tstate, ok = state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + moreReqs[i] = tstate + } + for i := 2; i < len(reqs); i++ { state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ NodeID: reqs[i].NodeID, @@ -94,6 +137,13 @@ func TestCrawl_Advance(t *testing.T) { CloserNodes: []tiny.Node{}, }) } + for i := 0; i < len(moreReqs); i++ { + state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + NodeID: moreReqs[i].NodeID, + Target: moreReqs[i].Target, + CloserNodes: []tiny.Node{}, + }) + } require.IsType(t, &StateCrawlIdle{}, state) } From 9adc0b0e8860cde239f4eadc067dd19804e766e6 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 5 Oct 2023 18:05:54 +0200 Subject: [PATCH 03/23] improve testing --- internal/coord/routing.go | 4 +- internal/coord/routing/crawl.go | 28 +++++++- internal/coord/routing/crawl_test.go | 99 +++++++++++++++++++--------- 3 files changed, 94 insertions(+), 37 deletions(-) diff --git a/internal/coord/routing.go b/internal/coord/routing.go index d1c4529..2aa6521 100644 --- a/internal/coord/routing.go +++ b/internal/coord/routing.go @@ -348,7 +348,7 @@ func NewRoutingBehaviour(self kadt.PeerID, rt routing.RoutingTableCpl[kadt.Key, probeCfg.Concurrency = cfg.ProbeRequestConcurrency probeCfg.CheckInterval = cfg.ProbeCheckInterval - probe, err := routing.NewProbe[kadt.Key](rt, probeCfg) + probe, err := routing.NewProbe[kadt.Key, kadt.PeerID](rt, probeCfg) if err != nil { return nil, fmt.Errorf("probe: %w", err) } @@ -366,7 +366,7 @@ func NewRoutingBehaviour(self kadt.PeerID, rt routing.RoutingTableCpl[kadt.Key, return nil, fmt.Errorf("explore schedule: %w", err) } - explore, err := routing.NewExplore[kadt.Key](self, rt, cplutil.GenRandPeerID[kadt.Key], schedule, exploreCfg) + explore, err := routing.NewExplore[kadt.Key, kadt.PeerID](self, rt, cplutil.GenRandPeerID, schedule, exploreCfg) if err != nil { return nil, fmt.Errorf("explore: %w", err) } diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index 59bc0f4..56172ab 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -7,6 +7,7 @@ import ( "github.com/benbjohnson/clock" 
"github.com/plprobelab/go-libdht/kad" "github.com/plprobelab/go-libdht/kad/key" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" "github.com/plprobelab/zikade/errs" @@ -109,7 +110,9 @@ func NewCrawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message](self N, id coordt func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlState) { ctx, span := tele.StartSpan(ctx, "Crawl.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) + c.setMapSizes(span, "before") defer func() { + c.setMapSizes(span, "after") span.SetAttributes(tele.AttrOutEvent(out)) span.End() }() @@ -118,6 +121,8 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS case *EventCrawlCancel: // TODO: ... case *EventCrawlNodeResponse[K, N]: + span.SetAttributes(attribute.Int("closer_nodes", len(tev.CloserNodes))) + job := crawlJob[K, N]{ node: tev.NodeID, target: tev.Target, @@ -145,17 +150,17 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS } newMapKey := newJob.mapKey() - if _, found := c.cpls[newMapKey]; found { continue } c.cpls[newMapKey] = i - c.todo = append(c.todo, job) + c.todo = append(c.todo, newJob) } } case *EventCrawlNodeFailure[K, N]: + span.RecordError(tev.Error) job := crawlJob[K, N]{ node: tev.NodeID, target: tev.Target, @@ -189,7 +194,8 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS job, c.todo = c.todo[0], c.todo[1:] // mark the job as waiting - c.waiting[job.mapKey()] = job.node + mapKey := job.mapKey() + c.waiting[mapKey] = job.node return &StateCrawlFindCloser[K, N]{ QueryID: c.id, @@ -207,6 +213,22 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS return &StateCrawlFinished{} } +func (c *Crawl[K, N, M]) setMapSizes(span trace.Span, prefix string) { + span.SetAttributes( + attribute.Int(prefix+"_todo", len(c.todo)), + attribute.Int(prefix+"_cpls", len(c.cpls)), + attribute.Int(prefix+"_waiting", len(c.waiting)), + attribute.Int(prefix+"_success", len(c.success)), + attribute.Int(prefix+"_failed", len(c.failed)), + attribute.Int(prefix+"_errors", len(c.errors)), + ) +} + +func (c *Crawl[K, N, M]) mapKey(node N, target K) string { + job := crawlJob[K, N]{node: node, target: target} + return job.mapKey() +} + type crawlJob[K kad.Key[K], N kad.NodeID[K]] struct { node N target K diff --git a/internal/coord/routing/crawl_test.go b/internal/coord/routing/crawl_test.go index 45283b4..afd8432 100644 --- a/internal/coord/routing/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -2,6 +2,7 @@ package routing import ( "context" + "fmt" "testing" "github.com/benbjohnson/clock" @@ -26,7 +27,7 @@ func TestNewCrawl(t *testing.T) { qry, err := NewCrawl[tiny.Key, tiny.Node, tiny.Message](self, coordt.QueryID("test"), tiny.NodeWithCpl, seed, cfg) require.NoError(t, err) require.NotNil(t, qry) - require.Len(t, qry.todo, 4) + require.Len(t, qry.todo, cfg.MaxCPL) require.NotNil(t, qry.waiting) require.NotNil(t, qry.success) require.NotNil(t, qry.failed) @@ -74,13 +75,19 @@ func TestCrawl_Advance(t *testing.T) { assert.Len(t, qry.waiting, 0) assert.Len(t, qry.success, 0) assert.Len(t, qry.failed, 0) + assert.Len(t, qry.errors, 0) - reqs := make([]*StateCrawlFindCloser[tiny.Key, tiny.Node], 2*cfg.MaxCPL) - for i := 0; i < 2*cfg.MaxCPL; i++ { + // Let the state machine emit all FIND_NODE RPCs. Track them as pending responses. 
+ pending := []*StateCrawlFindCloser[tiny.Key, tiny.Node]{} // tracks pending requests + for { state := qry.Advance(ctx, &EventCrawlPoll{}) tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) - require.True(t, ok, "type is %T", state) - reqs[i] = tstate + if !ok { + // Even if we're at capacity, we don't have any more FIND_NODE RRCs to do. + require.IsType(t, &StateCrawlWaitingAtCapacity{}, state) + break + } + pending = append(pending, tstate) } assert.Len(t, qry.todo, 0) @@ -88,62 +95,90 @@ func TestCrawl_Advance(t *testing.T) { assert.Len(t, qry.waiting, 2*cfg.MaxCPL) assert.Len(t, qry.success, 0) assert.Len(t, qry.failed, 0) + assert.Len(t, qry.errors, 0) + assert.Equal(t, len(qry.waiting), len(pending)) + // Poll again to verify that we're still AtCapacity state := qry.Advance(ctx, &EventCrawlPoll{}) require.IsType(t, &StateCrawlWaitingAtCapacity{}, state) + // simulate first successful response + pop, pending := pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - NodeID: reqs[0].NodeID, - Target: reqs[0].Target, + NodeID: pop.NodeID, + Target: pop.Target, CloserNodes: []tiny.Node{}, }) + + // we didn't have anything to do, so now we're waiting WITH capacity require.IsType(t, &StateCrawlWaitingWithCapacity{}, state) assert.Len(t, qry.todo, 0) assert.Len(t, qry.cpls, 2*cfg.MaxCPL) - assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) - assert.Len(t, qry.success, 1) + assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) // one less + assert.Len(t, qry.success, 1) // one successful response assert.Len(t, qry.failed, 0) + assert.Len(t, qry.errors, 0) + // pop next successful response. This time it contains a new node! + pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - NodeID: reqs[1].NodeID, - Target: reqs[1].Target, + NodeID: pop.NodeID, + Target: pop.Target, CloserNodes: []tiny.Node{c}, }) + // because the response contained a new node, we have new things to do and + // therefore expect a FIND_NODE RPC state. 
tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) require.True(t, ok, "type is %T", state) assert.Equal(t, tstate.NodeID, c) + pending = append(pending, tstate) assert.Len(t, qry.todo, 3) assert.Len(t, qry.cpls, 3*cfg.MaxCPL) - assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) + assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) // still -1 because the new FIND_NODE for c "replaced" the old one assert.Len(t, qry.success, 2) assert.Len(t, qry.failed, 0) + assert.Len(t, qry.errors, 0) + + // simulate error + pop, pending = pending[0], pending[1:] + state = qry.Advance(ctx, &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ + NodeID: pop.NodeID, + Target: pop.Target, + Error: fmt.Errorf("some error"), + }) + tstate, ok = state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + require.True(t, ok) + pending = append(pending, tstate) - moreReqs := make([]*StateCrawlFindCloser[tiny.Key, tiny.Node], cfg.MaxCPL) - moreReqs[0] = tstate - for i := 1; i < cfg.MaxCPL; i++ { - state = qry.Advance(ctx, &EventCrawlPoll{}) - tstate, ok = state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) - require.True(t, ok, "type is %T", state) - moreReqs[i] = tstate - } + assert.Len(t, qry.failed, 1) + assert.Len(t, qry.errors, 1) + assert.ErrorContains(t, qry.errors[qry.mapKey(pop.NodeID, pop.Target)], "some error") - for i := 2; i < len(reqs); i++ { - state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - NodeID: reqs[i].NodeID, - Target: reqs[i].Target, - CloserNodes: []tiny.Node{}, - }) - } - for i := 0; i < len(moreReqs); i++ { + // simulate response from random node + + for { + pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - NodeID: moreReqs[i].NodeID, - Target: moreReqs[i].Target, + NodeID: pop.NodeID, + Target: pop.Target, CloserNodes: []tiny.Node{}, }) + tstate, ok = state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + if ok { + pending = append(pending, tstate) + continue + } + + if _, ok = state.(*StateCrawlWaitingWithCapacity); ok { + // continue simulating responses + continue + } + + if _, ok = state.(*StateCrawlFinished); ok { + break + } } - - require.IsType(t, &StateCrawlIdle{}, state) } From c7f2cf39bdb395f5280cf73dc5eb8956dc293d22 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 5 Oct 2023 18:12:31 +0200 Subject: [PATCH 04/23] improve crawler configuration --- internal/coord/routing/crawl.go | 30 ++++++++++++++++++------- internal/coord/routing/crawl_test.go | 33 ++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index 56172ab..23ce9ae 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -4,7 +4,6 @@ import ( "context" "fmt" - "github.com/benbjohnson/clock" "github.com/plprobelab/go-libdht/kad" "github.com/plprobelab/go-libdht/kad/key" "go.opentelemetry.io/otel/attribute" @@ -17,19 +16,34 @@ import ( // CrawlConfig specifies optional configuration for a Crawl type CrawlConfig struct { - MaxCPL int // the maximum CPL until we should crawl the peer - Concurrency int // the maximum number of concurrent peers that we may query - Clock clock.Clock // a clock that may replaced by a mock when testing + MaxCPL int // the maximum CPL until we should crawl the peer + Concurrency int // the maximum number of concurrent peers that we may query + Tracer trace.Tracer // Tracer is the tracer that should be used to trace execution. 
} // Validate checks the configuration options and returns an error if any have invalid values. func (cfg *CrawlConfig) Validate() error { - if cfg.Clock == nil { + if cfg.MaxCPL < 1 { return &errs.ConfigurationError{ Component: "CrawlConfig", - Err: fmt.Errorf("clock must not be nil"), + Err: fmt.Errorf("max cpl must be greater than zero"), } } + + if cfg.Concurrency < 1 { + return &errs.ConfigurationError{ + Component: "CrawlConfig", + Err: fmt.Errorf("concurrency must be greater than zero"), + } + } + + if cfg.Tracer == nil { + return &errs.ConfigurationError{ + Component: "CrawlConfig", + Err: fmt.Errorf("tracer must not be nil"), + } + } + return nil } @@ -39,7 +53,7 @@ func DefaultCrawlConfig() *CrawlConfig { return &CrawlConfig{ MaxCPL: 16, Concurrency: 1, - Clock: clock.New(), // use standard time + Tracer: tele.NoopTracer(), } } @@ -109,7 +123,7 @@ func NewCrawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message](self N, id coordt } func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlState) { - ctx, span := tele.StartSpan(ctx, "Crawl.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) + _, span := c.cfg.Tracer.Start(ctx, "Crawl.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) c.setMapSizes(span, "before") defer func() { c.setMapSizes(span, "after") diff --git a/internal/coord/routing/crawl_test.go b/internal/coord/routing/crawl_test.go index afd8432..852a1ed 100644 --- a/internal/coord/routing/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -5,7 +5,6 @@ import ( "fmt" "testing" - "github.com/benbjohnson/clock" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -15,6 +14,35 @@ import ( var _ coordt.StateMachine[CrawlEvent, CrawlState] = (*Crawl[tiny.Key, tiny.Node, tiny.Message])(nil) +func TestCrawlConfig_Validate(t *testing.T) { + t.Run("default is valid", func(t *testing.T) { + cfg := DefaultCrawlConfig() + require.NoError(t, cfg.Validate()) + }) + + t.Run("tracer is not nil", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.Tracer = nil + require.Error(t, cfg.Validate()) + }) + + t.Run("max cpl positive", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.MaxCPL = 0 + require.Error(t, cfg.Validate()) + cfg.MaxCPL = -1 + require.Error(t, cfg.Validate()) + }) + + t.Run("concurrency positive", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.Concurrency = 0 + require.Error(t, cfg.Validate()) + cfg.Concurrency = -1 + require.Error(t, cfg.Validate()) + }) +} + func TestNewCrawl(t *testing.T) { self := tiny.NewNode(0) a := tiny.NewNode(0b10000100) @@ -58,10 +86,7 @@ func TestCrawl_Advance(t *testing.T) { c := tiny.NewNode(0b10100000) seed := []tiny.Node{self, a, b} - clk := clock.NewMock() - cfg := DefaultCrawlConfig() - cfg.Clock = clk cfg.MaxCPL = 4 cfg.Concurrency = 2 From 91b9d08eee0ba2a0f36063f60c068f9b16660d83 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 5 Oct 2023 19:02:55 +0200 Subject: [PATCH 05/23] WIP --- internal/coord/routing.go | 7 +- internal/coord/routing/crawl.go | 202 ++++++++++++++--------- internal/coord/routing/crawl_test.go | 236 ++++++++++++++++++++------- 3 files changed, 309 insertions(+), 136 deletions(-) diff --git a/internal/coord/routing.go b/internal/coord/routing.go index 2aa6521..c0f9fdc 100644 --- a/internal/coord/routing.go +++ b/internal/coord/routing.go @@ -299,9 +299,12 @@ type RoutingBehaviour struct { // probe is the node probing state machine, responsible for periodically checking connectivity of nodes in the routing table probe 
coordt.StateMachine[routing.ProbeEvent, routing.ProbeState] - // explore is the routing table explore state machine, responsible for increasing the occupanct of the routing table + // explore is the routing table explore state machine, responsible for increasing the occupant of the routing table explore coordt.StateMachine[routing.ExploreEvent, routing.ExploreState] + // crawl is the state machine that can crawl the network from a set of seed nodes + crawl coordt.StateMachine[routing.ExploreEvent, routing.ExploreState] + pendingMu sync.Mutex pending []BehaviourEvent ready chan struct{} @@ -371,6 +374,8 @@ func NewRoutingBehaviour(self kadt.PeerID, rt routing.RoutingTableCpl[kadt.Key, return nil, fmt.Errorf("explore: %w", err) } + // crawl, err := routing.NewCrawl(self) + return ComposeRoutingBehaviour(self, bootstrap, include, probe, explore, cfg) } diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index 23ce9ae..ecbf551 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -57,14 +57,15 @@ func DefaultCrawlConfig() *CrawlConfig { } } -type Crawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { - self N - id coordt.QueryID - - // cfg is a copy of the optional configuration supplied to the query +type Crawl[K kad.Key[K], N kad.NodeID[K]] struct { + self N cfg CrawlConfig cplFn coordt.NodeIDForCplFunc[K, N] + info *crawlInformation[K, N] // only set of crawl is in progress +} +type crawlInformation[K kad.Key[K], N kad.NodeID[K]] struct { + queryID coordt.QueryID todo []crawlJob[K, N] cpls map[string]int waiting map[string]N @@ -73,56 +74,24 @@ type Crawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { errors map[string]error } -func NewCrawl[K kad.Key[K], N kad.NodeID[K], M coordt.Message](self N, id coordt.QueryID, cplFn coordt.NodeIDForCplFunc[K, N], seed []N, cfg *CrawlConfig) (*Crawl[K, N, M], error) { +func NewCrawl[K kad.Key[K], N kad.NodeID[K]](self N, cplFn coordt.NodeIDForCplFunc[K, N], cfg *CrawlConfig) (*Crawl[K, N], error) { if cfg == nil { cfg = DefaultCrawlConfig() } else if err := cfg.Validate(); err != nil { return nil, err } - c := &Crawl[K, N, M]{ - self: self, - id: id, - cfg: *cfg, - cplFn: cplFn, - todo: make([]crawlJob[K, N], 0, len(seed)*cfg.MaxCPL), - cpls: map[string]int{}, - waiting: map[string]N{}, - success: map[string]N{}, - failed: map[string]N{}, - errors: map[string]error{}, - } - - for _, node := range seed { - // exclude self from closest nodes - if key.Equal(node.Key(), self.Key()) { - continue - } - - for i := 0; i < c.cfg.MaxCPL; i++ { - target, err := cplFn(node.Key(), i) - if err != nil { - return nil, fmt.Errorf("generate cpl: %w", err) - } - - job := crawlJob[K, N]{ - node: node, - target: target.Key(), - } - - c.cpls[job.mapKey()] = i - c.todo = append(c.todo, job) - } - } - - if len(seed) == 0 { - return nil, fmt.Errorf("empty seed") + c := &Crawl[K, N]{ + self: self, + cfg: *cfg, + cplFn: cplFn, + info: nil, } return c, nil } -func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlState) { +func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlState) { _, span := c.cfg.Tracer.Start(ctx, "Crawl.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) c.setMapSizes(span, "before") defer func() { @@ -132,23 +101,71 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS }() switch tev := ev.(type) { - case *EventCrawlCancel: - // TODO: ... 
+ case *EventCrawlStart[K, N]: + if c.info != nil { + break // query in progress, pretend it was a poll + } + + span.SetAttributes(attribute.Int("seed", len(tev.Seed))) + + ci := &crawlInformation[K, N]{ + queryID: tev.QueryID, + todo: []crawlJob[K, N]{}, + cpls: map[string]int{}, + waiting: map[string]N{}, + success: map[string]N{}, + failed: map[string]N{}, + errors: map[string]error{}, + } + + for _, node := range tev.Seed { + // exclude self from closest nodes + if key.Equal(node.Key(), c.self.Key()) { + continue + } + + for j := 0; j < c.cfg.MaxCPL; j++ { + target, err := c.cplFn(node.Key(), j) + if err != nil { + // return nil, fmt.Errorf("generate cpl: %w", err) + // TODO: log + continue + } + + job := crawlJob[K, N]{ + node: node, + target: target.Key(), + } + + ci.cpls[job.mapKey()] = j + ci.todo = append(ci.todo, job) + } + } + + c.info = ci + case *EventCrawlNodeResponse[K, N]: span.SetAttributes(attribute.Int("closer_nodes", len(tev.CloserNodes))) + if c.info == nil { + return &StateCrawlIdle{} + } else if c.info.queryID != tev.QueryID { + // if we don't know this query, pretend it was a poll by breaking + break + } + job := crawlJob[K, N]{ node: tev.NodeID, target: tev.Target, } mapKey := job.mapKey() - if _, found := c.waiting[mapKey]; !found { + if _, found := c.info.waiting[mapKey]; !found { break } - delete(c.waiting, mapKey) - c.success[mapKey] = tev.NodeID + delete(c.info.waiting, mapKey) + c.info.success[mapKey] = tev.NodeID for _, node := range tev.CloserNodes { for i := 0; i < c.cfg.MaxCPL; i++ { @@ -164,16 +181,23 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS } newMapKey := newJob.mapKey() - if _, found := c.cpls[newMapKey]; found { + if _, found := c.info.cpls[newMapKey]; found { continue } - c.cpls[newMapKey] = i - c.todo = append(c.todo, newJob) + c.info.cpls[newMapKey] = i + c.info.todo = append(c.info.todo, newJob) } } case *EventCrawlNodeFailure[K, N]: + if c.info == nil { + return &StateCrawlIdle{} + } else if c.info.queryID != tev.QueryID { + // if we don't know this query, pretend it was a poll by breaking + break + } + span.RecordError(tev.Error) job := crawlJob[K, N]{ node: tev.NodeID, @@ -181,13 +205,13 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS } mapKey := job.mapKey() - if _, found := c.waiting[mapKey]; !found { + if _, found := c.info.waiting[mapKey]; !found { break } - delete(c.waiting, mapKey) - c.failed[mapKey] = tev.NodeID - c.errors[mapKey] = tev.Error + delete(c.info.waiting, mapKey) + c.info.failed[mapKey] = tev.NodeID + c.info.errors[mapKey] = tev.Error case *EventCrawlPoll: // no event to process @@ -195,50 +219,59 @@ func (c *Crawl[K, N, M]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlS panic(fmt.Sprintf("unexpected event: %T", tev)) } - if len(c.waiting) >= c.cfg.MaxCPL*c.cfg.Concurrency { + if c.info == nil { + return &StateCrawlIdle{} + } + + if len(c.info.waiting) >= c.cfg.MaxCPL*c.cfg.Concurrency { return &StateCrawlWaitingAtCapacity{ - QueryID: c.id, + QueryID: c.info.queryID, } } - if len(c.todo) > 0 { + if len(c.info.todo) > 0 { // pop next crawl job from queue var job crawlJob[K, N] - job, c.todo = c.todo[0], c.todo[1:] + job, c.info.todo = c.info.todo[0], c.info.todo[1:] // mark the job as waiting mapKey := job.mapKey() - c.waiting[mapKey] = job.node + c.info.waiting[mapKey] = job.node return &StateCrawlFindCloser[K, N]{ - QueryID: c.id, + QueryID: c.info.queryID, Target: job.target, NodeID: job.node, } } - if len(c.waiting) > 0 { + if 
len(c.info.waiting) > 0 { return &StateCrawlWaitingWithCapacity{ - QueryID: c.id, + QueryID: c.info.queryID, } } + c.info = nil return &StateCrawlFinished{} } -func (c *Crawl[K, N, M]) setMapSizes(span trace.Span, prefix string) { +func (c *Crawl[K, N]) setMapSizes(span trace.Span, prefix string) { + if c.info == nil { + return + } + span.SetAttributes( - attribute.Int(prefix+"_todo", len(c.todo)), - attribute.Int(prefix+"_cpls", len(c.cpls)), - attribute.Int(prefix+"_waiting", len(c.waiting)), - attribute.Int(prefix+"_success", len(c.success)), - attribute.Int(prefix+"_failed", len(c.failed)), - attribute.Int(prefix+"_errors", len(c.errors)), + attribute.Int(prefix+"_todo", len(c.info.todo)), + attribute.Int(prefix+"_cpls", len(c.info.cpls)), + attribute.Int(prefix+"_waiting", len(c.info.waiting)), + attribute.Int(prefix+"_success", len(c.info.success)), + attribute.Int(prefix+"_failed", len(c.info.failed)), + attribute.Int(prefix+"_errors", len(c.info.errors)), ) } -func (c *Crawl[K, N, M]) mapKey(node N, target K) string { +func (c *Crawl[K, N]) mapKey(node N, target K) string { job := crawlJob[K, N]{node: node, target: target} return job.mapKey() } @@ -258,7 +291,9 @@ type CrawlState interface { type StateCrawlIdle struct{} -type StateCrawlFinished struct{} +type StateCrawlFinished struct { + QueryID coordt.QueryID +} type StateCrawlWaitingAtCapacity struct { QueryID coordt.QueryID @@ -287,22 +322,31 @@ type CrawlEvent interface { // EventCrawlPoll is an event that signals a [Crawl] that it can perform housekeeping work. type EventCrawlPoll struct{} -type EventCrawlCancel struct{} +// type EventCrawlCancel struct{} // TODO: implement + +type EventCrawlStart[K kad.Key[K], N kad.NodeID[K]] struct { + QueryID coordt.QueryID + Seed []N +} type EventCrawlNodeResponse[K kad.Key[K], N kad.NodeID[K]] struct { + QueryID coordt.QueryID NodeID N // the node the message was sent to Target K // the key that the node was asked for CloserNodes []N // the closer nodes sent by the node } type EventCrawlNodeFailure[K kad.Key[K], N kad.NodeID[K]] struct { - NodeID N // the node the message was sent to - Target K // the key that the node was asked for - Error error // the error that caused the failure, if any + QueryID coordt.QueryID + NodeID N // the node the message was sent to + Target K // the key that the node was asked for + Error error // the error that caused the failure, if any } // crawlEvent() ensures that only events accepted by [Crawl] can be assigned to a [CrawlEvent]. 
-func (*EventCrawlPoll) crawlEvent() {} -func (*EventCrawlCancel) crawlEvent() {} +func (*EventCrawlPoll) crawlEvent() {} + +// func (*EventCrawlCancel) crawlEvent() {} +func (*EventCrawlStart[K, N]) crawlEvent() {} func (*EventCrawlNodeResponse[K, N]) crawlEvent() {} func (*EventCrawlNodeFailure[K, N]) crawlEvent() {} diff --git a/internal/coord/routing/crawl_test.go b/internal/coord/routing/crawl_test.go index 852a1ed..0b181e9 100644 --- a/internal/coord/routing/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -12,7 +12,7 @@ import ( "github.com/plprobelab/zikade/internal/coord/internal/tiny" ) -var _ coordt.StateMachine[CrawlEvent, CrawlState] = (*Crawl[tiny.Key, tiny.Node, tiny.Message])(nil) +var _ coordt.StateMachine[CrawlEvent, CrawlState] = (*Crawl[tiny.Key, tiny.Node])(nil) func TestCrawlConfig_Validate(t *testing.T) { t.Run("default is valid", func(t *testing.T) { @@ -43,43 +43,101 @@ func TestCrawlConfig_Validate(t *testing.T) { }) } -func TestNewCrawl(t *testing.T) { +func TestNewCrawl_Start(t *testing.T) { self := tiny.NewNode(0) a := tiny.NewNode(0b10000100) b := tiny.NewNode(0b11000000) - t.Run("initializes maps", func(t *testing.T) { + t.Run("does not fail with default config", func(t *testing.T) { cfg := DefaultCrawlConfig() cfg.MaxCPL = 4 - seed := []tiny.Node{a} - qry, err := NewCrawl[tiny.Key, tiny.Node, tiny.Message](self, coordt.QueryID("test"), tiny.NodeWithCpl, seed, cfg) + qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) require.NoError(t, err) - require.NotNil(t, qry) - require.Len(t, qry.todo, cfg.MaxCPL) - require.NotNil(t, qry.waiting) - require.NotNil(t, qry.success) - require.NotNil(t, qry.failed) - require.NotNil(t, qry.errors) + require.Nil(t, qry.info) }) t.Run("removes self from seed", func(t *testing.T) { cfg := DefaultCrawlConfig() cfg.MaxCPL = 4 - seed := []tiny.Node{self, a, b} - qry, err := NewCrawl[tiny.Key, tiny.Node, tiny.Message](self, coordt.QueryID("test"), tiny.NodeWithCpl, seed, cfg) + qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) require.NoError(t, err) - require.NotNil(t, qry) - require.Len(t, qry.todo, cfg.MaxCPL*2) // self is not included - require.NotNil(t, qry.waiting) - require.NotNil(t, qry.success) - require.NotNil(t, qry.failed) - require.NotNil(t, qry.errors) + + qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("test"), + Seed: []tiny.Node{self, a, b}, + }) + require.NotNil(t, qry.info) + require.Len(t, qry.info.todo, cfg.MaxCPL*2-1) // self is not included + require.Len(t, qry.info.waiting, 1) + require.Len(t, qry.info.success, 0) + require.Len(t, qry.info.failed, 0) + require.Len(t, qry.info.errors, 0) + }) + + t.Run("removes self from seed (no left", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.MaxCPL = 4 + qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) + require.NoError(t, err) + + state := qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("test"), + Seed: []tiny.Node{self}, + }) + require.Nil(t, qry.info) + require.IsType(t, &StateCrawlFinished{}, state) + }) + + t.Run("handles duplicate starts (does not panic)", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.MaxCPL = 4 + qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) + require.NoError(t, err) + + seed := []tiny.Node{a, b} + qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("test"), + Seed: seed, + }) + + 
qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("test"), + Seed: seed, + }) + + qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("another"), + Seed: seed, + }) + }) + + t.Run("handles events if no crawl started", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.MaxCPL = 4 + qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) + require.NoError(t, err) + + state := qry.Advance(context.Background(), &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("test"), + }) + require.IsType(t, &StateCrawlIdle{}, state) + + state = qry.Advance(context.Background(), &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("test"), + }) + require.IsType(t, &StateCrawlIdle{}, state) + + state = qry.Advance(context.Background(), &EventCrawlPoll{}) + require.IsType(t, &StateCrawlIdle{}, state) }) } func TestCrawl_Advance(t *testing.T) { ctx := context.Background() + // Let the state machine emit all FIND_NODE RPCs. Track them as pending responses. + pending := []*StateCrawlFindCloser[tiny.Key, tiny.Node]{} // tracks pending requests + self := tiny.NewNode(0) a := tiny.NewNode(0b10000100) b := tiny.NewNode(0b11000000) @@ -92,20 +150,26 @@ func TestCrawl_Advance(t *testing.T) { queryID := coordt.QueryID("test") - qry, err := NewCrawl[tiny.Key, tiny.Node, tiny.Message](self, queryID, tiny.NodeWithCpl, seed, cfg) + qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) require.NoError(t, err) - assert.Len(t, qry.todo, 2*cfg.MaxCPL) - assert.Len(t, qry.cpls, 2*cfg.MaxCPL) - assert.Len(t, qry.waiting, 0) - assert.Len(t, qry.success, 0) - assert.Len(t, qry.failed, 0) - assert.Len(t, qry.errors, 0) + state := qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ + QueryID: queryID, + Seed: seed, + }) + assert.Len(t, qry.info.todo, 2*cfg.MaxCPL-1) + assert.Len(t, qry.info.cpls, 2*cfg.MaxCPL) + assert.Len(t, qry.info.waiting, 1) + assert.Len(t, qry.info.success, 0) + assert.Len(t, qry.info.failed, 0) + assert.Len(t, qry.info.errors, 0) + + tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + pending = append(pending, tstate) - // Let the state machine emit all FIND_NODE RPCs. Track them as pending responses. - pending := []*StateCrawlFindCloser[tiny.Key, tiny.Node]{} // tracks pending requests for { - state := qry.Advance(ctx, &EventCrawlPoll{}) + state = qry.Advance(ctx, &EventCrawlPoll{}) tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) if !ok { // Even if we're at capacity, we don't have any more FIND_NODE RRCs to do. 
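To see how these test interactions map onto real usage, here is a minimal sketch of a driver loop as it could look inside internal/coord once the crawl state machine is wired into the routing behaviour. The sendFindCloser parameter and the synchronous loop are illustrative assumptions; in zikade the coordinator dispatches outgoing FIND_NODE requests through its network behaviour and feeds responses back asynchronously.

package coord

import (
	"context"
	"fmt"

	"github.com/plprobelab/zikade/internal/coord/coordt"
	"github.com/plprobelab/zikade/internal/coord/routing"
	"github.com/plprobelab/zikade/kadt"
)

// runCrawl drives a Crawl state machine to completion in a simple synchronous
// loop. sendFindCloser is a hypothetical helper that performs one FIND_NODE
// RPC and returns the closer nodes it learned about.
func runCrawl(
	ctx context.Context,
	sm *routing.Crawl[kadt.Key, kadt.PeerID],
	seed []kadt.PeerID,
	sendFindCloser func(ctx context.Context, to kadt.PeerID, target kadt.Key) ([]kadt.PeerID, error),
) error {
	// kick the crawl off with the seed nodes
	var ev routing.CrawlEvent = &routing.EventCrawlStart[kadt.Key, kadt.PeerID]{
		QueryID: coordt.QueryID("crawl"),
		Seed:    seed,
	}

	for {
		switch st := sm.Advance(ctx, ev).(type) {
		case *routing.StateCrawlFindCloser[kadt.Key, kadt.PeerID]:
			// the state machine asks us to query st.NodeID for st.Target;
			// feed the outcome back in as the next event.
			closer, err := sendFindCloser(ctx, st.NodeID, st.Target)
			if err != nil {
				ev = &routing.EventCrawlNodeFailure[kadt.Key, kadt.PeerID]{
					QueryID: st.QueryID,
					NodeID:  st.NodeID,
					Target:  st.Target,
					Error:   err,
				}
			} else {
				ev = &routing.EventCrawlNodeResponse[kadt.Key, kadt.PeerID]{
					QueryID:     st.QueryID,
					NodeID:      st.NodeID,
					Target:      st.Target,
					CloserNodes: closer,
				}
			}
		case *routing.StateCrawlWaitingAtCapacity, *routing.StateCrawlWaitingWithCapacity:
			// nothing to send right now; a real behaviour would wait for an
			// inbound response instead of polling in a tight loop.
			ev = &routing.EventCrawlPoll{}
		case *routing.StateCrawlFinished, *routing.StateCrawlIdle:
			// crawl completed (or no crawl is in progress)
			return nil
		default:
			return fmt.Errorf("unexpected crawl state: %T", st)
		}
	}
}

The state machine itself stays free of I/O: it only tells the caller which node to contact for which target and absorbs the results, which is what makes table-driven tests like the ones in this diff possible.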
@@ -115,21 +179,22 @@ func TestCrawl_Advance(t *testing.T) { pending = append(pending, tstate) } - assert.Len(t, qry.todo, 0) - assert.Len(t, qry.cpls, 2*cfg.MaxCPL) - assert.Len(t, qry.waiting, 2*cfg.MaxCPL) - assert.Len(t, qry.success, 0) - assert.Len(t, qry.failed, 0) - assert.Len(t, qry.errors, 0) - assert.Equal(t, len(qry.waiting), len(pending)) + assert.Len(t, qry.info.todo, 0) + assert.Len(t, qry.info.cpls, 2*cfg.MaxCPL) + assert.Len(t, qry.info.waiting, 2*cfg.MaxCPL) + assert.Len(t, qry.info.success, 0) + assert.Len(t, qry.info.failed, 0) + assert.Len(t, qry.info.errors, 0) + assert.Equal(t, len(qry.info.waiting), len(pending)) // Poll again to verify that we're still AtCapacity - state := qry.Advance(ctx, &EventCrawlPoll{}) + state = qry.Advance(ctx, &EventCrawlPoll{}) require.IsType(t, &StateCrawlWaitingAtCapacity{}, state) // simulate first successful response pop, pending := pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + QueryID: queryID, NodeID: pop.NodeID, Target: pop.Target, CloserNodes: []tiny.Node{}, @@ -138,16 +203,17 @@ func TestCrawl_Advance(t *testing.T) { // we didn't have anything to do, so now we're waiting WITH capacity require.IsType(t, &StateCrawlWaitingWithCapacity{}, state) - assert.Len(t, qry.todo, 0) - assert.Len(t, qry.cpls, 2*cfg.MaxCPL) - assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) // one less - assert.Len(t, qry.success, 1) // one successful response - assert.Len(t, qry.failed, 0) - assert.Len(t, qry.errors, 0) + assert.Len(t, qry.info.todo, 0) + assert.Len(t, qry.info.cpls, 2*cfg.MaxCPL) + assert.Len(t, qry.info.waiting, 2*cfg.MaxCPL-1) // one less + assert.Len(t, qry.info.success, 1) // one successful response + assert.Len(t, qry.info.failed, 0) + assert.Len(t, qry.info.errors, 0) // pop next successful response. This time it contains a new node! pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + QueryID: queryID, NodeID: pop.NodeID, Target: pop.Target, CloserNodes: []tiny.Node{c}, @@ -155,38 +221,40 @@ func TestCrawl_Advance(t *testing.T) { // because the response contained a new node, we have new things to do and // therefore expect a FIND_NODE RPC state. 
- tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + tstate, ok = state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) require.True(t, ok, "type is %T", state) assert.Equal(t, tstate.NodeID, c) pending = append(pending, tstate) - assert.Len(t, qry.todo, 3) - assert.Len(t, qry.cpls, 3*cfg.MaxCPL) - assert.Len(t, qry.waiting, 2*cfg.MaxCPL-1) // still -1 because the new FIND_NODE for c "replaced" the old one - assert.Len(t, qry.success, 2) - assert.Len(t, qry.failed, 0) - assert.Len(t, qry.errors, 0) + assert.Len(t, qry.info.todo, 3) + assert.Len(t, qry.info.cpls, 3*cfg.MaxCPL) + assert.Len(t, qry.info.waiting, 2*cfg.MaxCPL-1) // still -1 because the new FIND_NODE for c "replaced" the old one + assert.Len(t, qry.info.success, 2) + assert.Len(t, qry.info.failed, 0) + assert.Len(t, qry.info.errors, 0) // simulate error pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ - NodeID: pop.NodeID, - Target: pop.Target, - Error: fmt.Errorf("some error"), + QueryID: queryID, + NodeID: pop.NodeID, + Target: pop.Target, + Error: fmt.Errorf("some error"), }) tstate, ok = state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) require.True(t, ok) pending = append(pending, tstate) - assert.Len(t, qry.failed, 1) - assert.Len(t, qry.errors, 1) - assert.ErrorContains(t, qry.errors[qry.mapKey(pop.NodeID, pop.Target)], "some error") + assert.Len(t, qry.info.failed, 1) + assert.Len(t, qry.info.errors, 1) + assert.ErrorContains(t, qry.info.errors[qry.mapKey(pop.NodeID, pop.Target)], "some error") // simulate response from random node for { pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + QueryID: queryID, NodeID: pop.NodeID, Target: pop.Target, CloserNodes: []tiny.Node{}, @@ -203,7 +271,63 @@ func TestCrawl_Advance(t *testing.T) { } if _, ok = state.(*StateCrawlFinished); ok { + require.Nil(t, qry.info) break } } } + +func TestCrawl_Advance_unrelated_response(t *testing.T) { + ctx := context.Background() + + self := tiny.NewNode(0) + a := tiny.NewNode(0b10000100) + seed := []tiny.Node{self, a} + + cfg := DefaultCrawlConfig() + cfg.MaxCPL = 1 + cfg.Concurrency = 2 + + queryID := coordt.QueryID("test") + + qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) + require.NoError(t, err) + + state := qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ + QueryID: queryID, + Seed: seed, + }) + tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + + state = qry.Advance(ctx, &EventCrawlPoll{}) + require.IsType(t, &StateCrawlWaitingWithCapacity{}, state) + + // send it an unrelated response + state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("another"), + NodeID: tstate.NodeID, + Target: tstate.Target, + CloserNodes: []tiny.Node{}, + }) + require.IsType(t, &StateCrawlWaitingWithCapacity{}, state) // still waiting with capacity because the response was ignored + + // send it an unrelated response + state = qry.Advance(ctx, &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ + QueryID: coordt.QueryID("another"), + NodeID: tstate.NodeID, + Target: tstate.Target, + Error: fmt.Errorf("some error"), + }) + require.IsType(t, &StateCrawlWaitingWithCapacity{}, state) // still waiting with capacity because the response was ignored + + // send correct response + state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ + QueryID: queryID, + NodeID: tstate.NodeID, 
+ Target: tstate.Target, + CloserNodes: []tiny.Node{}, + }) + require.IsType(t, &StateCrawlFinished{}, state) + require.Nil(t, qry.info) +} From 2a64316c420afeac17c59559de7490a818190b7f Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 6 Oct 2023 16:16:44 +0200 Subject: [PATCH 06/23] WIP --- internal/coord/routing/crawl_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/coord/routing/crawl_test.go b/internal/coord/routing/crawl_test.go index 0b181e9..4ddb2e5 100644 --- a/internal/coord/routing/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -9,7 +9,7 @@ import ( "github.com/stretchr/testify/require" "github.com/plprobelab/zikade/internal/coord/coordt" - "github.com/plprobelab/zikade/internal/coord/internal/tiny" + "github.com/plprobelab/zikade/internal/tiny" ) var _ coordt.StateMachine[CrawlEvent, CrawlState] = (*Crawl[tiny.Key, tiny.Node])(nil) From 0bd2b48ad55329febf960f2b3f1a492e4845afb4 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 6 Oct 2023 17:51:24 +0200 Subject: [PATCH 07/23] WIP --- internal/coord/event.go | 6 ++ internal/coord/routing.go | 93 +++++++++++++++++++++++++--- internal/coord/routing/crawl.go | 52 +++++----------- internal/coord/routing/crawl_test.go | 4 +- internal/coord/routing_test.go | 55 +++++++++++++--- 5 files changed, 154 insertions(+), 56 deletions(-) diff --git a/internal/coord/event.go b/internal/coord/event.go index 6766ea7..ba94477 100644 --- a/internal/coord/event.go +++ b/internal/coord/event.go @@ -245,3 +245,9 @@ type EventRoutingPoll struct{} func (*EventRoutingPoll) behaviourEvent() {} func (*EventRoutingPoll) routingCommand() {} + +type EventStartCrawl struct { + Seed []kadt.PeerID +} + +func (*EventStartCrawl) behaviourEvent() {} diff --git a/internal/coord/routing.go b/internal/coord/routing.go index c0f9fdc..91cb209 100644 --- a/internal/coord/routing.go +++ b/internal/coord/routing.go @@ -303,7 +303,7 @@ type RoutingBehaviour struct { explore coordt.StateMachine[routing.ExploreEvent, routing.ExploreState] // crawl is the state machine that can crawl the network from a set of seed nodes - crawl coordt.StateMachine[routing.ExploreEvent, routing.ExploreState] + crawl coordt.StateMachine[routing.CrawlEvent, routing.CrawlState] pendingMu sync.Mutex pending []BehaviourEvent @@ -374,9 +374,14 @@ func NewRoutingBehaviour(self kadt.PeerID, rt routing.RoutingTableCpl[kadt.Key, return nil, fmt.Errorf("explore: %w", err) } - // crawl, err := routing.NewCrawl(self) + crawlCfg := routing.DefaultCrawlConfig() - return ComposeRoutingBehaviour(self, bootstrap, include, probe, explore, cfg) + crawl, err := routing.NewCrawl(self, cplutil.GenRandPeerID, crawlCfg) + if err != nil { + return nil, fmt.Errorf("crawl: %w", err) + } + + return ComposeRoutingBehaviour(self, bootstrap, include, probe, explore, crawl, cfg) } // ComposeRoutingBehaviour creates a [RoutingBehaviour] composed of the supplied state machines. 
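A minimal usage sketch (illustrative only; it assumes rb is the composed *RoutingBehaviour from above and seeds is a []kadt.PeerID of bootstrap peers): starting a crawl mirrors starting a bootstrap and goes through the behaviour's Notify method, which the notify handler below translates into a routing.EventCrawlStart for the crawl state machine.

    // rb and seeds are assumed to exist in the caller's scope
    rb.Notify(ctx, &EventStartCrawl{
        Seed: seeds, // peers the crawl starts from
    })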
@@ -387,6 +392,7 @@ func ComposeRoutingBehaviour( include coordt.StateMachine[routing.IncludeEvent, routing.IncludeState], probe coordt.StateMachine[routing.ProbeEvent, routing.ProbeState], explore coordt.StateMachine[routing.ExploreEvent, routing.ExploreState], + crawl coordt.StateMachine[routing.CrawlEvent, routing.CrawlState], cfg *RoutingConfig, ) (*RoutingBehaviour, error) { if cfg == nil { @@ -402,6 +408,7 @@ func ComposeRoutingBehaviour( include: include, probe: probe, explore: explore, + crawl: crawl, ready: make(chan struct{}, 1), } return r, nil @@ -418,12 +425,11 @@ func (r *RoutingBehaviour) Notify(ctx context.Context, ev BehaviourEvent) { // notify must only be called while r.pendingMu is held func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { - ctx, span := r.cfg.Tracer.Start(ctx, "RoutingBehaviour.notify", trace.WithAttributes(attribute.String("event", fmt.Sprintf("%T", ev)))) + ctx, span := r.cfg.Tracer.Start(ctx, "RoutingBehaviour.notify", trace.WithAttributes(tele.AttrInEvent(ev))) defer span.End() switch ev := ev.(type) { case *EventStartBootstrap: - span.SetAttributes(attribute.String("event", "EventStartBootstrap")) cmd := &routing.EventBootstrapStart[kadt.Key, kadt.PeerID]{ KnownClosestNodes: ev.SeedNodes, } @@ -433,8 +439,17 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { r.pending = append(r.pending, next) } + case *EventStartCrawl: + cmd := &routing.EventCrawlStart[kadt.Key, kadt.PeerID]{ + Seed: ev.Seed, + } + // attempt to advance the bootstrap + next, ok := r.advanceCrawl(ctx, cmd) + if ok { + r.pending = append(r.pending, next) + } + case *EventAddNode: - span.SetAttributes(attribute.String("event", "EventAddAddrInfo")) // Ignore self if r.self.Equal(ev.NodeID) { break @@ -450,7 +465,7 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { } case *EventRoutingUpdated: - span.SetAttributes(attribute.String("event", "EventRoutingUpdated"), attribute.String("nodeid", ev.NodeID.String())) + span.SetAttributes(attribute.String("nodeid", ev.NodeID.String())) cmd := &routing.EventProbeAdd[kadt.Key, kadt.PeerID]{ NodeID: ev.NodeID, } @@ -533,9 +548,23 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { r.pending = append(r.pending, next) } + case routing.CrawlQueryID: + cmd := &routing.EventCrawlNodeResponse[kadt.Key, kadt.PeerID]{ + NodeID: ev.To, + Target: ev.Target, + CloserNodes: ev.CloserNodes, + } + + // attempt to advance the crawl + next, ok := r.advanceCrawl(ctx, cmd) + if ok { + r.pending = append(r.pending, next) + } + default: panic(fmt.Sprintf("unexpected query id: %s", ev.QueryID)) } + case *EventGetCloserNodesFailure: span.SetAttributes(attribute.String("event", "EventGetCloserNodesFailure"), attribute.String("queryid", string(ev.QueryID)), attribute.String("nodeid", ev.To.String())) span.RecordError(ev.Err) @@ -580,10 +609,22 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { if ok { r.pending = append(r.pending, next) } + case routing.CrawlQueryID: + cmd := &routing.EventCrawlNodeFailure[kadt.Key, kadt.PeerID]{ + NodeID: ev.To, + Target: ev.Target, + Error: ev.Err, + } + // attempt to advance the crawl + next, ok := r.advanceCrawl(ctx, cmd) + if ok { + r.pending = append(r.pending, next) + } default: panic(fmt.Sprintf("unexpected query id: %s", ev.QueryID)) } + case *EventNotifyConnectivity: span.SetAttributes(attribute.String("event", "EventNotifyConnectivity"), attribute.String("nodeid", ev.NodeID.String())) // ignore 
self @@ -609,6 +650,7 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { if ok { r.pending = append(r.pending, nextProbe) } + case *EventNotifyNonConnectivity: span.SetAttributes(attribute.String("event", "EventNotifyConnectivity"), attribute.String("nodeid", ev.NodeID.String())) @@ -620,6 +662,7 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { if ok { r.pending = append(r.pending, nextProbe) } + case *EventRoutingPoll: r.pollChildren(ctx) @@ -693,6 +736,11 @@ func (r *RoutingBehaviour) pollChildren(ctx context.Context) { if ok { r.pending = append(r.pending, ev) } + + ev, ok = r.advanceCrawl(ctx, &routing.EventCrawlPoll{}) + if ok { + r.pending = append(r.pending, ev) + } } func (r *RoutingBehaviour) advanceBootstrap(ctx context.Context, ev routing.BootstrapEvent) (BehaviourEvent, bool) { @@ -817,9 +865,9 @@ func (r *RoutingBehaviour) advanceProbe(ctx context.Context, ev routing.ProbeEve func (r *RoutingBehaviour) advanceExplore(ctx context.Context, ev routing.ExploreEvent) (BehaviourEvent, bool) { ctx, span := r.cfg.Tracer.Start(ctx, "RoutingBehaviour.advanceExplore") defer span.End() + bstate := r.explore.Advance(ctx, ev) switch st := bstate.(type) { - case *routing.StateExploreFindCloser[kadt.Key, kadt.PeerID]: r.cfg.Logger.Debug("starting explore", slog.Int("cpl", st.Cpl), tele.LogAttrPeerID(st.NodeID)) return &EventOutboundGetCloserNodes{ @@ -845,3 +893,32 @@ func (r *RoutingBehaviour) advanceExplore(ctx context.Context, ev routing.Explor return nil, false } + +func (r *RoutingBehaviour) advanceCrawl(ctx context.Context, ev routing.CrawlEvent) (BehaviourEvent, bool) { + ctx, span := r.cfg.Tracer.Start(ctx, "RoutingBehaviour.advanceCrawl") + defer span.End() + + cstate := r.crawl.Advance(ctx, ev) + switch st := cstate.(type) { + case *routing.StateCrawlFindCloser[kadt.Key, kadt.PeerID]: + return &EventOutboundGetCloserNodes{ + QueryID: routing.CrawlQueryID, + To: st.NodeID, + Target: st.Target, + Notify: r, + }, true + + case *routing.StateCrawlWaitingWithCapacity: + // crawl waiting for a message response but has capacity to do more + case *routing.StateCrawlWaitingAtCapacity: + // crawl waiting for a message response but has no capacity to do more + case *routing.StateCrawlFinished: + r.cfg.Logger.Info("crawl finished") + case *routing.StateCrawlIdle: + // bootstrap not running, nothing to do + default: + panic(fmt.Sprintf("unexpected explore state: %T", st)) + } + + return nil, false +} diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index ecbf551..5e8a706 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -14,6 +14,9 @@ import ( "github.com/plprobelab/zikade/tele" ) +// CrawlQueryID is the id for the query operated by the crawl state machine +const CrawlQueryID = coordt.QueryID("crawl") + // CrawlConfig specifies optional configuration for a Crawl type CrawlConfig struct { MaxCPL int // the maximum CPL until we should crawl the peer @@ -65,7 +68,6 @@ type Crawl[K kad.Key[K], N kad.NodeID[K]] struct { } type crawlInformation[K kad.Key[K], N kad.NodeID[K]] struct { - queryID coordt.QueryID todo []crawlJob[K, N] cpls map[string]int waiting map[string]N @@ -109,7 +111,6 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat span.SetAttributes(attribute.Int("seed", len(tev.Seed))) ci := &crawlInformation[K, N]{ - queryID: tev.QueryID, todo: []crawlJob[K, N]{}, cpls: map[string]int{}, waiting: map[string]N{}, @@ -149,9 +150,6 @@ func (c 
*Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat if c.info == nil { return &StateCrawlIdle{} - } else if c.info.queryID != tev.QueryID { - // if we don't know this query, pretend it was a poll by breaking - break } job := crawlJob[K, N]{ @@ -193,9 +191,6 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat case *EventCrawlNodeFailure[K, N]: if c.info == nil { return &StateCrawlIdle{} - } else if c.info.queryID != tev.QueryID { - // if we don't know this query, pretend it was a poll by breaking - break } span.RecordError(tev.Error) @@ -224,9 +219,7 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat } if len(c.info.waiting) >= c.cfg.MaxCPL*c.cfg.Concurrency { - return &StateCrawlWaitingAtCapacity{ - QueryID: c.info.queryID, - } + return &StateCrawlWaitingAtCapacity{} } if len(c.info.todo) > 0 { @@ -240,16 +233,13 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat c.info.waiting[mapKey] = job.node return &StateCrawlFindCloser[K, N]{ - QueryID: c.info.queryID, - Target: job.target, - NodeID: job.node, + Target: job.target, + NodeID: job.node, } } if len(c.info.waiting) > 0 { - return &StateCrawlWaitingWithCapacity{ - QueryID: c.info.queryID, - } + return &StateCrawlWaitingWithCapacity{} } c.info = nil @@ -291,21 +281,14 @@ type CrawlState interface { type StateCrawlIdle struct{} -type StateCrawlFinished struct { - QueryID coordt.QueryID -} +type StateCrawlFinished struct{} -type StateCrawlWaitingAtCapacity struct { - QueryID coordt.QueryID -} -type StateCrawlWaitingWithCapacity struct { - QueryID coordt.QueryID -} +type StateCrawlWaitingAtCapacity struct{} +type StateCrawlWaitingWithCapacity struct{} type StateCrawlFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { - QueryID coordt.QueryID - Target K // the key that the query wants to find closer nodes for - NodeID N // the node to send the message to + Target K // the key that the query wants to find closer nodes for + NodeID N // the node to send the message to } // crawlState() ensures that only [Crawl] states can be assigned to a CrawlState. @@ -325,22 +308,19 @@ type EventCrawlPoll struct{} // type EventCrawlCancel struct{} // TODO: implement type EventCrawlStart[K kad.Key[K], N kad.NodeID[K]] struct { - QueryID coordt.QueryID - Seed []N + Seed []N } type EventCrawlNodeResponse[K kad.Key[K], N kad.NodeID[K]] struct { - QueryID coordt.QueryID NodeID N // the node the message was sent to Target K // the key that the node was asked for CloserNodes []N // the closer nodes sent by the node } type EventCrawlNodeFailure[K kad.Key[K], N kad.NodeID[K]] struct { - QueryID coordt.QueryID - NodeID N // the node the message was sent to - Target K // the key that the node was asked for - Error error // the error that caused the failure, if any + NodeID N // the node the message was sent to + Target K // the key that the node was asked for + Error error // the error that caused the failure, if any } // crawlEvent() ensures that only events accepted by [Crawl] can be assigned to a [CrawlEvent]. 
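Taken together, the events and states above form the contract between the crawl state machine and its driver: start it with a seed, answer every StateCrawlFindCloser by issuing a FIND_NODE RPC, and report each outcome back as a response or failure event. A rough sketch of such a driver loop (illustrative only; it assumes it runs inside a function that already has ctx, a crawl instance parameterised with kadt.Key and kadt.PeerID as routing.go instantiates it, and a seed slice):

    state := crawl.Advance(ctx, &EventCrawlStart[kadt.Key, kadt.PeerID]{Seed: seed})
    for {
        switch state.(type) {
        case *StateCrawlFindCloser[kadt.Key, kadt.PeerID]:
            // the state carries the NodeID to contact and the Target key; send a
            // FIND_NODE RPC and feed the outcome back via EventCrawlNodeResponse
            // or EventCrawlNodeFailure instead of the plain poll below
        case *StateCrawlWaitingAtCapacity, *StateCrawlWaitingWithCapacity:
            // requests are in flight; wait for responses before advancing again
        case *StateCrawlFinished, *StateCrawlIdle:
            return // crawl complete (or never started)
        }
        state = crawl.Advance(ctx, &EventCrawlPoll{})
    }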
diff --git a/internal/coord/routing/crawl_test.go b/internal/coord/routing/crawl_test.go index 4ddb2e5..54ea12d 100644 --- a/internal/coord/routing/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -305,7 +305,7 @@ func TestCrawl_Advance_unrelated_response(t *testing.T) { // send it an unrelated response state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("another"), + QueryID: "another", NodeID: tstate.NodeID, Target: tstate.Target, CloserNodes: []tiny.Node{}, @@ -314,7 +314,7 @@ func TestCrawl_Advance_unrelated_response(t *testing.T) { // send it an unrelated response state = qry.Advance(ctx, &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("another"), + QueryID: "another", NodeID: tstate.NodeID, Target: tstate.Target, Error: fmt.Errorf("some error"), diff --git a/internal/coord/routing_test.go b/internal/coord/routing_test.go index c2f8dd5..952b66b 100644 --- a/internal/coord/routing_test.go +++ b/internal/coord/routing_test.go @@ -37,6 +37,11 @@ func idleExplore() *RecordingSM[routing.ExploreEvent, routing.ExploreState] { return NewRecordingSM[routing.ExploreEvent, routing.ExploreState](&routing.StateExploreIdle{}) } +// idleCrawl returns an crawl state machine that is always idle +func idleCrawl() *RecordingSM[routing.CrawlEvent, routing.CrawlState] { + return NewRecordingSM[routing.CrawlEvent, routing.CrawlState](&routing.StateCrawlIdle{}) +} + func TestRoutingConfigValidate(t *testing.T) { t.Run("default is valid", func(t *testing.T) { cfg := DefaultRoutingConfig() @@ -225,7 +230,7 @@ func TestRoutingStartBootstrapSendsEvent(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, bootstrap, idleInclude(), idleProbe(), idleExplore(), cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, bootstrap, idleInclude(), idleProbe(), idleExplore(), idleCrawl(), cfg) require.NoError(t, err) ev := &EventStartBootstrap{ @@ -255,7 +260,7 @@ func TestRoutingBootstrapGetClosestNodesSuccess(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, bootstrap, idleInclude(), idleProbe(), idleExplore(), cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, bootstrap, idleInclude(), idleProbe(), idleExplore(), idleCrawl(), cfg) require.NoError(t, err) ev := &EventGetCloserNodesSuccess{ @@ -289,7 +294,7 @@ func TestRoutingBootstrapGetClosestNodesFailure(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, bootstrap, idleInclude(), idleProbe(), idleExplore(), cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, bootstrap, idleInclude(), idleProbe(), idleExplore(), idleCrawl(), cfg) require.NoError(t, err) failure := errors.New("failed") @@ -324,7 +329,7 @@ func TestRoutingAddNodeInfoSendsEvent(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), include, idleProbe(), idleExplore(), cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), include, idleProbe(), idleExplore(), idleCrawl(), cfg) require.NoError(t, err) ev := &EventAddNode{ @@ -354,7 +359,7 @@ func TestRoutingIncludeGetClosestNodesSuccess(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), include, idleProbe(), idleExplore(), cfg) + routingBehaviour, err := 
ComposeRoutingBehaviour(self, idleBootstrap(), include, idleProbe(), idleExplore(), idleCrawl(), cfg) require.NoError(t, err) ev := &EventGetCloserNodesSuccess{ @@ -387,7 +392,7 @@ func TestRoutingIncludeGetClosestNodesFailure(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), include, idleProbe(), idleExplore(), cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), include, idleProbe(), idleExplore(), idleCrawl(), cfg) require.NoError(t, err) failure := errors.New("failed") @@ -431,7 +436,7 @@ func TestRoutingIncludedNodeAddToProbeList(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), include, probe, idleExplore(), cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), include, probe, idleExplore(), idleCrawl(), cfg) require.NoError(t, err) // a new node to be included @@ -510,7 +515,7 @@ func TestRoutingExploreSendsEvent(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), idleInclude(), idleProbe(), explore, cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), idleInclude(), idleProbe(), explore, idleCrawl(), cfg) require.NoError(t, err) routingBehaviour.Notify(ctx, &EventRoutingPoll{}) @@ -543,7 +548,7 @@ func TestRoutingExploreGetClosestNodesSuccess(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), idleInclude(), idleProbe(), explore, cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), idleInclude(), idleProbe(), explore, idleCrawl(), cfg) require.NoError(t, err) ev := &EventGetCloserNodesSuccess{ @@ -576,7 +581,7 @@ func TestRoutingExploreGetClosestNodesFailure(t *testing.T) { cfg := DefaultRoutingConfig() cfg.Clock = clk - routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), idleInclude(), idleProbe(), explore, cfg) + routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), idleInclude(), idleProbe(), explore, idleCrawl(), cfg) require.NoError(t, err) failure := errors.New("failed") @@ -596,3 +601,33 @@ func TestRoutingExploreGetClosestNodesFailure(t *testing.T) { require.Equal(t, peer.ID(nodes[1].NodeID), peer.ID(rev.NodeID)) require.Equal(t, failure, rev.Error) } + +func TestRoutingStartCrawlSendsEvent(t *testing.T) { + ctx := kadtest.CtxShort(t) + + clk := clock.NewMock() + _, nodes, err := nettest.LinearTopology(4, clk) + require.NoError(t, err) + + self := nodes[0].NodeID + + // records the event passed to bootstrap + crawl := NewRecordingSM[routing.CrawlEvent, routing.CrawlState](&routing.StateCrawlIdle{}) + + cfg := DefaultRoutingConfig() + cfg.Clock = clk + routingBehaviour, err := ComposeRoutingBehaviour(self, idleBootstrap(), idleInclude(), idleProbe(), idleExplore(), crawl, cfg) + require.NoError(t, err) + + ev := &EventStartCrawl{ + Seed: []kadt.PeerID{nodes[1].NodeID}, + } + + routingBehaviour.Notify(ctx, ev) + + // the event that should be passed to the bootstrap state machine + expected := &routing.EventCrawlStart[kadt.Key, kadt.PeerID]{ + Seed: ev.Seed, + } + require.Equal(t, expected, crawl.Received) +} From 7a432c7d6333ac662c7a836d32c1aeba24d7ffc6 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 12 Oct 2023 10:40:36 +0200 Subject: [PATCH 08/23] WIP --- internal/coord/routing/crawl.go | 8 
++-- internal/coord/routing/crawl_test.go | 55 +++++++++------------------- 2 files changed, 22 insertions(+), 41 deletions(-) diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index 5e8a706..712fd6f 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -193,7 +193,6 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat return &StateCrawlIdle{} } - span.RecordError(tev.Error) job := crawlJob[K, N]{ node: tev.NodeID, target: tev.Target, @@ -203,6 +202,7 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat if _, found := c.info.waiting[mapKey]; !found { break } + span.RecordError(tev.Error) delete(c.info.waiting, mapKey) c.info.failed[mapKey] = tev.NodeID @@ -283,8 +283,10 @@ type StateCrawlIdle struct{} type StateCrawlFinished struct{} -type StateCrawlWaitingAtCapacity struct{} -type StateCrawlWaitingWithCapacity struct{} +type ( + StateCrawlWaitingAtCapacity struct{} + StateCrawlWaitingWithCapacity struct{} +) type StateCrawlFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { Target K // the key that the query wants to find closer nodes for diff --git a/internal/coord/routing/crawl_test.go b/internal/coord/routing/crawl_test.go index 54ea12d..53f2c9a 100644 --- a/internal/coord/routing/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -63,8 +63,7 @@ func TestNewCrawl_Start(t *testing.T) { require.NoError(t, err) qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("test"), - Seed: []tiny.Node{self, a, b}, + Seed: []tiny.Node{self, a, b}, }) require.NotNil(t, qry.info) require.Len(t, qry.info.todo, cfg.MaxCPL*2-1) // self is not included @@ -81,8 +80,7 @@ func TestNewCrawl_Start(t *testing.T) { require.NoError(t, err) state := qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("test"), - Seed: []tiny.Node{self}, + Seed: []tiny.Node{self}, }) require.Nil(t, qry.info) require.IsType(t, &StateCrawlFinished{}, state) @@ -96,18 +94,15 @@ func TestNewCrawl_Start(t *testing.T) { seed := []tiny.Node{a, b} qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("test"), - Seed: seed, + Seed: seed, }) qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("test"), - Seed: seed, + Seed: seed, }) qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("another"), - Seed: seed, + Seed: seed, }) }) @@ -117,14 +112,10 @@ func TestNewCrawl_Start(t *testing.T) { qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) require.NoError(t, err) - state := qry.Advance(context.Background(), &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("test"), - }) + state := qry.Advance(context.Background(), &EventCrawlNodeResponse[tiny.Key, tiny.Node]{}) require.IsType(t, &StateCrawlIdle{}, state) - state = qry.Advance(context.Background(), &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ - QueryID: coordt.QueryID("test"), - }) + state = qry.Advance(context.Background(), &EventCrawlNodeFailure[tiny.Key, tiny.Node]{}) require.IsType(t, &StateCrawlIdle{}, state) state = qry.Advance(context.Background(), &EventCrawlPoll{}) @@ -148,14 +139,11 @@ func TestCrawl_Advance(t *testing.T) { cfg.MaxCPL = 4 cfg.Concurrency = 2 - queryID := coordt.QueryID("test") - qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) require.NoError(t, 
err) state := qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ - QueryID: queryID, - Seed: seed, + Seed: seed, }) assert.Len(t, qry.info.todo, 2*cfg.MaxCPL-1) assert.Len(t, qry.info.cpls, 2*cfg.MaxCPL) @@ -194,7 +182,6 @@ func TestCrawl_Advance(t *testing.T) { // simulate first successful response pop, pending := pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - QueryID: queryID, NodeID: pop.NodeID, Target: pop.Target, CloserNodes: []tiny.Node{}, @@ -213,7 +200,6 @@ func TestCrawl_Advance(t *testing.T) { // pop next successful response. This time it contains a new node! pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - QueryID: queryID, NodeID: pop.NodeID, Target: pop.Target, CloserNodes: []tiny.Node{c}, @@ -236,10 +222,9 @@ func TestCrawl_Advance(t *testing.T) { // simulate error pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ - QueryID: queryID, - NodeID: pop.NodeID, - Target: pop.Target, - Error: fmt.Errorf("some error"), + NodeID: pop.NodeID, + Target: pop.Target, + Error: fmt.Errorf("some error"), }) tstate, ok = state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) require.True(t, ok) @@ -254,7 +239,6 @@ func TestCrawl_Advance(t *testing.T) { for { pop, pending = pending[0], pending[1:] state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - QueryID: queryID, NodeID: pop.NodeID, Target: pop.Target, CloserNodes: []tiny.Node{}, @@ -282,20 +266,18 @@ func TestCrawl_Advance_unrelated_response(t *testing.T) { self := tiny.NewNode(0) a := tiny.NewNode(0b10000100) + b := tiny.NewNode(0b10010100) seed := []tiny.Node{self, a} cfg := DefaultCrawlConfig() cfg.MaxCPL = 1 cfg.Concurrency = 2 - queryID := coordt.QueryID("test") - qry, err := NewCrawl[tiny.Key, tiny.Node](self, tiny.NodeWithCpl, cfg) require.NoError(t, err) state := qry.Advance(context.Background(), &EventCrawlStart[tiny.Key, tiny.Node]{ - QueryID: queryID, - Seed: seed, + Seed: seed, }) tstate, ok := state.(*StateCrawlFindCloser[tiny.Key, tiny.Node]) require.True(t, ok, "type is %T", state) @@ -305,8 +287,7 @@ func TestCrawl_Advance_unrelated_response(t *testing.T) { // send it an unrelated response state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - QueryID: "another", - NodeID: tstate.NodeID, + NodeID: b, Target: tstate.Target, CloserNodes: []tiny.Node{}, }) @@ -314,16 +295,14 @@ func TestCrawl_Advance_unrelated_response(t *testing.T) { // send it an unrelated response state = qry.Advance(ctx, &EventCrawlNodeFailure[tiny.Key, tiny.Node]{ - QueryID: "another", - NodeID: tstate.NodeID, - Target: tstate.Target, - Error: fmt.Errorf("some error"), + NodeID: b, + Target: tstate.Target, + Error: fmt.Errorf("some error"), }) require.IsType(t, &StateCrawlWaitingWithCapacity{}, state) // still waiting with capacity because the response was ignored // send correct response state = qry.Advance(ctx, &EventCrawlNodeResponse[tiny.Key, tiny.Node]{ - QueryID: queryID, NodeID: tstate.NodeID, Target: tstate.Target, CloserNodes: []tiny.Node{}, From 4acfc835ae8ecc6b27e6428397ecbe5f769d043c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 12 Oct 2023 13:00:20 +0200 Subject: [PATCH 09/23] WIP --- fullrt.go | 23 ++++++++++ internal/coord/event.go | 3 +- internal/coord/routing.go | 30 +++++++++---- internal/coord/routing/crawl.go | 35 ++++++++++++--- internal/coord/routing/crawl_test.go | 21 +++++++-- 
internal/coord/routing/include.go | 62 +++++++++++++++++++------- internal/coord/routing/include_test.go | 49 ++++++++++++++++++++ 7 files changed, 190 insertions(+), 33 deletions(-) diff --git a/fullrt.go b/fullrt.go index 4e219d4..5f1667b 100644 --- a/fullrt.go +++ b/fullrt.go @@ -3,9 +3,11 @@ package zikade import ( "context" "fmt" + "time" "github.com/ipfs/go-cid" record "github.com/libp2p/go-libp2p-record" + "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/peerstore" @@ -20,6 +22,27 @@ import ( type FullRT struct { *DHT + + cfg *FullRTConfig +} + +type FullRTConfig struct { + *Config + CrawlInterval time.Duration +} + +func NewFullRT(h host.Host, cfg *FullRTConfig) (*FullRT, error) { + d, err := New(h, cfg.Config) + if err != nil { + return nil, fmt.Errorf("new DHT: %w", err) + } + + frt := &FullRT{ + DHT: d, + cfg: cfg, + } + + return frt, nil } var _ routing.Routing = (*FullRT)(nil) diff --git a/internal/coord/event.go b/internal/coord/event.go index ba94477..28f884e 100644 --- a/internal/coord/event.go +++ b/internal/coord/event.go @@ -117,7 +117,8 @@ func (*EventStopQuery) queryCommand() {} // EventAddNode notifies the routing behaviour of a potential new peer. type EventAddNode struct { - NodeID kadt.PeerID + NodeID kadt.PeerID + Checked bool // indicates whether this node has already passed a connectivity check and should be added to the routing table right away } func (*EventAddNode) behaviourEvent() {} diff --git a/internal/coord/routing.go b/internal/coord/routing.go index 91cb209..8d212ef 100644 --- a/internal/coord/routing.go +++ b/internal/coord/routing.go @@ -433,7 +433,7 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { cmd := &routing.EventBootstrapStart[kadt.Key, kadt.PeerID]{ KnownClosestNodes: ev.SeedNodes, } - // attempt to advance the bootstrap + // attempt to advance the bootstrap state machine next, ok := r.advanceBootstrap(ctx, cmd) if ok { r.pending = append(r.pending, next) @@ -443,7 +443,7 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { cmd := &routing.EventCrawlStart[kadt.Key, kadt.PeerID]{ Seed: ev.Seed, } - // attempt to advance the bootstrap + // attempt to advance the crawl state machine next, ok := r.advanceCrawl(ctx, cmd) if ok { r.pending = append(r.pending, next) @@ -454,11 +454,20 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { if r.self.Equal(ev.NodeID) { break } - // TODO: apply ttl - cmd := &routing.EventIncludeAddCandidate[kadt.Key, kadt.PeerID]{ - NodeID: ev.NodeID, + + var cmd routing.IncludeEvent + if ev.Checked { + cmd = &routing.EventIncludeNode[kadt.Key, kadt.PeerID]{ + NodeID: ev.NodeID, + } + } else { + // TODO: apply ttl + cmd = &routing.EventIncludeAddCandidate[kadt.Key, kadt.PeerID]{ + NodeID: ev.NodeID, + } } - // attempt to advance the include + + // attempt to advance the include state machine next, ok := r.advanceInclude(ctx, cmd) if ok { r.pending = append(r.pending, next) @@ -549,6 +558,11 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { } case routing.CrawlQueryID: + r.pending = append(r.pending, &EventAddNode{ + NodeID: ev.To, + Checked: true, + }) + cmd := &routing.EventCrawlNodeResponse[kadt.Key, kadt.PeerID]{ NodeID: ev.To, Target: ev.Target, @@ -642,7 +656,7 @@ func (r *RoutingBehaviour) notify(ctx context.Context, ev BehaviourEvent) { r.pending = append(r.pending, next) } - // tell the probe state 
machine in case there is are connectivity checks that could satisfied + // tell the probe state machine in case there are connectivity checks that could be satisfied cmdProbe := &routing.EventProbeNotifyConnectivity[kadt.Key, kadt.PeerID]{ NodeID: ev.NodeID, } @@ -912,7 +926,7 @@ func (r *RoutingBehaviour) advanceCrawl(ctx context.Context, ev routing.CrawlEve // crawl waiting for a message response but has capacity to do more case *routing.StateCrawlWaitingAtCapacity: // crawl waiting for a message response but has no capacity to do more - case *routing.StateCrawlFinished: + case *routing.StateCrawlFinished[kadt.Key, kadt.PeerID]: r.cfg.Logger.Info("crawl finished") case *routing.StateCrawlIdle: // bootstrap not running, nothing to do diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index 712fd6f..9741394 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -3,6 +3,7 @@ package routing import ( "context" "fmt" + "time" "github.com/plprobelab/go-libdht/kad" "github.com/plprobelab/go-libdht/kad/key" @@ -19,9 +20,10 @@ const CrawlQueryID = coordt.QueryID("crawl") // CrawlConfig specifies optional configuration for a Crawl type CrawlConfig struct { - MaxCPL int // the maximum CPL until we should crawl the peer - Concurrency int // the maximum number of concurrent peers that we may query - Tracer trace.Tracer // Tracer is the tracer that should be used to trace execution. + MaxCPL int // the maximum CPL until we should crawl the peer + Interval time.Duration // the interval in which the network should be crawled (0 means no crawling) + Concurrency int // the maximum number of concurrent peers that we may query + Tracer trace.Tracer // Tracer is the tracer that should be used to trace execution. } // Validate checks the configuration options and returns an error if any have invalid values. @@ -33,6 +35,13 @@ func (cfg *CrawlConfig) Validate() error { } } + if cfg.Interval < 0 { + return &errs.ConfigurationError{ + Component: "CrawlConfig", + Err: fmt.Errorf("crawl interval must be zero or positive"), + } + } + if cfg.Concurrency < 1 { return &errs.ConfigurationError{ Component: "CrawlConfig", @@ -242,8 +251,20 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat return &StateCrawlWaitingWithCapacity{} } + // generate list of new nodes for the routing table + nodes := make([]N, len(c.info.success)) + i := 0 + for _, node := range c.info.success { + nodes[i] = node + i += 1 + } + + // clear info to indicate that we're idle c.info = nil - return &StateCrawlFinished{} + + return &StateCrawlFinished[K, N]{ + Nodes: nodes, + } } func (c *Crawl[K, N]) setMapSizes(span trace.Span, prefix string) { @@ -281,7 +302,9 @@ type CrawlState interface { type StateCrawlIdle struct{} -type StateCrawlFinished struct{} +type StateCrawlFinished[K kad.Key[K], N kad.NodeID[K]] struct { + Nodes []N +} type ( StateCrawlWaitingAtCapacity struct{} @@ -294,7 +317,7 @@ type StateCrawlFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { } // crawlState() ensures that only [Crawl] states can be assigned to a CrawlState. 
-func (*StateCrawlFinished) crawlState() {} +func (*StateCrawlFinished[K, N]) crawlState() {} func (*StateCrawlFindCloser[K, N]) crawlState() {} func (*StateCrawlWaitingAtCapacity) crawlState() {} func (*StateCrawlWaitingWithCapacity) crawlState() {} diff --git a/internal/coord/routing/crawl_test.go b/internal/coord/routing/crawl_test.go index 53f2c9a..dc7f1aa 100644 --- a/internal/coord/routing/crawl_test.go +++ b/internal/coord/routing/crawl_test.go @@ -20,6 +20,16 @@ func TestCrawlConfig_Validate(t *testing.T) { require.NoError(t, cfg.Validate()) }) + t.Run("crawl interval must be 0 or positive", func(t *testing.T) { + cfg := DefaultCrawlConfig() + cfg.Interval = 0 + require.NoError(t, cfg.Validate()) + + cfg = DefaultCrawlConfig() + cfg.Interval = -1 + require.Error(t, cfg.Validate()) + }) + t.Run("tracer is not nil", func(t *testing.T) { cfg := DefaultCrawlConfig() cfg.Tracer = nil @@ -83,7 +93,7 @@ func TestNewCrawl_Start(t *testing.T) { Seed: []tiny.Node{self}, }) require.Nil(t, qry.info) - require.IsType(t, &StateCrawlFinished{}, state) + require.IsType(t, &StateCrawlFinished[tiny.Key, tiny.Node]{}, state) }) t.Run("handles duplicate starts (does not panic)", func(t *testing.T) { @@ -254,8 +264,9 @@ func TestCrawl_Advance(t *testing.T) { continue } - if _, ok = state.(*StateCrawlFinished); ok { + if tstate, ok := state.(*StateCrawlFinished[tiny.Key, tiny.Node]); ok { require.Nil(t, qry.info) + require.Len(t, tstate.Nodes, 11) break } } @@ -307,6 +318,10 @@ func TestCrawl_Advance_unrelated_response(t *testing.T) { Target: tstate.Target, CloserNodes: []tiny.Node{}, }) - require.IsType(t, &StateCrawlFinished{}, state) + fstate, ok := state.(*StateCrawlFinished[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + require.Nil(t, qry.info) + require.Len(t, fstate.Nodes, 1) + require.Equal(t, tstate.NodeID, fstate.Nodes[0]) } diff --git a/internal/coord/routing/include.go b/internal/coord/routing/include.go index 1bd4a71..3da3c55 100644 --- a/internal/coord/routing/include.go +++ b/internal/coord/routing/include.go @@ -222,6 +222,18 @@ func (in *Include[K, N]) Advance(ctx context.Context, ev IncludeEvent) (out Incl } in.candidates.Enqueue(ctx, tev.NodeID) + case *EventIncludeNode[K, N]: + delete(in.checks, key.HexString(tev.NodeID.Key())) + if in.rt.AddNode(tev.NodeID) { + return &StateIncludeRoutingUpdated[K, N]{ + NodeID: tev.NodeID, + } + } + + // no need to remove the node from the candidate queue because we + // will enqueue as many nodes from the queue until we find one that + // is not yet included in the routing table. + case *EventIncludeConnectivityCheckSuccess[K, N]: in.counterChecksPassed.Add(ctx, 1) ch, ok := in.checks[key.HexString(tev.NodeID.Key())] @@ -252,25 +264,37 @@ func (in *Include[K, N]) Advance(ctx context.Context, ev IncludeEvent) (out Incl return &StateIncludeWaitingAtCapacity{} } - candidate, ok := in.candidates.Dequeue(ctx) - if !ok { - // No candidate in queue - if len(in.checks) > 0 { - return &StateIncludeWaitingWithCapacity{} + // dequeue multiple candidates and check if they are already in the routing + // table. If they are, we won't start a check for them. This can happen + // we have added them directly to the routing table via [EventIncludeNode]. 
+ for { + candidate, ok := in.candidates.Dequeue(ctx) + if !ok { + break } - return &StateIncludeIdle{} - } - in.checks[key.HexString(candidate.Key())] = check[K, N]{ - NodeID: candidate, - Started: in.cfg.Clock.Now(), + if _, exists := in.rt.GetNode(candidate.Key()); exists { + continue + } + + in.checks[key.HexString(candidate.Key())] = check[K, N]{ + NodeID: candidate, + Started: in.cfg.Clock.Now(), + } + + // Ask the node to find itself + in.counterChecksSent.Add(ctx, 1) + return &StateIncludeConnectivityCheck[K, N]{ + NodeID: candidate, + } } - // Ask the node to find itself - in.counterChecksSent.Add(ctx, 1) - return &StateIncludeConnectivityCheck[K, N]{ - NodeID: candidate, + // No candidate in queue + if len(in.checks) > 0 { + return &StateIncludeWaitingWithCapacity{} } + + return &StateIncludeIdle{} } // nodeQueue is a bounded queue of unique NodeIDs @@ -304,7 +328,7 @@ func (q *nodeQueue[K, N]) Enqueue(ctx context.Context, id N) bool { return true } -// Dequeue reads an node from the queue. It returns the node and a true value +// Dequeue reads a node from the queue. It returns the node and a true value // if a node was read or nil and false if no node was read. func (q *nodeQueue[K, N]) Dequeue(ctx context.Context) (N, bool) { if len(q.nodes) == 0 { @@ -379,6 +403,13 @@ type EventIncludeAddCandidate[K kad.Key[K], N kad.NodeID[K]] struct { NodeID N // the candidate node } +// EventIncludeNode notifies an [Include] that a node should be added to the +// routing table straight away. This means this node will skip the candidate +// queue and potential checks. +type EventIncludeNode[K kad.Key[K], N kad.NodeID[K]] struct { + NodeID N // the node to be added +} + // EventIncludeConnectivityCheckSuccess notifies an [Include] that a requested connectivity check has received a successful response. type EventIncludeConnectivityCheckSuccess[K kad.Key[K], N kad.NodeID[K]] struct { NodeID N // the node the message was sent to @@ -393,5 +424,6 @@ type EventIncludeConnectivityCheckFailure[K kad.Key[K], N kad.NodeID[K]] struct // includeEvent() ensures that only events accepted by an [Include] can be assigned to the [IncludeEvent] interface. 
func (*EventIncludePoll) includeEvent() {} func (*EventIncludeAddCandidate[K, N]) includeEvent() {} +func (*EventIncludeNode[K, N]) includeEvent() {} func (*EventIncludeConnectivityCheckSuccess[K, N]) includeEvent() {} func (*EventIncludeConnectivityCheckFailure[K, N]) includeEvent() {} diff --git a/internal/coord/routing/include_test.go b/internal/coord/routing/include_test.go index 6c89dc8..27f4957 100644 --- a/internal/coord/routing/include_test.go +++ b/internal/coord/routing/include_test.go @@ -288,3 +288,52 @@ func TestIncludeConnectivityCheckFailure(t *testing.T) { require.False(t, found) require.Zero(t, foundNode) } + +func TestIncludeAddNodeSkipsCandidateQueue(t *testing.T) { + ctx := context.Background() + clk := clock.NewMock() + cfg := DefaultIncludeConfig() + cfg.Clock = clk + cfg.Concurrency = 1 + + rt, err := triert.New[tiny.Key, tiny.Node](tiny.NewNode(128), nil) + require.NoError(t, err) + p, err := NewInclude[tiny.Key, tiny.Node](rt, cfg) + require.NoError(t, err) + + candidate1 := tiny.NewNode(0b00000100) + candidate2 := tiny.NewNode(0b00000110) + + state := p.Advance(ctx, &EventIncludeAddCandidate[tiny.Key, tiny.Node]{ + NodeID: candidate1, + }) + require.IsType(t, &StateIncludeConnectivityCheck[tiny.Key, tiny.Node]{}, state) + + state = p.Advance(ctx, &EventIncludeAddCandidate[tiny.Key, tiny.Node]{ + NodeID: candidate2, + }) + require.IsType(t, &StateIncludeWaitingAtCapacity{}, state) + + state = p.Advance(ctx, &EventIncludeNode[tiny.Key, tiny.Node]{ + NodeID: candidate2, + }) + tstate, ok := state.(*StateIncludeRoutingUpdated[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + require.Equal(t, tstate.NodeID, candidate2) + + // there's still candidate2 in the candidates queue because we're at capacity + require.Len(t, p.candidates.nodes, 1) + + state = p.Advance(ctx, &EventIncludeConnectivityCheckSuccess[tiny.Key, tiny.Node]{ + NodeID: candidate1, + }) + tstate, ok = state.(*StateIncludeRoutingUpdated[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + require.Equal(t, tstate.NodeID, candidate1) + + // there's still candidate2 in the candidates queue because we're at capacity + require.Len(t, p.candidates.nodes, 1) + + state = p.Advance(ctx, &EventIncludePoll{}) + require.IsType(t, &StateIncludeIdle{}, state) +} From d99b8897a108b0de48877541cbc6777fa41ba51f Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 13 Oct 2023 09:38:16 +0200 Subject: [PATCH 10/23] WIP --- fullrt.go | 86 +++++++++++++++++++++++++----- fullrt_test.go | 23 ++++++++ internal/coord/coordinator.go | 68 +++++++++++++++++------ internal/coord/coordinator_test.go | 6 +-- internal/coord/event.go | 6 ++- internal/coord/query.go | 16 +++--- internal/coord/query/iter.go | 49 +++++++++++++++++ internal/coord/query/pool.go | 54 +++++++++++++------ internal/coord/query/strategy.go | 13 +++++ internal/coord/routing/crawl.go | 2 +- query_test.go | 2 +- routing.go | 16 ++++-- 12 files changed, 277 insertions(+), 64 deletions(-) create mode 100644 fullrt_test.go create mode 100644 internal/coord/query/strategy.go diff --git a/fullrt.go b/fullrt.go index 5f1667b..230771c 100644 --- a/fullrt.go +++ b/fullrt.go @@ -5,6 +5,11 @@ import ( "fmt" "time" + "github.com/multiformats/go-multiaddr" + + "github.com/plprobelab/zikade/internal/coord" + "github.com/plprobelab/zikade/internal/coord/query" + "github.com/ipfs/go-cid" record "github.com/libp2p/go-libp2p-record" "github.com/libp2p/go-libp2p/core/host" @@ -29,6 +34,15 @@ type FullRT struct { type FullRTConfig struct { *Config 
CrawlInterval time.Duration + QuorumFrac float64 +} + +func DefaultFullRTConfig() *FullRTConfig { + return &FullRTConfig{ + Config: DefaultConfig(), + CrawlInterval: time.Hour, // MAGIC + QuorumFrac: 0.25, // MAGIC + } } func NewFullRT(h host.Host, cfg *FullRTConfig) (*FullRT, error) { @@ -47,16 +61,16 @@ func NewFullRT(h host.Host, cfg *FullRTConfig) (*FullRT, error) { var _ routing.Routing = (*FullRT)(nil) -func (f *FullRT) FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error) { +func (f *FullRT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, error) { ctx, span := f.tele.Tracer.Start(ctx, "DHT.FindPeer") defer span.End() // First check locally. If we are or were recently connected to the peer, // return the addresses from our peerstore unless the information doesn't // contain any. - switch f.host.Network().Connectedness(id) { + switch f.host.Network().Connectedness(pid) { case network.Connected, network.CanConnect: - addrInfo := f.host.Peerstore().PeerInfo(id) + addrInfo := f.host.Peerstore().PeerInfo(pid) if addrInfo.ID != "" && len(addrInfo.Addrs) > 0 { return addrInfo, nil } @@ -64,25 +78,59 @@ func (f *FullRT) FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error // we're not connected or were recently connected } - var foundPeer peer.ID + maddrsMap := make(map[multiaddr.Multiaddr]struct{}) + quorum := int(float64(20) * f.cfg.QuorumFrac) // TODO: does not need to be 20 (can be less if routing table isn't full) - generally parameterize fn := func(ctx context.Context, visited kadt.PeerID, msg *pb.Message, stats coordt.QueryStats) error { - if peer.ID(visited) == id { - foundPeer = peer.ID(visited) + for _, addrInfo := range msg.CloserPeersAddrInfos() { + if addrInfo.ID != pid { + continue + } + + for _, maddr := range addrInfo.Addrs { + maddrsMap[maddr] = struct{}{} + } + } + + quorum -= 1 + if quorum == 0 { return coordt.ErrSkipRemaining } + return nil } - _, _, err := f.kad.QueryClosest(ctx, kadt.PeerID(id).Key(), fn, 20) + _, _, err := f.kad.QueryClosest(ctx, kadt.PeerID(pid).Key(), fn, f.queryConfig()) if err != nil { return peer.AddrInfo{}, fmt.Errorf("failed to run query: %w", err) } - if foundPeer == "" { - return peer.AddrInfo{}, fmt.Errorf("peer record not found") + if len(maddrsMap) == 0 { + return peer.AddrInfo{}, routing.ErrNotFound + } + + maddrs := make([]multiaddr.Multiaddr, 0, len(maddrsMap)) + for maddr := range maddrsMap { + maddrs = append(maddrs, maddr) + } + + connCtx, cancel := context.WithTimeout(ctx, 5*time.Second) // TODO: put timeout in config + defer cancel() + _ = f.host.Connect(connCtx, peer.AddrInfo{ + ID: pid, + Addrs: maddrs, + }) + + switch f.host.Network().Connectedness(pid) { + case network.Connected, network.CanConnect: + addrInfo := f.host.Peerstore().PeerInfo(pid) + if addrInfo.ID != "" && len(addrInfo.Addrs) > 0 { + return addrInfo, nil + } + default: + // we're not connected or were recently connected } - return f.host.Peerstore().PeerInfo(foundPeer), nil + return peer.AddrInfo{}, routing.ErrNotFound } func (f *FullRT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { @@ -126,8 +174,13 @@ func (f *FullRT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { }, } + seed, err := f.kad.GetClosestNodes(ctx, msg.Target(), f.cfg.BucketSize) + if err != nil { + return fmt.Errorf("get closest nodes from routing table: %w", err) + } + // finally, find the closest peers to the target key. 
- return f.kad.BroadcastRecord(ctx, msg) + return f.kad.BroadcastStatic(ctx, msg, seed) } // PutValue satisfies the [routing.Routing] interface and will add the given @@ -175,8 +228,8 @@ func (f *FullRT) PutValue(ctx context.Context, keyStr string, value []byte, opts func (f *FullRT) Bootstrap(ctx context.Context) error { ctx, span := f.tele.Tracer.Start(ctx, "DHT.Bootstrap") defer span.End() - f.log.Info("Starting bootstrap") + f.log.Info("Starting crawl bootstrap") seed := make([]kadt.PeerID, len(f.cfg.BootstrapPeers)) for i, addrInfo := range f.cfg.BootstrapPeers { seed[i] = kadt.PeerID(addrInfo.ID) @@ -186,5 +239,12 @@ func (f *FullRT) Bootstrap(ctx context.Context) error { f.host.Peerstore().AddAddrs(addrInfo.ID, addrInfo.Addrs, peerstore.PermanentAddrTTL) } - return f.kad.Bootstrap(ctx, seed) + return f.kad.Crawl(ctx, seed) +} + +func (f *FullRT) queryConfig() *coord.QueryConfig { + cfg := coord.DefaultQueryConfig() + cfg.NumResults = f.cfg.BucketSize + cfg.Strategy = &query.QueryStrategyStatic{} + return cfg } diff --git a/fullrt_test.go b/fullrt_test.go new file mode 100644 index 0000000..65187e7 --- /dev/null +++ b/fullrt_test.go @@ -0,0 +1,23 @@ +package zikade + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestNewFullRT(t *testing.T) { + cfg := &FullRTConfig{ + Config: DefaultConfig(), + CrawlInterval: 60 * time.Minute, + } + h := newTestHost(t) + fullRT, err := NewFullRT(h, cfg) + require.NoError(t, err) + + fullRT.Bootstrap(context.Background()) + + time.Sleep(time.Hour) +} diff --git a/internal/coord/coordinator.go b/internal/coord/coordinator.go index 83335fc..08d2b17 100644 --- a/internal/coord/coordinator.go +++ b/internal/coord/coordinator.go @@ -8,6 +8,8 @@ import ( "sync" "sync/atomic" + "github.com/plprobelab/zikade/internal/coord/query" + "github.com/benbjohnson/clock" "github.com/plprobelab/go-libdht/kad" "go.opentelemetry.io/otel" @@ -290,6 +292,18 @@ func (c *Coordinator) GetClosestNodes(ctx context.Context, k kadt.Key, n int) ([ return c.rt.NearestNodes(k, n), nil } +type QueryConfig struct { + NumResults int + Strategy query.QueryStrategy +} + +func DefaultQueryConfig() *QueryConfig { + return &QueryConfig{ + NumResults: 20, + Strategy: &query.QueryStrategyConverge{}, + } +} + // QueryClosest starts a query that attempts to find the closest nodes to the target key. // It returns the closest nodes found to the target key and statistics on the actions of the query. // @@ -301,15 +315,20 @@ func (c *Coordinator) GetClosestNodes(ctx context.Context, k kadt.Key, n int) ([ // numResults specifies the minimum number of nodes to successfully contact before considering iteration complete. // The query is considered to be exhausted when it has received responses from at least this number of nodes // and there are no closer nodes remaining to be contacted. A default of 20 is used if this value is less than 1. 
-func (c *Coordinator) QueryClosest(ctx context.Context, target kadt.Key, fn coordt.QueryFunc, numResults int) ([]kadt.PeerID, coordt.QueryStats, error) { +func (c *Coordinator) QueryClosest(ctx context.Context, target kadt.Key, fn coordt.QueryFunc, cfg *QueryConfig) ([]kadt.PeerID, coordt.QueryStats, error) { ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.Query") defer span.End() c.cfg.Logger.Debug("starting query for closest nodes", tele.LogAttrKey(target)) + if cfg == nil { + cfg = DefaultQueryConfig() + } + // TODO: validate config + ctx, cancel := context.WithCancel(ctx) defer cancel() - seedIDs, err := c.GetClosestNodes(ctx, target, 20) + seedIDs, err := c.GetClosestNodes(ctx, target, cfg.NumResults) if err != nil { return nil, coordt.QueryStats{}, err } @@ -318,11 +337,12 @@ func (c *Coordinator) QueryClosest(ctx context.Context, target kadt.Key, fn coor queryID := c.newOperationID() cmd := &EventStartFindCloserQuery{ - QueryID: queryID, - Target: target, - KnownClosestNodes: seedIDs, - Notify: waiter, - NumResults: numResults, + QueryID: queryID, + Target: target, + Seed: seedIDs, + Notify: waiter, + NumResults: cfg.NumResults, + Strategy: cfg.Strategy, } // queue the start of the query @@ -342,7 +362,7 @@ func (c *Coordinator) QueryClosest(ctx context.Context, target kadt.Key, fn coor // numResults specifies the minimum number of nodes to successfully contact before considering iteration complete. // The query is considered to be exhausted when it has received responses from at least this number of nodes // and there are no closer nodes remaining to be contacted. A default of 20 is used if this value is less than 1. -func (c *Coordinator) QueryMessage(ctx context.Context, msg *pb.Message, fn coordt.QueryFunc, numResults int) ([]kadt.PeerID, coordt.QueryStats, error) { +func (c *Coordinator) QueryMessage(ctx context.Context, msg *pb.Message, fn coordt.QueryFunc, cfg *QueryConfig) ([]kadt.PeerID, coordt.QueryStats, error) { ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.QueryMessage") defer span.End() if msg == nil { @@ -353,11 +373,12 @@ func (c *Coordinator) QueryMessage(ctx context.Context, msg *pb.Message, fn coor ctx, cancel := context.WithCancel(ctx) defer cancel() - if numResults < 1 { - numResults = 20 // TODO: parameterize + if cfg == nil { + cfg = DefaultQueryConfig() } + // TODO: validate config - seedIDs, err := c.GetClosestNodes(ctx, msg.Target(), numResults) + seedIDs, err := c.GetClosestNodes(ctx, msg.Target(), cfg.NumResults) if err != nil { return nil, coordt.QueryStats{}, err } @@ -366,12 +387,13 @@ func (c *Coordinator) QueryMessage(ctx context.Context, msg *pb.Message, fn coor queryID := c.newOperationID() cmd := &EventStartMessageQuery{ - QueryID: queryID, - Target: msg.Target(), - Message: msg, - KnownClosestNodes: seedIDs, - Notify: waiter, - NumResults: numResults, + QueryID: queryID, + Target: msg.Target(), + Message: msg, + Seed: seedIDs, + Notify: waiter, + NumResults: cfg.NumResults, + Strategy: cfg.Strategy, } // queue the start of the query @@ -551,6 +573,18 @@ func (c *Coordinator) Bootstrap(ctx context.Context, seeds []kadt.PeerID) error return nil } +// Crawl instructs the dht to begin a full network crawl +func (c *Coordinator) Crawl(ctx context.Context, seeds []kadt.PeerID) error { + ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.Crawl") + defer span.End() + + c.routingBehaviour.Notify(ctx, &EventStartCrawl{ + Seed: seeds, + }) + + return nil +} + // NotifyConnectivity notifies the coordinator that a peer has passed a connectivity check 
// which means it is connected and supports finding closer nodes func (c *Coordinator) NotifyConnectivity(ctx context.Context, id kadt.PeerID) { diff --git a/internal/coord/coordinator_test.go b/internal/coord/coordinator_test.go index 0f79d88..08502d3 100644 --- a/internal/coord/coordinator_test.go +++ b/internal/coord/coordinator_test.go @@ -50,7 +50,7 @@ func TestConfigValidate(t *testing.T) { } func TestExhaustiveQuery(t *testing.T) { - ctx := kadtest.CtxShort(t) + ctx := context.Background() clk := clock.NewMock() _, nodes, err := nettest.LinearTopology(4, clk) @@ -77,7 +77,7 @@ func TestExhaustiveQuery(t *testing.T) { } // Run a query to find the value - _, _, err = c.QueryClosest(ctx, target, qfn, 20) + _, _, err = c.QueryClosest(ctx, target, qfn, DefaultQueryConfig()) require.NoError(t, err) require.Equal(t, 3, len(visited)) @@ -115,7 +115,7 @@ func TestRoutingUpdatedEventEmittedForCloserNodes(t *testing.T) { // Run a query to find the value target := nodes[3].NodeID.Key() - _, _, err = c.QueryClosest(ctx, target, qfn, 20) + _, _, err = c.QueryClosest(ctx, target, qfn, DefaultQueryConfig()) require.NoError(t, err) // the query run by the dht should have received a response from nodes[1] with closer nodes diff --git a/internal/coord/event.go b/internal/coord/event.go index 28f884e..e661247 100644 --- a/internal/coord/event.go +++ b/internal/coord/event.go @@ -91,7 +91,8 @@ type EventStartMessageQuery struct { Message *pb.Message KnownClosestNodes []kadt.PeerID Notify QueryMonitor[*EventQueryFinished] - NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. } func (*EventStartMessageQuery) behaviourEvent() {} @@ -102,7 +103,8 @@ type EventStartFindCloserQuery struct { Target kadt.Key KnownClosestNodes []kadt.PeerID Notify QueryMonitor[*EventQueryFinished] - NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. 
} func (*EventStartFindCloserQuery) behaviourEvent() {} diff --git a/internal/coord/query.go b/internal/coord/query.go index e565820..62eb82c 100644 --- a/internal/coord/query.go +++ b/internal/coord/query.go @@ -263,19 +263,21 @@ func (p *PooledQueryBehaviour) perfomNextInbound(ctx context.Context) (Behaviour switch ev := pev.Event.(type) { case *EventStartFindCloserQuery: cmd = &query.EventPoolAddFindCloserQuery[kadt.Key, kadt.PeerID]{ - QueryID: ev.QueryID, - Target: ev.Target, - Seed: ev.KnownClosestNodes, + QueryID: ev.QueryID, + Target: ev.Target, + Seed: ev.Seed, + Strategy: ev.Strategy, } if ev.Notify != nil { p.notifiers[ev.QueryID] = &queryNotifier[*EventQueryFinished]{monitor: ev.Notify} } case *EventStartMessageQuery: cmd = &query.EventPoolAddQuery[kadt.Key, kadt.PeerID, *pb.Message]{ - QueryID: ev.QueryID, - Target: ev.Target, - Message: ev.Message, - Seed: ev.KnownClosestNodes, + QueryID: ev.QueryID, + Target: ev.Target, + Message: ev.Message, + Seed: ev.Seed, + Strategy: ev.Strategy, } if ev.Notify != nil { p.notifiers[ev.QueryID] = &queryNotifier[*EventQueryFinished]{monitor: ev.Notify} diff --git a/internal/coord/query/iter.go b/internal/coord/query/iter.go index 84403de..4ef150c 100644 --- a/internal/coord/query/iter.go +++ b/internal/coord/query/iter.go @@ -97,3 +97,52 @@ func (iter *SequentialIter[K, N]) Each(ctx context.Context, fn func(context.Cont } return false } + +// StaticIter iterates through the give nodes but does not add any new nodes +// to the list. It implements the [NodeIter] interface but does not allow any +// new nodes to be added to the list. It is static in that sense. +type StaticIter[K kad.Key[K], N kad.NodeID[K]] struct { + // nodes holds the static set of nodes we want to iterate through + nodes []*NodeStatus[K, N] +} + +// NewStaticIter takes the sole list of nodes that this iterator will loop over. +func NewStaticIter[K kad.Key[K], N kad.NodeID[K]](nodes []N) *StaticIter[K, N] { + ns := make([]*NodeStatus[K, N], len(nodes)) + for i, node := range nodes { + ns[i] = &NodeStatus[K, N]{ + NodeID: node, + State: &StateNodeNotContacted{}, + } + } + + return &StaticIter[K, N]{ + nodes: ns, + } +} + +func (iter *StaticIter[K, N]) Add(ni *NodeStatus[K, N]) { + // no-op +} + +// Find returns the node information corresponding to the given Kademlia key. +// It uses a linear search which makes it unsuitable for large numbers of +// entries. 
+func (iter *StaticIter[K, N]) Find(k K) (*NodeStatus[K, N], bool) { + for i := range iter.nodes { + if key.Equal(k, iter.nodes[i].NodeID.Key()) { + return iter.nodes[i], true + } + } + + return nil, false +} + +func (iter *StaticIter[K, N]) Each(ctx context.Context, fn func(context.Context, *NodeStatus[K, N]) bool) bool { + for _, ns := range iter.nodes { + if fn(ctx, ns) { + return true + } + } + return false +} diff --git a/internal/coord/query/pool.go b/internal/coord/query/pool.go index d6c58d4..9f891cf 100644 --- a/internal/coord/query/pool.go +++ b/internal/coord/query/pool.go @@ -122,9 +122,9 @@ func (p *Pool[K, N, M]) Advance(ctx context.Context, ev PoolEvent) PoolState { switch tev := ev.(type) { case *EventPoolAddFindCloserQuery[K, N]: - p.addFindCloserQuery(ctx, tev.QueryID, tev.Target, tev.Seed, tev.NumResults) + p.addFindCloserQuery(ctx, tev) case *EventPoolAddQuery[K, N, M]: - p.addQuery(ctx, tev.QueryID, tev.Target, tev.Message, tev.Seed, tev.NumResults) + p.addQuery(ctx, tev) // TODO: return error as state case *EventPoolStopQuery: if qry, ok := p.queryIndex[tev.QueryID]; ok { @@ -258,55 +258,73 @@ func (p *Pool[K, N, M]) removeQuery(queryID coordt.QueryID) { // addQuery adds a query to the pool, returning the new query id // TODO: remove target argument and use msg.Target -func (p *Pool[K, N, M]) addQuery(ctx context.Context, queryID coordt.QueryID, target K, msg M, knownClosestNodes []N, numResults int) error { - if _, exists := p.queryIndex[queryID]; exists { +func (p *Pool[K, N, M]) addQuery(ctx context.Context, evt *EventPoolAddQuery[K, N, M]) error { + if _, exists := p.queryIndex[evt.QueryID]; exists { return fmt.Errorf("query id already in use") } - iter := NewClosestNodesIter[K, N](target) + + var iter NodeIter[K, N] + switch evt.Strategy.(type) { + case *QueryStrategyConverge: + iter = NewClosestNodesIter[K, N](evt.Target) + case *QueryStrategyStatic: + iter = NewStaticIter[K, N](evt.Seed) + default: + iter = NewClosestNodesIter[K, N](evt.Target) // default if unset + } qryCfg := DefaultQueryConfig() qryCfg.Clock = p.cfg.Clock qryCfg.Concurrency = p.cfg.QueryConcurrency qryCfg.RequestTimeout = p.cfg.RequestTimeout - if numResults > 0 { - qryCfg.NumResults = numResults + if evt.NumResults > 0 { + qryCfg.NumResults = evt.NumResults } - qry, err := NewQuery[K, N, M](p.self, queryID, target, msg, iter, knownClosestNodes, qryCfg) + qry, err := NewQuery[K, N, M](p.self, evt.QueryID, evt.Target, evt.Message, iter, evt.Seed, qryCfg) if err != nil { return fmt.Errorf("new query: %w", err) } p.queries = append(p.queries, qry) - p.queryIndex[queryID] = qry + p.queryIndex[evt.QueryID] = qry return nil } -// addQuery adds a find closer query to the pool, returning the new query id -func (p *Pool[K, N, M]) addFindCloserQuery(ctx context.Context, queryID coordt.QueryID, target K, knownClosestNodes []N, numResults int) error { - if _, exists := p.queryIndex[queryID]; exists { +// addFindCloserQuery adds a find closer query to the pool, returning the new query id +func (p *Pool[K, N, M]) addFindCloserQuery(ctx context.Context, evt *EventPoolAddFindCloserQuery[K, N]) error { + if _, exists := p.queryIndex[evt.QueryID]; exists { return fmt.Errorf("query id already in use") } - iter := NewClosestNodesIter[K, N](target) + + var iter NodeIter[K, N] + switch evt.Strategy.(type) { + case *QueryStrategyConverge: + iter = NewClosestNodesIter[K, N](evt.Target) + case *QueryStrategyStatic: + iter = NewStaticIter[K, N](evt.Seed) + default: + iter = NewClosestNodesIter[K, N](evt.Target) // 
default if unset + } qryCfg := DefaultQueryConfig() qryCfg.Clock = p.cfg.Clock qryCfg.Concurrency = p.cfg.QueryConcurrency qryCfg.RequestTimeout = p.cfg.RequestTimeout - if numResults > 0 { - qryCfg.NumResults = numResults + if evt.NumResults > 0 { + qryCfg.NumResults = evt.NumResults } - qry, err := NewFindCloserQuery[K, N, M](p.self, queryID, target, iter, knownClosestNodes, qryCfg) + qry, err := NewFindCloserQuery[K, N, M](p.self, evt.QueryID, evt.Target, iter, evt.Seed, qryCfg) if err != nil { return fmt.Errorf("new query: %w", err) } p.queries = append(p.queries, qry) - p.queryIndex[queryID] = qry + p.queryIndex[evt.QueryID] = qry return nil } @@ -377,6 +395,7 @@ type EventPoolAddFindCloserQuery[K kad.Key[K], N kad.NodeID[K]] struct { Target K // the target key for the query Seed []N // an initial set of close nodes the query should use NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy QueryStrategy // the way the query should be performed - [QueryStrategyConverge] will be used by default. } // EventPoolAddQuery is an event that attempts to add a new query that sends a message. @@ -386,6 +405,7 @@ type EventPoolAddQuery[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { Message M // message to be sent to each node Seed []N // an initial set of close nodes the query should use NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy QueryStrategy // the way the query should be performed - [QueryStrategyConverge] will be used by default. } // EventPoolStopQuery notifies a [Pool] to stop a query. diff --git a/internal/coord/query/strategy.go b/internal/coord/query/strategy.go new file mode 100644 index 0000000..d3b1312 --- /dev/null +++ b/internal/coord/query/strategy.go @@ -0,0 +1,13 @@ +package query + +type QueryStrategy interface { + queryStrategy() +} + +type QueryStrategyConverge struct{} + +func (q *QueryStrategyConverge) queryStrategy() {} + +type QueryStrategyStatic struct{} + +func (q *QueryStrategyStatic) queryStrategy() {} diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index 9741394..e7074d5 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -64,7 +64,7 @@ func (cfg *CrawlConfig) Validate() error { func DefaultCrawlConfig() *CrawlConfig { return &CrawlConfig{ MaxCPL: 16, - Concurrency: 1, + Concurrency: 200, Tracer: tele.NoopTracer(), } } diff --git a/query_test.go b/query_test.go index 053e0d7..f69f8d8 100644 --- a/query_test.go +++ b/query_test.go @@ -25,7 +25,7 @@ func TestRTAdditionOnSuccessfulQuery(t *testing.T) { // d1 does not know about d3 require.False(t, d1.kad.IsRoutable(ctx, kadt.PeerID(d3.host.ID()))) - // // but when d3 queries d2, d1 and d3 discover each other + // but when d3 queries d2, d1 and d3 discover each other _, _ = d3.FindPeer(ctx, "something") // ignore the error diff --git a/routing.go b/routing.go index f7ee5c7..485c99a 100644 --- a/routing.go +++ b/routing.go @@ -15,6 +15,7 @@ import ( "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/peerstore" "github.com/libp2p/go-libp2p/core/routing" + "github.com/plprobelab/zikade/internal/coord" "go.opentelemetry.io/otel/attribute" otel "go.opentelemetry.io/otel/trace" "golang.org/x/exp/slog" @@ -52,7 +53,10 @@ func (d *DHT) FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error) { return nil } - _, _, err := d.kad.QueryClosest(ctx, kadt.PeerID(id).Key(), fn, 20) + qcfg := 
coord.DefaultQueryConfig() + qcfg.NumResults = d.cfg.BucketSize + + _, _, err := d.kad.QueryClosest(ctx, kadt.PeerID(id).Key(), fn, qcfg) if err != nil { return peer.AddrInfo{}, fmt.Errorf("failed to run query: %w", err) } @@ -203,7 +207,10 @@ func (d *DHT) findProvidersAsyncRoutine(ctx context.Context, c cid.Cid, count in return nil } - _, _, err = d.kad.QueryMessage(ctx, msg, fn, d.cfg.BucketSize) + qcfg := coord.DefaultQueryConfig() + qcfg.NumResults = d.cfg.BucketSize + + _, _, err = d.kad.QueryMessage(ctx, msg, fn, qcfg) if err != nil { span.RecordError(err) d.log.Warn("Failed querying", slog.String("cid", c.String()), slog.String("err", err.Error())) @@ -436,7 +443,10 @@ func (d *DHT) searchValueRoutine(ctx context.Context, backend Backend, ns string return nil } - _, _, err := d.kad.QueryMessage(ctx, req, fn, d.cfg.BucketSize) + qcfg := coord.DefaultQueryConfig() + qcfg.NumResults = d.cfg.BucketSize + + _, _, err := d.kad.QueryMessage(ctx, req, fn, qcfg) if err != nil { d.warnErr(err, "Search value query failed") return From 490e947f77ee02f0c0246dc5d54ba322236fa168 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 13 Oct 2023 14:38:16 +0200 Subject: [PATCH 11/23] WIP --- config.go | 4 + fullrt.go | 358 +++++++++++++++++++++++++++++++++++++++++++++++------ routing.go | 39 ++++-- 3 files changed, 354 insertions(+), 47 deletions(-) diff --git a/config.go b/config.go index c1bb6c1..b95ec64 100644 --- a/config.go +++ b/config.go @@ -394,6 +394,10 @@ type QueryConfig struct { // we have observed in the network during the SearchValue/GetValue // operation. A DefaultQuorum of 0 means that we search the network until // we have exhausted the keyspace. + // + // If you are using the FullRT implementation and keep this value at 0, then + // the DefaultQuorum will be overwritten to be: + // [Config.BucketSize] * [FullRTConfig.QuorumFrac]. 
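+	//
+	// As a purely illustrative calculation (the concrete defaults are not
+	// defined here): with BucketSize = 20 and an assumed QuorumFrac = 0.25,
+	// DefaultQuorum would be overwritten to 20 * 0.25 = 5 peers.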
DefaultQuorum int } diff --git a/fullrt.go b/fullrt.go index 230771c..8c51642 100644 --- a/fullrt.go +++ b/fullrt.go @@ -1,26 +1,28 @@ package zikade import ( + "bytes" "context" + "errors" "fmt" "time" - "github.com/multiformats/go-multiaddr" - - "github.com/plprobelab/zikade/internal/coord" - "github.com/plprobelab/zikade/internal/coord/query" - "github.com/ipfs/go-cid" + ds "github.com/ipfs/go-datastore" record "github.com/libp2p/go-libp2p-record" + recpb "github.com/libp2p/go-libp2p-record/pb" "github.com/libp2p/go-libp2p/core/host" - "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/peerstore" "github.com/libp2p/go-libp2p/core/routing" + "github.com/multiformats/go-multiaddr" "go.opentelemetry.io/otel/attribute" otel "go.opentelemetry.io/otel/trace" + "golang.org/x/exp/slog" + "github.com/plprobelab/zikade/internal/coord" "github.com/plprobelab/zikade/internal/coord/coordt" + "github.com/plprobelab/zikade/internal/coord/query" "github.com/plprobelab/zikade/kadt" "github.com/plprobelab/zikade/pb" ) @@ -51,6 +53,10 @@ func NewFullRT(h host.Host, cfg *FullRTConfig) (*FullRT, error) { return nil, fmt.Errorf("new DHT: %w", err) } + if cfg.Query.DefaultQuorum == 0 { + cfg.Query.DefaultQuorum = int(float64(cfg.BucketSize) * cfg.QuorumFrac) + } + frt := &FullRT{ DHT: d, cfg: cfg, @@ -62,24 +68,19 @@ func NewFullRT(h host.Host, cfg *FullRTConfig) (*FullRT, error) { var _ routing.Routing = (*FullRT)(nil) func (f *FullRT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, error) { - ctx, span := f.tele.Tracer.Start(ctx, "DHT.FindPeer") + ctx, span := f.tele.Tracer.Start(ctx, "FullRT.FindPeer") defer span.End() // First check locally. If we are or were recently connected to the peer, // return the addresses from our peerstore unless the information doesn't // contain any. 
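+	// The explicit connectedness switch below is replaced by the shared
+	// getRecentAddrInfo helper, which DHT.FindPeer uses as well.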
- switch f.host.Network().Connectedness(pid) { - case network.Connected, network.CanConnect: - addrInfo := f.host.Peerstore().PeerInfo(pid) - if addrInfo.ID != "" && len(addrInfo.Addrs) > 0 { - return addrInfo, nil - } - default: - // we're not connected or were recently connected + addrInfo, err := f.getRecentAddrInfo(pid) + if err == nil { + return addrInfo, nil } maddrsMap := make(map[multiaddr.Multiaddr]struct{}) - quorum := int(float64(20) * f.cfg.QuorumFrac) // TODO: does not need to be 20 (can be less if routing table isn't full) - generally parameterize + quorum := f.cfg.Query.DefaultQuorum fn := func(ctx context.Context, visited kadt.PeerID, msg *pb.Message, stats coordt.QueryStats) error { for _, addrInfo := range msg.CloserPeersAddrInfos() { if addrInfo.ID != pid { @@ -99,20 +100,24 @@ func (f *FullRT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, erro return nil } - _, _, err := f.kad.QueryClosest(ctx, kadt.PeerID(pid).Key(), fn, f.queryConfig()) + // start the query with a static set of peers (see queryConfig) + _, _, err = f.kad.QueryClosest(ctx, kadt.PeerID(pid).Key(), fn, f.queryConfig()) if err != nil { return peer.AddrInfo{}, fmt.Errorf("failed to run query: %w", err) } + // if we haven't found any addresses, return a not found error if len(maddrsMap) == 0 { return peer.AddrInfo{}, routing.ErrNotFound } + // transform map into slice maddrs := make([]multiaddr.Multiaddr, 0, len(maddrsMap)) for maddr := range maddrsMap { maddrs = append(maddrs, maddr) } + // connect to peer (this also happens in the non-fullrt case) connCtx, cancel := context.WithTimeout(ctx, 5*time.Second) // TODO: put timeout in config defer cancel() _ = f.host.Connect(connCtx, peer.AddrInfo{ @@ -120,21 +125,12 @@ func (f *FullRT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, erro Addrs: maddrs, }) - switch f.host.Network().Connectedness(pid) { - case network.Connected, network.CanConnect: - addrInfo := f.host.Peerstore().PeerInfo(pid) - if addrInfo.ID != "" && len(addrInfo.Addrs) > 0 { - return addrInfo, nil - } - default: - // we're not connected or were recently connected - } - - return peer.AddrInfo{}, routing.ErrNotFound + // return addresses + return f.getRecentAddrInfo(pid) } func (f *FullRT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { - ctx, span := f.tele.Tracer.Start(ctx, "DHT.Provide", otel.WithAttributes(attribute.String("cid", c.String()))) + ctx, span := f.tele.Tracer.Start(ctx, "FullRT.Provide", otel.WithAttributes(attribute.String("cid", c.String()))) defer span.End() // verify if this DHT supports provider records by checking if a "providers" @@ -179,17 +175,119 @@ func (f *FullRT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { return fmt.Errorf("get closest nodes from routing table: %w", err) } - // finally, find the closest peers to the target key. 
+ // finally, store the record with the currently known closest peers return f.kad.BroadcastStatic(ctx, msg, seed) } +func (f *FullRT) FindProvidersAsync(ctx context.Context, c cid.Cid, count int) <-chan peer.AddrInfo { + peerOut := make(chan peer.AddrInfo) + go f.findProvidersAsyncRoutine(ctx, c, count, peerOut) + return peerOut +} + +func (f *FullRT) findProvidersAsyncRoutine(ctx context.Context, c cid.Cid, count int, out chan<- peer.AddrInfo) { + _, span := f.tele.Tracer.Start(ctx, "DHT.findProvidersAsyncRoutine", otel.WithAttributes(attribute.String("cid", c.String()), attribute.Int("count", count))) + defer span.End() + + defer close(out) + + // verify if this DHT supports provider records by checking + // if a "providers" backend is registered. + b, found := f.backends[namespaceProviders] + if !found || !c.Defined() { + span.RecordError(fmt.Errorf("no providers backend registered or CID undefined")) + return + } + + // send all providers onto the out channel until the desired count + // was reached. If no count was specified, continue with network lookup. + providers := map[peer.ID]struct{}{} + + // first fetch the record locally + stored, err := b.Fetch(ctx, string(c.Hash())) + if err != nil { + if !errors.Is(err, ds.ErrNotFound) { + span.RecordError(err) + f.log.Warn("Fetching value from provider store", slog.String("cid", c.String()), slog.String("err", err.Error())) + return + } + + stored = &providerSet{} + } + + ps, ok := stored.(*providerSet) + if !ok { + span.RecordError(err) + f.log.Warn("Stored value is not a provider set", slog.String("cid", c.String()), slog.String("type", fmt.Sprintf("%T", stored))) + return + } + + for _, provider := range ps.providers { + providers[provider.ID] = struct{}{} + + select { + case <-ctx.Done(): + return + case out <- provider: + } + + if count != 0 && len(providers) == count { + return + } + } + + // Craft message to send to other peers + msg := &pb.Message{ + Type: pb.Message_GET_PROVIDERS, + Key: c.Hash(), + } + + // handle node response + fn := func(ctx context.Context, id kadt.PeerID, resp *pb.Message, stats coordt.QueryStats) error { + // loop through all providers that the remote peer returned + for _, provider := range resp.ProviderAddrInfos() { + + // if we had already sent that peer on the channel -> do nothing + if _, found := providers[provider.ID]; found { + continue + } + + // keep track that we will have sent this peer on the channel + providers[provider.ID] = struct{}{} + + // actually send the provider information to the user + select { + case <-ctx.Done(): + return coordt.ErrSkipRemaining + case out <- provider: + } + + // if count is 0, we will wait until the query has exhausted the keyspace + // if count isn't 0, we will stop if the number of providers we have sent + // equals the number that the user has requested. + if count != 0 && len(providers) == count { + return coordt.ErrSkipRemaining + } + } + + return nil + } + + _, _, err = f.kad.QueryMessage(ctx, msg, fn, f.queryConfig()) + if err != nil { + span.RecordError(err) + f.log.Warn("Failed querying", slog.String("cid", c.String()), slog.String("err", err.Error())) + return + } +} + // PutValue satisfies the [routing.Routing] interface and will add the given // value to the k-closest nodes to keyStr. The parameter keyStr should have the // format `/$namespace/$binary_id`. Namespace examples are `pk` or `ipns`. To // identify the closest peers to keyStr, that complete string will be SHA256 // hashed. 
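+//
+// Note that this FullRT variant does not run an iterative lookup first: the
+// k closest peers are taken straight from the locally maintained routing
+// table via GetClosestNodes and the record is then stored with them through
+// BroadcastStatic (see the body below).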
func (f *FullRT) PutValue(ctx context.Context, keyStr string, value []byte, opts ...routing.Option) error { - ctx, span := f.tele.Tracer.Start(ctx, "DHT.PutValue") + ctx, span := f.tele.Tracer.Start(ctx, "FullRT.PutValue") defer span.End() // first parse the routing options @@ -216,17 +314,17 @@ func (f *FullRT) PutValue(ctx context.Context, keyStr string, value []byte, opts Record: record.MakePutRecord(keyStr, value), } - // finally, find the closest peers to the target key. - err := f.kad.BroadcastRecord(ctx, msg) + seed, err := f.kad.GetClosestNodes(ctx, msg.Target(), f.cfg.BucketSize) if err != nil { - return fmt.Errorf("query error: %w", err) + return fmt.Errorf("get closest nodes from routing table: %w", err) } - return nil + // finally, store the record with the currently known closest peers + return f.kad.BroadcastStatic(ctx, msg, seed) } func (f *FullRT) Bootstrap(ctx context.Context) error { - ctx, span := f.tele.Tracer.Start(ctx, "DHT.Bootstrap") + ctx, span := f.tele.Tracer.Start(ctx, "FullRT.Bootstrap") defer span.End() f.log.Info("Starting crawl bootstrap") @@ -248,3 +346,191 @@ func (f *FullRT) queryConfig() *coord.QueryConfig { cfg.Strategy = &query.QueryStrategyStatic{} return cfg } + +func (f *FullRT) GetValue(ctx context.Context, key string, opts ...routing.Option) ([]byte, error) { + ctx, span := f.tele.Tracer.Start(ctx, "FullRT.GetValue") + defer span.End() + + // start searching for value + valueChan, err := f.SearchValue(ctx, key, opts...) + if err != nil { + return nil, err + } + + // valueChan will always emit "better" values than previous ones + // therefore, store the latest best value until the channel was closed + var best []byte + for val := range valueChan { + best = val + } + + // if the channel was closed because the context was cancelled, return + // the best known value and the context error. + if ctx.Err() != nil { + return best, ctx.Err() + } + + // if the query terminated without having found a value, + // return a not found error + if best == nil { + return nil, routing.ErrNotFound + } + + return best, nil +} + +// SearchValue will search in the DHT for keyStr. 
keyStr must have the form +// `/$namespace/$binary_id` +func (f *FullRT) SearchValue(ctx context.Context, keyStr string, options ...routing.Option) (<-chan []byte, error) { + _, span := f.tele.Tracer.Start(ctx, "FullRT.SearchValue") + defer span.End() + + // first parse the routing options + rOpt := &routing.Options{} // routing config + if err := rOpt.Apply(options...); err != nil { + return nil, fmt.Errorf("apply routing options: %w", err) + } + + ns, path, err := record.SplitKey(keyStr) + if err != nil { + return nil, fmt.Errorf("splitting key: %w", err) + } + + b, found := f.backends[ns] + if !found { + return nil, routing.ErrNotSupported + } + + val, err := b.Fetch(ctx, path) + if err != nil { + if !errors.Is(err, ds.ErrNotFound) { + return nil, fmt.Errorf("fetch from backend: %w", err) + } + + if rOpt.Offline { + return nil, routing.ErrNotFound + } + + out := make(chan []byte) + go f.searchValueRoutine(ctx, b, ns, path, rOpt, out) + return out, nil + } + + rec, ok := val.(*recpb.Record) + if !ok { + return nil, fmt.Errorf("expected *recpb.Record from backend, got: %T", val) + } + + if rOpt.Offline { + out := make(chan []byte, 1) + defer close(out) + out <- rec.GetValue() + return out, nil + } + + out := make(chan []byte) + go func() { + out <- rec.GetValue() + f.searchValueRoutine(ctx, b, ns, path, rOpt, out) + }() + + return out, nil +} + +func (f *FullRT) searchValueRoutine(ctx context.Context, backend Backend, ns string, path string, ropt *routing.Options, out chan<- []byte) { + _, span := f.tele.Tracer.Start(ctx, "DHT.searchValueRoutine") + defer span.End() + defer close(out) + + routingKey := []byte(newRoutingKey(ns, path)) + + req := &pb.Message{ + Type: pb.Message_GET_VALUE, + Key: routingKey, + } + + // The currently known best value for /$ns/$path + var best []byte + + // Peers that we identified to hold stale records + var fixupPeers []kadt.PeerID + + // The peers that returned the best value + quorumPeers := map[kadt.PeerID]struct{}{} + + // The quorum that we require for terminating the query. This number tells + // us how many peers must have responded with the "best" value before we + // cancel the query. + quorum := f.getQuorum(ropt) + + fn := func(ctx context.Context, id kadt.PeerID, resp *pb.Message, stats coordt.QueryStats) error { + rec := resp.GetRecord() + if rec == nil { + return nil + } + + if !bytes.Equal(routingKey, rec.GetKey()) { + return nil + } + + idx, _ := backend.Validate(ctx, path, best, rec.GetValue()) + switch idx { + case 0: // "best" is still the best value + if bytes.Equal(best, rec.GetValue()) { + quorumPeers[id] = struct{}{} + } + + case 1: // rec.GetValue() is better than our current "best" + + // We have identified a better record. 
All peers that were currently + // in our set of quorum peers need to be updated wit this new record + for p := range quorumPeers { + fixupPeers = append(fixupPeers, p) + } + + // re-initialize the quorum peers set for this new record + quorumPeers = map[kadt.PeerID]struct{}{} + quorumPeers[id] = struct{}{} + + // submit the new value to the user + best = rec.GetValue() + out <- best + case -1: // "best" and rec.GetValue() are both invalid + return nil + + default: + f.log.Warn("unexpected validate index", slog.Int("idx", idx)) + } + + // Check if we have reached the quorum + if len(quorumPeers) == quorum { + return coordt.ErrSkipRemaining + } + + return nil + } + + _, _, err := f.kad.QueryMessage(ctx, req, fn, f.queryConfig()) + if err != nil { + f.warnErr(err, "Search value query failed") + return + } + + // check if we have peers that we found to hold stale records. If so, + // update them asynchronously. + if len(fixupPeers) == 0 { + return + } + + go func() { + msg := &pb.Message{ + Type: pb.Message_PUT_VALUE, + Key: routingKey, + Record: record.MakePutRecord(string(routingKey), best), + } + + if err := f.kad.BroadcastStatic(ctx, msg, fixupPeers); err != nil { + f.log.Warn("Failed updating peer") + } + }() +} diff --git a/routing.go b/routing.go index 485c99a..251b686 100644 --- a/routing.go +++ b/routing.go @@ -27,26 +27,21 @@ import ( var _ routing.Routing = (*DHT)(nil) -func (d *DHT) FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error) { +func (d *DHT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, error) { ctx, span := d.tele.Tracer.Start(ctx, "DHT.FindPeer") defer span.End() // First check locally. If we are or were recently connected to the peer, // return the addresses from our peerstore unless the information doesn't // contain any. - switch d.host.Network().Connectedness(id) { - case network.Connected, network.CanConnect: - addrInfo := d.host.Peerstore().PeerInfo(id) - if addrInfo.ID != "" && len(addrInfo.Addrs) > 0 { - return addrInfo, nil - } - default: - // we're not connected or were recently connected + addrInfo, err := d.getRecentAddrInfo(pid) + if err == nil { + return addrInfo, nil } var foundPeer peer.ID fn := func(ctx context.Context, visited kadt.PeerID, msg *pb.Message, stats coordt.QueryStats) error { - if peer.ID(visited) == id { + if peer.ID(visited) == pid { foundPeer = peer.ID(visited) return coordt.ErrSkipRemaining } @@ -56,7 +51,7 @@ func (d *DHT) FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error) { qcfg := coord.DefaultQueryConfig() qcfg.NumResults = d.cfg.BucketSize - _, _, err := d.kad.QueryClosest(ctx, kadt.PeerID(id).Key(), fn, qcfg) + _, _, err = d.kad.QueryClosest(ctx, kadt.PeerID(pid).Key(), fn, qcfg) if err != nil { return peer.AddrInfo{}, fmt.Errorf("failed to run query: %w", err) } @@ -68,6 +63,21 @@ func (d *DHT) FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error) { return d.host.Peerstore().PeerInfo(foundPeer), nil } +// getRecentAddrInfo returns the peer's address information if we are or were +// recently connected to it. 
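+// If we are neither connected nor recently connected, or the peerstore holds
+// no addresses for the peer, it returns an empty AddrInfo together with
+// routing.ErrNotFound so that callers fall back to a network query.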
+func (d *DHT) getRecentAddrInfo(pid peer.ID) (peer.AddrInfo, error) { + switch d.host.Network().Connectedness(pid) { + case network.Connected, network.CanConnect: + addrInfo := d.host.Peerstore().PeerInfo(pid) + if addrInfo.ID != "" && len(addrInfo.Addrs) > 0 { + return addrInfo, nil + } + default: + // we're not connected or were recently connected + } + return peer.AddrInfo{}, routing.ErrNotFound +} + func (d *DHT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { ctx, span := d.tele.Tracer.Start(ctx, "DHT.Provide", otel.WithAttributes(attribute.String("cid", c.String()))) defer span.End() @@ -291,20 +301,27 @@ func (d *DHT) GetValue(ctx context.Context, key string, opts ...routing.Option) ctx, span := d.tele.Tracer.Start(ctx, "DHT.GetValue") defer span.End() + // start searching for value valueChan, err := d.SearchValue(ctx, key, opts...) if err != nil { return nil, err } + // valueChan will always emit "better" values than previous ones + // therefore, store the latest best value until the channel was closed var best []byte for val := range valueChan { best = val } + // if the channel was closed because the context was cancelled, return + // the best known value and the context error. if ctx.Err() != nil { return best, ctx.Err() } + // if the query terminated without having found a value, + // return a not found error if best == nil { return nil, routing.ErrNotFound } From 69ce4f47d9f6e4121b88c7d310ba423f172aeb77 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 13 Oct 2023 19:02:57 +0200 Subject: [PATCH 12/23] wip --- fullrt.go | 70 ++++++++++ internal/coord/brdcst.go | 6 +- internal/coord/brdcst/brdcst.go | 15 +- internal/coord/brdcst/brdcst_test.go | 1 - internal/coord/brdcst/config.go | 65 ++++++--- internal/coord/brdcst/config_test.go | 17 +-- internal/coord/brdcst/followup.go | 41 ++++-- internal/coord/brdcst/mtm.go | 200 +++++++++++++++++++++++++++ internal/coord/brdcst/mtm_test.go | 87 ++++++++++++ internal/coord/brdcst/otm.go | 146 +++++++++++++++++++ internal/coord/brdcst/pool.go | 36 ++--- internal/coord/brdcst/pool_test.go | 32 ++--- internal/coord/brdcst/static.go | 143 ------------------- internal/coord/brdcst_events.go | 35 ----- internal/coord/coordinator.go | 37 ++--- internal/coord/event.go | 67 ++++++--- internal/coord/network.go | 2 + internal/coord/query.go | 1 + internal/coord/query/pool.go | 2 + internal/coord/query/query.go | 2 + internal/coord/query_test.go | 43 +++--- routing.go | 19 ++- 22 files changed, 722 insertions(+), 345 deletions(-) create mode 100644 internal/coord/brdcst/mtm.go create mode 100644 internal/coord/brdcst/mtm_test.go create mode 100644 internal/coord/brdcst/otm.go delete mode 100644 internal/coord/brdcst/static.go delete mode 100644 internal/coord/brdcst_events.go diff --git a/fullrt.go b/fullrt.go index 8c51642..04afcaf 100644 --- a/fullrt.go +++ b/fullrt.go @@ -16,6 +16,7 @@ import ( "github.com/libp2p/go-libp2p/core/peerstore" "github.com/libp2p/go-libp2p/core/routing" "github.com/multiformats/go-multiaddr" + mh "github.com/multiformats/go-multihash" "go.opentelemetry.io/otel/attribute" otel "go.opentelemetry.io/otel/trace" "golang.org/x/exp/slog" @@ -534,3 +535,72 @@ func (f *FullRT) searchValueRoutine(ctx context.Context, backend Backend, ns str } }() } + +func (f *FullRT) ProvideMany(ctx context.Context, mhashes []mh.Multihash) error { + _, span := f.tele.Tracer.Start(ctx, "FullRT.ProvideMany") + defer span.End() + + _, found := f.backends[namespaceProviders] + if !found { + return routing.ErrNotSupported + } + 
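+	// The rest of this function prepares a single bulk broadcast: it snapshots
+	// our own addresses once, derives one ADD_PROVIDER message per multihash
+	// through msgFn, and hands the whole key set to BroadcastMany in one call
+	// rather than issuing a separate Provide per key.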
+ // Compute addresses once for all provides + self := peer.AddrInfo{ + ID: f.host.ID(), + Addrs: f.host.Addrs(), + } + if len(self.Addrs) < 1 { + return fmt.Errorf("no known addresses for self, cannot put provider") + } + + msgFn := func(k kadt.Key) *pb.Message { + return &pb.Message{ + Type: pb.Message_ADD_PROVIDER, + Key: k.MsgKey(), + ProviderPeers: []*pb.Message_Peer{ + pb.FromAddrInfo(self), + }, + } + } + + keys := make([]kadt.Key, 0, len(mhashes)) + for _, mhash := range mhashes { + keys = append(keys, kadt.NewKey(mhash)) + } + + // TODO: get seed set of peers + return f.kad.BroadcastMany(ctx, keys, nil, msgFn) +} + +//func (f *FullRT) PutMany(ctx context.Context, keys []string, values [][]byte) error { +// _, span := f.tele.Tracer.Start(ctx, "FullRT.PutMany") +// defer span.End() +// +// +// if !dht.enableValues { +// return routing.ErrNotSupported +// } +// +// if len(keys) != len(values) { +// return fmt.Errorf("number of keys does not match the number of values") +// } +// +// keysAsPeerIDs := make([]peer.ID, 0, len(keys)) +// keyRecMap := make(map[string][]byte) +// for i, k := range keys { +// keysAsPeerIDs = append(keysAsPeerIDs, peer.ID(k)) +// keyRecMap[k] = values[i] +// } +// +// if len(keys) != len(keyRecMap) { +// return fmt.Errorf("does not support duplicate keys") +// } +// +// fn := func(ctx context.Context, p, k peer.ID) error { +// keyStr := string(k) +// return dht.protoMessenger.PutValue(ctx, p, record.MakePutRecord(keyStr, keyRecMap[keyStr])) +// } +// +// return dht.bulkMessageSend(ctx, keysAsPeerIDs, fn, false) +//} diff --git a/internal/coord/brdcst.go b/internal/coord/brdcst.go index 331ea41..6aeef74 100644 --- a/internal/coord/brdcst.go +++ b/internal/coord/brdcst.go @@ -165,8 +165,7 @@ func (b *PooledBroadcastBehaviour) perfomNextInbound(ctx context.Context) (Behav case *EventStartBroadcast: cmd = &brdcst.EventPoolStartBroadcast[kadt.Key, kadt.PeerID, *pb.Message]{ QueryID: ev.QueryID, - Target: ev.Target, - Message: ev.Message, + MsgFunc: ev.MsgFunc, Seed: ev.Seed, Config: ev.Config, } @@ -227,6 +226,7 @@ func (b *PooledBroadcastBehaviour) perfomNextInbound(ctx context.Context) (Behav cmd = &brdcst.EventPoolStoreRecordSuccess[kadt.Key, kadt.PeerID, *pb.Message]{ QueryID: ev.QueryID, NodeID: ev.To, + Target: ev.Target, Request: ev.Request, Response: ev.Response, } @@ -242,6 +242,7 @@ func (b *PooledBroadcastBehaviour) perfomNextInbound(ctx context.Context) (Behav NodeID: ev.To, QueryID: ev.QueryID, Request: ev.Request, + Target: ev.Target, Error: ev.Err, } @@ -277,6 +278,7 @@ func (b *PooledBroadcastBehaviour) advancePool(ctx context.Context, ev brdcst.Po return &EventOutboundSendMessage{ QueryID: st.QueryID, To: st.NodeID, + Target: st.Target, Message: st.Message, Notify: b, }, true diff --git a/internal/coord/brdcst/brdcst.go b/internal/coord/brdcst/brdcst.go index d755a3c..bcfc80b 100644 --- a/internal/coord/brdcst/brdcst.go +++ b/internal/coord/brdcst/brdcst.go @@ -30,7 +30,8 @@ type StateBroadcastFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { type StateBroadcastStoreRecord[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the broadcast operation that wants to send the message NodeID N // the node to send the message to - Message M // the message the broadcast behaviour wants to send + Target K + Message M // the message the broadcast behaviour wants to send } // StateBroadcastWaiting indicates that a [Broadcast] state machine is waiting @@ -84,13 +85,6 @@ type BroadcastEvent interface { // it can 
perform housekeeping work such as time out queries. type EventBroadcastPoll struct{} -// EventBroadcastStart is an event that instructs a broadcast state machine to -// start the operation. -type EventBroadcastStart[K kad.Key[K], N kad.NodeID[K]] struct { - Target K // the key we want to store the record for - Seed []N // the closest nodes we know so far and from where we start the operation -} - // EventBroadcastStop notifies a [Broadcast] state machine to stop the // operation. This comprises all in-flight queries. type EventBroadcastStop struct{} @@ -119,6 +113,7 @@ type EventBroadcastNodeFailure[K kad.Key[K], N kad.NodeID[K]] struct { // receive a response. type EventBroadcastStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { NodeID N // the node the message was sent to + Target K Request M // the message that was sent to the remote node Response M // the reply we got from the remote node (nil in many cases of the Amino DHT) } @@ -127,7 +122,8 @@ type EventBroadcastStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Me // machine that storing a record with a remote node (NodeID) has failed. The // message that was sent is held in Request, and the error will be in Error. type EventBroadcastStoreRecordFailure[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { - NodeID N // the node the message was sent to + NodeID N // the node the message was sent to + Target K Request M // the message that was sent to the remote node Error error // the error that caused the failure, if any } @@ -136,7 +132,6 @@ type EventBroadcastStoreRecordFailure[K kad.Key[K], N kad.NodeID[K], M coordt.Me // machine can be assigned to the [BroadcastEvent] interface. func (*EventBroadcastStop) broadcastEvent() {} func (*EventBroadcastPoll) broadcastEvent() {} -func (*EventBroadcastStart[K, N]) broadcastEvent() {} func (*EventBroadcastNodeResponse[K, N]) broadcastEvent() {} func (*EventBroadcastNodeFailure[K, N]) broadcastEvent() {} func (*EventBroadcastStoreRecordSuccess[K, N, M]) broadcastEvent() {} diff --git a/internal/coord/brdcst/brdcst_test.go b/internal/coord/brdcst/brdcst_test.go index 9284b03..3c5d92a 100644 --- a/internal/coord/brdcst/brdcst_test.go +++ b/internal/coord/brdcst/brdcst_test.go @@ -23,7 +23,6 @@ func TestBroadcastEvent_interface_conformance(t *testing.T) { events := []BroadcastEvent{ &EventBroadcastStop{}, &EventBroadcastPoll{}, - &EventBroadcastStart[tiny.Key, tiny.Node]{}, &EventBroadcastNodeResponse[tiny.Key, tiny.Node]{}, &EventBroadcastNodeFailure[tiny.Key, tiny.Node]{}, &EventBroadcastStoreRecordSuccess[tiny.Key, tiny.Node, tiny.Message]{}, diff --git a/internal/coord/brdcst/config.go b/internal/coord/brdcst/config.go index 90576f5..fd709c9 100644 --- a/internal/coord/brdcst/config.go +++ b/internal/coord/brdcst/config.go @@ -3,6 +3,8 @@ package brdcst import ( "fmt" + "github.com/plprobelab/go-libdht/kad" + "github.com/plprobelab/zikade/internal/coord/query" ) @@ -31,7 +33,7 @@ func DefaultConfigPool() *ConfigPool { // Config is an interface that all broadcast configurations must implement. // Because we have multiple ways of broadcasting records to the network, like -// [FollowUp] or [Static], the [EventPoolStartBroadcast] has a configuration +// [FollowUp] or [OneToMany], the [EventPoolStartBroadcast] has a configuration // field that depending on the concrete type of [Config] initializes the // respective state machine. Then the broadcast operation will be performed // based on the encoded rules in that state machine. 
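// A hedged sketch of how a caller selects one of the three strategies: the
// broadcast pool switches on the concrete Config type carried by
// [EventPoolStartBroadcast]. The tiny test types and the query ID below are
// illustrative only.
//
//	targets := []tiny.Key{tiny.Key(0b0100), tiny.Key(0b1010)}
//	msgFunc := func(k tiny.Key) tiny.Message { return tiny.Message{Content: k.String()} }
//
//	state := pool.Advance(ctx, &EventPoolStartBroadcast[tiny.Key, tiny.Node, tiny.Message]{
//		QueryID: coordt.QueryID("example"),
//		MsgFunc: msgFunc,
//		Seed:    seed,                             // nodes we already know
//		Config:  DefaultConfigManyToMany(targets), // or DefaultConfigFollowUp / DefaultConfigOneToMany
//	})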
@@ -39,53 +41,72 @@ type Config interface { broadcastConfig() } -func (c *ConfigFollowUp) broadcastConfig() {} -func (c *ConfigOptimistic) broadcastConfig() {} -func (c *ConfigStatic) broadcastConfig() {} +func (c *ConfigFollowUp[K]) broadcastConfig() {} +func (c *ConfigOneToMany[K]) broadcastConfig() {} +func (c *ConfigManyToMany[K]) broadcastConfig() {} // ConfigFollowUp specifies the configuration for the [FollowUp] state machine. -type ConfigFollowUp struct{} +type ConfigFollowUp[K kad.Key[K]] struct { + Target K +} // Validate checks the configuration options and returns an error if any have // invalid values. -func (c *ConfigFollowUp) Validate() error { +func (c *ConfigFollowUp[K]) Validate() error { return nil } // DefaultConfigFollowUp returns the default configuration options for the // [FollowUp] state machine. -func DefaultConfigFollowUp() *ConfigFollowUp { - return &ConfigFollowUp{} +func DefaultConfigFollowUp[K kad.Key[K]](target K) *ConfigFollowUp[K] { + return &ConfigFollowUp[K]{ + Target: target, + } } -// ConfigOptimistic specifies the configuration for the [Optimistic] state +// ConfigOneToMany specifies the configuration for the [OneToMany] state // machine. -type ConfigOptimistic struct{} +type ConfigOneToMany[K kad.Key[K]] struct { + Target K +} // Validate checks the configuration options and returns an error if any have // invalid values. -func (c *ConfigOptimistic) Validate() error { +func (c *ConfigOneToMany[K]) Validate() error { return nil } -// DefaultConfigOptimistic returns the default configuration options for the -// [Optimistic] state machine. -func DefaultConfigOptimistic() *ConfigOptimistic { - return &ConfigOptimistic{} +// DefaultConfigOneToMany returns the default configuration options for the +// [OneToMany] state machine. +func DefaultConfigOneToMany[K kad.Key[K]](target K) *ConfigOneToMany[K] { + return &ConfigOneToMany[K]{ + Target: target, + } } -// ConfigStatic specifies the configuration for the [Static] state +// ConfigManyToMany specifies the configuration for the [ManyToMany] state // machine. -type ConfigStatic struct{} +type ConfigManyToMany[K kad.Key[K]] struct { + NodeConcurrency int + StreamConcurrency int + Targets []K +} // Validate checks the configuration options and returns an error if any have // invalid values. -func (c *ConfigStatic) Validate() error { +func (c *ConfigManyToMany[K]) Validate() error { + if len(c.Targets) == 0 { + return fmt.Errorf("targets must not be empty") + } return nil } -// DefaultConfigStatic returns the default configuration options for the -// [Static] state machine. -func DefaultConfigStatic() *ConfigStatic { - return &ConfigStatic{} +// DefaultConfigManyToMany returns the default configuration options for the +// [ManyToMany] state machine. 
+func DefaultConfigManyToMany[K kad.Key[K]](targets []K) *ConfigManyToMany[K] { + return &ConfigManyToMany[K]{ + NodeConcurrency: 100, // MAGIC + StreamConcurrency: 10, // MAGIC + Targets: targets, + } } diff --git a/internal/coord/brdcst/config_test.go b/internal/coord/brdcst/config_test.go index 4377952..16654fe 100644 --- a/internal/coord/brdcst/config_test.go +++ b/internal/coord/brdcst/config_test.go @@ -3,6 +3,8 @@ package brdcst import ( "testing" + "github.com/plprobelab/zikade/internal/tiny" + "github.com/stretchr/testify/assert" ) @@ -21,23 +23,16 @@ func TestConfigPool_Validate(t *testing.T) { func TestConfigFollowUp_Validate(t *testing.T) { t.Run("default is valid", func(t *testing.T) { - cfg := DefaultConfigFollowUp() - assert.NoError(t, cfg.Validate()) - }) -} - -func TestConfigOptimistic_Validate(t *testing.T) { - t.Run("default is valid", func(t *testing.T) { - cfg := DefaultConfigOptimistic() + cfg := DefaultConfigFollowUp[tiny.Key](tiny.Key(0)) assert.NoError(t, cfg.Validate()) }) } func TestConfig_interface_conformance(t *testing.T) { configs := []Config{ - &ConfigFollowUp{}, - &ConfigOptimistic{}, - &ConfigStatic{}, + &ConfigFollowUp[tiny.Key]{}, + &ConfigOneToMany[tiny.Key]{}, + &ConfigManyToMany[tiny.Key]{}, } for _, c := range configs { c.broadcastConfig() // drives test coverage diff --git a/internal/coord/brdcst/followup.go b/internal/coord/brdcst/followup.go index a9a47b4..56df355 100644 --- a/internal/coord/brdcst/followup.go +++ b/internal/coord/brdcst/followup.go @@ -22,7 +22,7 @@ type FollowUp[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { queryID coordt.QueryID // a struct holding configuration options - cfg *ConfigFollowUp + cfg *ConfigFollowUp[K] // a reference to the query pool in which the "get closer nodes" queries // will be spawned. This pool is governed by the broadcast [Pool]. @@ -30,8 +30,15 @@ type FollowUp[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { // the logic much easier to implement. pool *query.Pool[K, N, M] - // the message that we will send to the closest nodes in the follow-up phase - msg M + // TODO: ... + started bool + + // the message generator that takes a target key and will return the message + // that we will send to the closest nodes in the follow-up phase + msgFunc func(K) M + + // TODO: + seed []N // the closest nodes to the target key. This will be filled after the query // for the closest nodes has finished (when the query pool emits a @@ -56,12 +63,14 @@ type FollowUp[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { } // NewFollowUp initializes a new [FollowUp] struct. -func NewFollowUp[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.QueryID, pool *query.Pool[K, N, M], msg M, cfg *ConfigFollowUp) *FollowUp[K, N, M] { - return &FollowUp[K, N, M]{ +func NewFollowUp[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.QueryID, msgFunc func(K) M, pool *query.Pool[K, N, M], seed []N, cfg *ConfigFollowUp[K]) *FollowUp[K, N, M] { + f := &FollowUp[K, N, M]{ queryID: qid, cfg: cfg, pool: pool, - msg: msg, + started: false, + msgFunc: msgFunc, + seed: seed, todo: map[string]N{}, waiting: map[string]N{}, success: map[string]N{}, @@ -70,6 +79,8 @@ func NewFollowUp[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.Que Err error }{}, } + + return f } // Advance advances the state of the [FollowUp] [Broadcast] state machine. 
It @@ -118,7 +129,8 @@ func (f *FollowUp[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) (out return &StateBroadcastStoreRecord[K, N, M]{ QueryID: f.queryID, NodeID: n, - Message: f.msg, + Target: f.cfg.Target, + Message: f.msgFunc(f.cfg.Target), } } @@ -149,12 +161,6 @@ func (f *FollowUp[K, N, M]) handleEvent(ctx context.Context, ev BroadcastEvent) }() switch ev := ev.(type) { - case *EventBroadcastStart[K, N]: - return &query.EventPoolAddFindCloserQuery[K, N]{ - QueryID: f.queryID, - Target: ev.Target, - Seed: ev.Seed, - } case *EventBroadcastStop: if f.isQueryDone() { return nil @@ -185,7 +191,14 @@ func (f *FollowUp[K, N, M]) handleEvent(ctx context.Context, ev BroadcastEvent) Err error }{Node: ev.NodeID, Err: ev.Error} case *EventBroadcastPoll: - // ignore, nothing to do + if !f.started { + f.started = true + return &query.EventPoolAddFindCloserQuery[K, N]{ + QueryID: f.queryID, + Target: f.cfg.Target, + Seed: f.seed, + } + } return &query.EventPoolPoll{} default: panic(fmt.Sprintf("unexpected event: %T", ev)) diff --git a/internal/coord/brdcst/mtm.go b/internal/coord/brdcst/mtm.go new file mode 100644 index 0000000..d1e23a1 --- /dev/null +++ b/internal/coord/brdcst/mtm.go @@ -0,0 +1,200 @@ +package brdcst + +import ( + "context" + "fmt" + "time" + + "github.com/plprobelab/go-libdht/kad" + "github.com/plprobelab/go-libdht/kad/key" + "github.com/plprobelab/go-libdht/kad/trie" + "go.opentelemetry.io/otel/trace" + + "github.com/plprobelab/zikade/internal/coord/coordt" + "github.com/plprobelab/zikade/tele" +) + +// ManyToMany is a [Broadcast] state machine and encapsulates the logic around +// doing a put operation to a static set of nodes. That static set of nodes +// is given by the list of seed nodes in the [EventBroadcastStart] event. +type ManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { + // the unique ID for this broadcast operation + queryID coordt.QueryID + + // a struct holding configuration options + cfg *ConfigManyToMany[K] + + // TODO + keyReports map[string]*report + + // TODO + unprocessedNodes map[string]*nodeState[K, N] + + // TODO + inflightWithCapacity map[string]*nodeState[K, N] + + // TODO + inflightAtCapacity map[string]*nodeState[K, N] + + // TODO + processedNodes map[string]*nodeState[K, N] + + // TODO + msgFunc func(K) M +} + +type brdcstManyMapVal[K kad.Key[K], N kad.NodeID[K]] struct { + target K + node N +} + +type nodeState[K kad.Key[K], N kad.NodeID[K]] struct { + node N + todo []K + inflight map[string]K + done []K +} + +type report struct { + successes int + failures int + lastSuccess time.Time +} + +// NewManyToMany initializes a new [ManyToMany] struct. +func NewManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.QueryID, msgFunc func(K) M, seed []N, cfg *ConfigManyToMany[K]) *ManyToMany[K, N, M] { + t := trie.New[K, N]() + for _, s := range seed { + t.Add(s.Key(), s) + } + + // find out which seed nodes are responsible to hold the provider/put + // record for which target key. 
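+	// The loop below inverts that relation: for every target key it takes the
+	// closest seed nodes from the trie and records, per node, the set of target
+	// keys that node should receive. Each such node becomes one nodeState with
+	// a todo list of keys, which Advance later drains subject to
+	// NodeConcurrency and StreamConcurrency.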
+ keyReports := make(map[string]*report, len(cfg.Targets)) + mappings := map[string]map[string]*brdcstManyMapVal[K, N]{} // map from node -> map of target keys -> target key + for _, target := range cfg.Targets { + entries := trie.Closest(t, target, 20) // TODO: make configurable + targetMapKey := key.HexString(target) + keyReports[targetMapKey] = &report{} + for _, entry := range entries { + node := entry.Data + nodeMapKey := node.String() + if _, found := mappings[nodeMapKey]; !found { + mappings[nodeMapKey] = map[string]*brdcstManyMapVal[K, N]{} + } + + mappings[nodeMapKey][targetMapKey] = &brdcstManyMapVal[K, N]{target: target, node: node} + } + } + + unprocessedNodes := make(map[string]*nodeState[K, N], len(mappings)) + for node, mapVals := range mappings { + if len(mapVals) == 0 { + continue + } + + unprocessedNodes[node] = &nodeState[K, N]{ + todo: make([]K, 0, len(mapVals)), + done: make([]K, 0, len(mapVals)), + inflight: map[string]K{}, + } + for _, val := range mapVals { + unprocessedNodes[node].todo = append(unprocessedNodes[node].todo, val.target) + unprocessedNodes[node].node = val.node // actually, this needs to only be done once + } + } + + return &ManyToMany[K, N, M]{ + queryID: qid, + cfg: cfg, + keyReports: keyReports, + unprocessedNodes: unprocessedNodes, + inflightWithCapacity: map[string]*nodeState[K, N]{}, + inflightAtCapacity: map[string]*nodeState[K, N]{}, + processedNodes: map[string]*nodeState[K, N]{}, + msgFunc: msgFunc, + } +} + +// Advance advances the state of the [ManyToMany] [Broadcast] state machine. +func (otm *ManyToMany[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) (out BroadcastState) { + _, span := tele.StartSpan(ctx, "ManyToMany.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) + defer span.End() + + switch ev := ev.(type) { + case *EventBroadcastStop: + case *EventBroadcastStoreRecordSuccess[K, N, M]: + if nstate, found := otm.inflightAtCapacity[ev.NodeID.String()]; found { + delete(nstate.inflight, key.HexString(ev.Target)) + nstate.done = append(nstate.done, ev.Target) + + delete(otm.inflightAtCapacity, ev.NodeID.String()) + if len(nstate.todo) == 0 && len(nstate.inflight) == 0 { + otm.processedNodes[ev.NodeID.String()] = nstate + } else { + otm.inflightWithCapacity[ev.NodeID.String()] = nstate + } + } else if nstate, found := otm.inflightWithCapacity[ev.NodeID.String()]; found { + delete(nstate.inflight, key.HexString(ev.Target)) + nstate.done = append(nstate.done, ev.Target) + + if len(nstate.todo) == 0 && len(nstate.inflight) == 0 { + otm.processedNodes[ev.NodeID.String()] = nstate + } + } + + case *EventBroadcastStoreRecordFailure[K, N, M]: + case *EventBroadcastPoll: + // ignore, nothing to do + default: + panic(fmt.Sprintf("unexpected event: %T", ev)) + } + + if len(otm.inflightWithCapacity)+len(otm.inflightAtCapacity) == otm.cfg.NodeConcurrency { + for node, nstate := range otm.inflightWithCapacity { + var popped K + popped, nstate.todo = nstate.todo[0], nstate.todo[1:] + + nstate.inflight[key.HexString(popped)] = popped + + if len(nstate.todo) == 0 || len(nstate.inflight) == otm.cfg.StreamConcurrency { + delete(otm.inflightWithCapacity, node) + otm.inflightAtCapacity[nstate.node.String()] = nstate + } + + return &StateBroadcastStoreRecord[K, N, M]{ + QueryID: otm.queryID, + NodeID: nstate.node, + Target: popped, + Message: otm.msgFunc(popped), + } + } + + return &StateBroadcastWaiting{ + QueryID: otm.queryID, + } + } + + for nodeStr, nstate := range otm.unprocessedNodes { + delete(otm.unprocessedNodes, nodeStr) + + var 
popped K + popped, nstate.todo = nstate.todo[0], nstate.todo[1:] + nstate.inflight[key.HexString(popped)] = popped + + if len(nstate.todo) == 0 { + otm.inflightAtCapacity[nstate.node.String()] = nstate + } else { + otm.inflightWithCapacity[nstate.node.String()] = nstate + } + + return &StateBroadcastStoreRecord[K, N, M]{ + QueryID: otm.queryID, + NodeID: nstate.node, + Target: popped, + Message: otm.msgFunc(popped), + } + } + + return &StateBroadcastIdle{} +} diff --git a/internal/coord/brdcst/mtm_test.go b/internal/coord/brdcst/mtm_test.go new file mode 100644 index 0000000..e3ab4ab --- /dev/null +++ b/internal/coord/brdcst/mtm_test.go @@ -0,0 +1,87 @@ +package brdcst + +import ( + "context" + "math/big" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/plprobelab/zikade/internal/coord/coordt" + "github.com/plprobelab/zikade/internal/tiny" +) + +func TestNewManyToMany(t *testing.T) { + ctx := context.Background() + + targets := make([]tiny.Key, 0, 100) + seed := make([]tiny.Node, 0, 100) + for i := 0; i < 100; i++ { + seed = append(seed, tiny.NewNode(tiny.Key(i+1))) + targets = append(targets, tiny.Key(i+1)) + } + + msgFunc := func(k tiny.Key) tiny.Message { + return tiny.Message{ + Content: k.String(), + } + } + cfg := DefaultConfigManyToMany(targets) + cfg.NodeConcurrency = 2 + cfg.StreamConcurrency = 5 + + qid := coordt.QueryID("test") + sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, seed, cfg) + require.Len(t, sm.unprocessedNodes, 100) + + pending := map[tiny.Node][]tiny.Message{} + for i := 0; i < cfg.NodeConcurrency*cfg.StreamConcurrency; i++ { + state := sm.Advance(ctx, &EventBroadcastPoll{}) + tstate, ok := state.(*StateBroadcastStoreRecord[tiny.Key, tiny.Node, tiny.Message]) + require.True(t, ok, "type is %T", state) + require.Equal(t, tstate.QueryID, qid) + require.NotNil(t, tstate.NodeID) + require.NotNil(t, tstate.Message) + if _, found := pending[tstate.NodeID]; !found { + pending[tstate.NodeID] = []tiny.Message{} + } + pending[tstate.NodeID] = append(pending[tstate.NodeID], tstate.Message) + } + + require.Len(t, pending, cfg.NodeConcurrency) + require.Len(t, sm.inflightAtCapacity, cfg.NodeConcurrency) + for _, messages := range pending { + require.Len(t, messages, cfg.StreamConcurrency) + } + for _, inflight := range sm.inflightAtCapacity { + require.Len(t, inflight.inflight, cfg.StreamConcurrency) + } + + state := sm.Advance(ctx, &EventBroadcastPoll{}) + wstate, ok := state.(*StateBroadcastWaiting) + require.True(t, ok, "type is %T", state) + require.Equal(t, qid, wstate.QueryID) + + for node, messages := range pending { + var popped tiny.Message + popped, messages = messages[0], messages[1:] + + n := new(big.Int) + n.SetString(popped.Content, 16) + state = sm.Advance(ctx, &EventBroadcastStoreRecordSuccess[tiny.Key, tiny.Node, tiny.Message]{ + NodeID: node, + Target: tiny.Key(n.Int64()), + Request: popped, + Response: popped, + }) + + tstate, ok := state.(*StateBroadcastStoreRecord[tiny.Key, tiny.Node, tiny.Message]) + require.True(t, ok, "type is %T", state) + _ = tstate + + state := sm.Advance(ctx, &EventBroadcastPoll{}) + wstate, ok := state.(*StateBroadcastWaiting) + require.True(t, ok, "type is %T", state) + require.Equal(t, qid, wstate.QueryID) + } +} diff --git a/internal/coord/brdcst/otm.go b/internal/coord/brdcst/otm.go new file mode 100644 index 0000000..4f1a4e7 --- /dev/null +++ b/internal/coord/brdcst/otm.go @@ -0,0 +1,146 @@ +package brdcst + +import ( + "context" + "fmt" + + "github.com/plprobelab/go-libdht/kad" + 
"go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + + "github.com/plprobelab/zikade/internal/coord/coordt" + "github.com/plprobelab/zikade/tele" +) + +// OneToMany is a [Broadcast] state machine and encapsulates the logic around +// doing a ONE put operation to MANY preconfigured nodes. That static set of +// nodes is given by the list of seed nodes in the [EventBroadcastStart] event. +type OneToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { + // the unique ID for this broadcast operation + queryID coordt.QueryID + + // a struct holding configuration options + cfg *ConfigOneToMany[K] + + // the message generator that takes a target key and will return the message + // that we will send to the closest nodes in the follow-up phase + msgFunc func(K) M + + // nodes we still need to store records with. This map will be filled with + // all the closest nodes after the query has finished. + todo map[string]N + + // nodes we have contacted to store the record but haven't heard a response yet + waiting map[string]N + + // nodes that successfully hold the record for us + success map[string]N + + // nodes that failed to hold the record for us + failed map[string]struct { + Node N + Err error + } +} + +// NewOneToMany initializes a new [OneToMany] struct. +func NewOneToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.QueryID, msgFunc func(K) M, seed []N, cfg *ConfigOneToMany[K]) *OneToMany[K, N, M] { + otm := &OneToMany[K, N, M]{ + queryID: qid, + cfg: cfg, + msgFunc: msgFunc, + todo: map[string]N{}, + waiting: map[string]N{}, + success: map[string]N{}, + failed: map[string]struct { + Node N + Err error + }{}, + } + + for _, s := range seed { + otm.todo[s.String()] = s + } + + return otm +} + +// Advance advances the state of the [OneToMany] [Broadcast] state machine. 
+func (otm *OneToMany[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) (out BroadcastState) { + _, span := tele.StartSpan(ctx, "OneToMany.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) + defer func() { + span.SetAttributes( + tele.AttrOutEvent(out), + attribute.Int("todo", len(otm.todo)), + attribute.Int("waiting", len(otm.waiting)), + attribute.Int("success", len(otm.success)), + attribute.Int("failed", len(otm.failed)), + ) + span.End() + }() + + switch ev := ev.(type) { + case *EventBroadcastStop: + for _, n := range otm.todo { + delete(otm.todo, n.String()) + otm.failed[n.String()] = struct { + Node N + Err error + }{Node: n, Err: fmt.Errorf("cancelled")} + } + + for _, n := range otm.waiting { + delete(otm.waiting, n.String()) + otm.failed[n.String()] = struct { + Node N + Err error + }{Node: n, Err: fmt.Errorf("cancelled")} + } + case *EventBroadcastStoreRecordSuccess[K, N, M]: + delete(otm.waiting, ev.NodeID.String()) + otm.success[ev.NodeID.String()] = ev.NodeID + case *EventBroadcastStoreRecordFailure[K, N, M]: + delete(otm.waiting, ev.NodeID.String()) + otm.failed[ev.NodeID.String()] = struct { + Node N + Err error + }{Node: ev.NodeID, Err: ev.Error} + case *EventBroadcastPoll: + // ignore, nothing to do + default: + panic(fmt.Sprintf("unexpected event: %T", ev)) + } + + for k, n := range otm.todo { + delete(otm.todo, k) + otm.waiting[k] = n + return &StateBroadcastStoreRecord[K, N, M]{ + QueryID: otm.queryID, + NodeID: n, + Target: otm.cfg.Target, + Message: otm.msgFunc(otm.cfg.Target), + } + } + + if len(otm.waiting) > 0 { + return &StateBroadcastWaiting{} + } + + if len(otm.todo) == 0 { + contacted := make([]N, 0, len(otm.success)+len(otm.failed)) + for _, n := range otm.success { + contacted = append(contacted, n) + } + for _, n := range otm.failed { + contacted = append(contacted, n.Node) + } + + return &StateBroadcastFinished[K, N]{ + QueryID: otm.queryID, + Contacted: contacted, + Errors: otm.failed, + } + } + + return &StateBroadcastIdle{} +} diff --git a/internal/coord/brdcst/pool.go b/internal/coord/brdcst/pool.go index d990392..af78734 100644 --- a/internal/coord/brdcst/pool.go +++ b/internal/coord/brdcst/pool.go @@ -119,19 +119,16 @@ func (p *Pool[K, N, M]) handleEvent(ctx context.Context, ev PoolEvent) (sm Broad case *EventPoolStartBroadcast[K, N, M]: // first initialize the state machine for the broadcast desired strategy switch cfg := ev.Config.(type) { - case *ConfigFollowUp: - p.bcs[ev.QueryID] = NewFollowUp[K, N, M](ev.QueryID, p.qp, ev.Message, cfg) - case *ConfigStatic: - p.bcs[ev.QueryID] = NewStatic[K, N, M](ev.QueryID, ev.Message, cfg) - case *ConfigOptimistic: - panic("implement me") + case *ConfigFollowUp[K]: + p.bcs[ev.QueryID] = NewFollowUp[K, N, M](ev.QueryID, ev.MsgFunc, p.qp, ev.Seed, cfg) + case *ConfigOneToMany[K]: + p.bcs[ev.QueryID] = NewOneToMany[K, N, M](ev.QueryID, ev.MsgFunc, ev.Seed, cfg) + case *ConfigManyToMany[K]: + p.bcs[ev.QueryID] = NewManyToMany[K, N, M](ev.QueryID, ev.MsgFunc, ev.Seed, cfg) } // start the new state machine - return p.bcs[ev.QueryID], &EventBroadcastStart[K, N]{ - Target: ev.Target, - Seed: ev.Seed, - } + return p.bcs[ev.QueryID], &EventBroadcastPoll{} case *EventPoolStopBroadcast: return p.bcs[ev.QueryID], &EventBroadcastStop{} @@ -151,6 +148,7 @@ func (p *Pool[K, N, M]) handleEvent(ctx context.Context, ev PoolEvent) (sm Broad case *EventPoolStoreRecordSuccess[K, N, M]: return p.bcs[ev.QueryID], &EventBroadcastStoreRecordSuccess[K, N, M]{ NodeID: ev.NodeID, + Target: ev.Target, Request: ev.Request, 
Response: ev.Response, } @@ -158,6 +156,7 @@ func (p *Pool[K, N, M]) handleEvent(ctx context.Context, ev PoolEvent) (sm Broad case *EventPoolStoreRecordFailure[K, N, M]: return p.bcs[ev.QueryID], &EventBroadcastStoreRecordFailure[K, N, M]{ NodeID: ev.NodeID, + Target: ev.Target, Request: ev.Request, Error: ev.Error, } @@ -193,6 +192,7 @@ func (p *Pool[K, N, M]) advanceBroadcast(ctx context.Context, sm Broadcast, bev return &StatePoolStoreRecord[K, N, M]{ QueryID: st.QueryID, NodeID: st.NodeID, + Target: st.Target, Message: st.Message, }, true case *StateBroadcastFinished[K, N]: @@ -235,7 +235,8 @@ type StatePoolWaiting struct{} type StatePoolStoreRecord[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the broadcast operation that wants to send the message NodeID N // the node to send the message to - Message M // the message that should be sent to the remote node + Target K + Message M // the message that should be sent to the remote node } // StatePoolBroadcastFinished indicates that the broadcast operation with the @@ -283,8 +284,7 @@ type EventPoolPoll struct{} // operation. This is the entry point. type EventPoolStartBroadcast[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the unique ID for this operation - Target K // the key we want to store the record for - Message M // the message that we want to send to the closest peers (this encapsulates the payload we want to store) + MsgFunc func(K) M // a message generator that takes a target key and returns the message we will send out Seed []N // the closest nodes we know so far and from where we start the operation Config Config // the configuration for this operation. Most importantly, this defines the broadcast strategy ([FollowUp] or [Static]) } @@ -324,8 +324,9 @@ type EventPoolGetCloserNodesFailure[K kad.Key[K], N kad.NodeID[K]] struct { type EventPoolStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the query that sent the message NodeID N // the node the message was sent to - Request M // the message that was sent to the remote node - Response M // the reply we got from the remote node (nil in many cases of the Amino DHT) + Target K + Request M // the message that was sent to the remote node + Response M // the reply we got from the remote node (nil in many cases of the Amino DHT) } // EventPoolStoreRecordFailure noties the broadcast [Pool] that storing a record @@ -334,8 +335,9 @@ type EventPoolStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Message type EventPoolStoreRecordFailure[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the query that sent the message NodeID N // the node the message was sent to - Request M // the message that was sent to the remote node - Error error // the error that caused the failure + Target K + Request M // the message that was sent to the remote node + Error error // the error that caused the failure } // poolEvent() ensures that only events accepted by a broadcast [Pool] can be diff --git a/internal/coord/brdcst/pool_test.go b/internal/coord/brdcst/pool_test.go index eb2b6e9..0ac0bcd 100644 --- a/internal/coord/brdcst/pool_test.go +++ b/internal/coord/brdcst/pool_test.go @@ -54,10 +54,9 @@ func TestPool_FollowUp_lifecycle(t *testing.T) { state := p.Advance(ctx, &EventPoolStartBroadcast[tiny.Key, tiny.Node, tiny.Message]{ QueryID: queryID, - Target: target, - Message: msg, + MsgFunc: func(t tiny.Key) 
tiny.Message { return msg }, Seed: []tiny.Node{a}, - Config: DefaultConfigFollowUp(), + Config: DefaultConfigFollowUp(target), }) // the query should attempt to contact the node it was given @@ -191,10 +190,9 @@ func TestPool_FollowUp_stop_during_query(t *testing.T) { state := p.Advance(ctx, &EventPoolStartBroadcast[tiny.Key, tiny.Node, tiny.Message]{ QueryID: queryID, - Target: target, - Message: msg, + MsgFunc: func(t tiny.Key) tiny.Message { return msg }, Seed: []tiny.Node{a}, - Config: DefaultConfigFollowUp(), + Config: DefaultConfigFollowUp(target), }) // the query should attempt to contact the node it was given @@ -235,10 +233,9 @@ func TestPool_FollowUp_stop_during_followup_phase(t *testing.T) { state := p.Advance(ctx, &EventPoolStartBroadcast[tiny.Key, tiny.Node, tiny.Message]{ QueryID: queryID, - Target: target, - Message: msg, + MsgFunc: func(t tiny.Key) tiny.Message { return msg }, Seed: []tiny.Node{a, b}, - Config: DefaultConfigFollowUp(), + Config: DefaultConfigFollowUp(target), }) require.IsType(t, &StatePoolFindCloser[tiny.Key, tiny.Node]{}, state) @@ -288,13 +285,12 @@ func TestPool_empty_seed(t *testing.T) { startEvt := &EventPoolStartBroadcast[tiny.Key, tiny.Node, tiny.Message]{ QueryID: queryID, - Target: target, - Message: msg, + MsgFunc: func(t tiny.Key) tiny.Message { return msg }, Seed: []tiny.Node{}, } t.Run("follow up", func(t *testing.T) { - startEvt.Config = DefaultConfigFollowUp() + startEvt.Config = DefaultConfigFollowUp(target) state := p.Advance(ctx, startEvt) require.IsType(t, &StatePoolBroadcastFinished[tiny.Key, tiny.Node]{}, state) @@ -304,7 +300,7 @@ func TestPool_empty_seed(t *testing.T) { }) t.Run("static", func(t *testing.T) { - startEvt.Config = DefaultConfigStatic() + startEvt.Config = DefaultConfigOneToMany(target) state := p.Advance(ctx, startEvt) require.IsType(t, &StatePoolBroadcastFinished[tiny.Key, tiny.Node]{}, state) @@ -332,10 +328,9 @@ func TestPool_Static_happy_path(t *testing.T) { state := p.Advance(ctx, &EventPoolStartBroadcast[tiny.Key, tiny.Node, tiny.Message]{ QueryID: queryID, - Target: target, - Message: msg, + MsgFunc: func(t tiny.Key) tiny.Message { return msg }, Seed: []tiny.Node{a, b, c}, - Config: DefaultConfigStatic(), + Config: DefaultConfigOneToMany(target), }) spsr, ok := state.(*StatePoolStoreRecord[tiny.Key, tiny.Node, tiny.Message]) require.True(t, ok, "state is %T", state) @@ -389,10 +384,9 @@ func TestPool_Static_stop_mid_flight(t *testing.T) { state := p.Advance(ctx, &EventPoolStartBroadcast[tiny.Key, tiny.Node, tiny.Message]{ QueryID: queryID, - Target: target, - Message: msg, + MsgFunc: func(t tiny.Key) tiny.Message { return msg }, Seed: []tiny.Node{a, b, c}, - Config: DefaultConfigStatic(), + Config: DefaultConfigOneToMany(target), }) require.IsType(t, &StatePoolStoreRecord[tiny.Key, tiny.Node, tiny.Message]{}, state) diff --git a/internal/coord/brdcst/static.go b/internal/coord/brdcst/static.go deleted file mode 100644 index 0c115be..0000000 --- a/internal/coord/brdcst/static.go +++ /dev/null @@ -1,143 +0,0 @@ -package brdcst - -import ( - "context" - "fmt" - - "github.com/plprobelab/go-libdht/kad" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/trace" - - "github.com/plprobelab/zikade/internal/coord/coordt" - "github.com/plprobelab/zikade/tele" -) - -// Static is a [Broadcast] state machine and encapsulates the logic around -// doing a put operation to a static set of nodes. That static set of nodes -// is given by the list of seed nodes in the [EventBroadcastStart] event. 
-type Static[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { - // the unique ID for this broadcast operation - queryID coordt.QueryID - - // a struct holding configuration options - cfg *ConfigStatic - - // the message that we will send to the closest nodes in the follow-up phase - msg M - - // nodes we still need to store records with. This map will be filled with - // all the closest nodes after the query has finished. - todo map[string]N - - // nodes we have contacted to store the record but haven't heard a response yet - waiting map[string]N - - // nodes that successfully hold the record for us - success map[string]N - - // nodes that failed to hold the record for us - failed map[string]struct { - Node N - Err error - } -} - -// NewStatic initializes a new [Static] struct. -func NewStatic[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.QueryID, msg M, cfg *ConfigStatic) *Static[K, N, M] { - return &Static[K, N, M]{ - queryID: qid, - cfg: cfg, - msg: msg, - todo: map[string]N{}, - waiting: map[string]N{}, - success: map[string]N{}, - failed: map[string]struct { - Node N - Err error - }{}, - } -} - -// Advance advances the state of the [Static] [Broadcast] state machine. -func (f *Static[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) (out BroadcastState) { - _, span := tele.StartSpan(ctx, "Static.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) - defer func() { - span.SetAttributes( - tele.AttrOutEvent(out), - attribute.Int("todo", len(f.todo)), - attribute.Int("waiting", len(f.waiting)), - attribute.Int("success", len(f.success)), - attribute.Int("failed", len(f.failed)), - ) - span.End() - }() - - switch ev := ev.(type) { - case *EventBroadcastStart[K, N]: - span.SetAttributes(attribute.Int("seed", len(ev.Seed))) - for _, seed := range ev.Seed { - f.todo[seed.String()] = seed - } - case *EventBroadcastStop: - for _, n := range f.todo { - delete(f.todo, n.String()) - f.failed[n.String()] = struct { - Node N - Err error - }{Node: n, Err: fmt.Errorf("cancelled")} - } - - for _, n := range f.waiting { - delete(f.waiting, n.String()) - f.failed[n.String()] = struct { - Node N - Err error - }{Node: n, Err: fmt.Errorf("cancelled")} - } - case *EventBroadcastStoreRecordSuccess[K, N, M]: - delete(f.waiting, ev.NodeID.String()) - f.success[ev.NodeID.String()] = ev.NodeID - case *EventBroadcastStoreRecordFailure[K, N, M]: - delete(f.waiting, ev.NodeID.String()) - f.failed[ev.NodeID.String()] = struct { - Node N - Err error - }{Node: ev.NodeID, Err: ev.Error} - case *EventBroadcastPoll: - // ignore, nothing to do - default: - panic(fmt.Sprintf("unexpected event: %T", ev)) - } - - for k, n := range f.todo { - delete(f.todo, k) - f.waiting[k] = n - return &StateBroadcastStoreRecord[K, N, M]{ - QueryID: f.queryID, - NodeID: n, - Message: f.msg, - } - } - - if len(f.waiting) > 0 { - return &StateBroadcastWaiting{} - } - - if len(f.todo) == 0 { - contacted := make([]N, 0, len(f.success)+len(f.failed)) - for _, n := range f.success { - contacted = append(contacted, n) - } - for _, n := range f.failed { - contacted = append(contacted, n.Node) - } - - return &StateBroadcastFinished[K, N]{ - QueryID: f.queryID, - Contacted: contacted, - Errors: f.failed, - } - } - - return &StateBroadcastIdle{} -} diff --git a/internal/coord/brdcst_events.go b/internal/coord/brdcst_events.go deleted file mode 100644 index 60b44f8..0000000 --- a/internal/coord/brdcst_events.go +++ /dev/null @@ -1,35 +0,0 @@ -package coord - -import ( - 
"github.com/plprobelab/zikade/internal/coord/brdcst" - "github.com/plprobelab/zikade/internal/coord/coordt" - "github.com/plprobelab/zikade/kadt" - "github.com/plprobelab/zikade/pb" -) - -// EventStartBroadcast starts a new -type EventStartBroadcast struct { - QueryID coordt.QueryID - Target kadt.Key - Message *pb.Message - Seed []kadt.PeerID - Config brdcst.Config - Notify QueryMonitor[*EventBroadcastFinished] -} - -func (*EventStartBroadcast) behaviourEvent() {} - -// EventBroadcastFinished is emitted by the coordinator when a broadcasting -// a record to the network has finished, either through running to completion or -// by being canceled. -type EventBroadcastFinished struct { - QueryID coordt.QueryID - Contacted []kadt.PeerID - Errors map[string]struct { - Node kadt.PeerID - Err error - } -} - -func (*EventBroadcastFinished) behaviourEvent() {} -func (*EventBroadcastFinished) terminalQueryEvent() {} diff --git a/internal/coord/coordinator.go b/internal/coord/coordinator.go index 08d2b17..dad36d8 100644 --- a/internal/coord/coordinator.go +++ b/internal/coord/coordinator.go @@ -333,7 +333,7 @@ func (c *Coordinator) QueryClosest(ctx context.Context, target kadt.Key, fn coor return nil, coordt.QueryStats{}, err } - waiter := NewQueryWaiter(numResults) + waiter := NewQueryWaiter(cfg.NumResults) queryID := c.newOperationID() cmd := &EventStartFindCloserQuery{ @@ -383,7 +383,7 @@ func (c *Coordinator) QueryMessage(ctx context.Context, msg *pb.Message, fn coor return nil, coordt.QueryStats{}, err } - waiter := NewQueryWaiter(numResults) + waiter := NewQueryWaiter(cfg.NumResults) queryID := c.newOperationID() cmd := &EventStartMessageQuery{ @@ -403,31 +403,21 @@ func (c *Coordinator) QueryMessage(ctx context.Context, msg *pb.Message, fn coor return closest, stats, err } -func (c *Coordinator) BroadcastRecord(ctx context.Context, msg *pb.Message) error { - ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.BroadcastRecord") - defer span.End() - if msg == nil { - return fmt.Errorf("no message supplied for broadcast") - } - c.cfg.Logger.Debug("starting broadcast with message", tele.LogAttrKey(msg.Target()), slog.String("type", msg.Type.String())) - - ctx, cancel := context.WithCancel(ctx) - defer cancel() +func (c *Coordinator) BroadcastRecord(ctx context.Context, msg *pb.Message, seed []kadt.PeerID) error { + msgFunc := func(k kadt.Key) *pb.Message { return msg } + return c.broadcast(ctx, msgFunc, seed, brdcst.DefaultConfigFollowUp(msg.Target())) +} - seeds, err := c.GetClosestNodes(ctx, msg.Target(), 20) // TODO: parameterize - if err != nil { - return err - } - return c.broadcast(ctx, msg, seeds, brdcst.DefaultConfigFollowUp()) +func (c *Coordinator) BroadcastStatic(ctx context.Context, msg *pb.Message, seed []kadt.PeerID) error { + msgFunc := func(k kadt.Key) *pb.Message { return msg } + return c.broadcast(ctx, msgFunc, seed, brdcst.DefaultConfigOneToMany(msg.Target())) } -func (c *Coordinator) BroadcastStatic(ctx context.Context, msg *pb.Message, seeds []kadt.PeerID) error { - ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.BroadcastStatic") - defer span.End() - return c.broadcast(ctx, msg, seeds, brdcst.DefaultConfigStatic()) +func (c *Coordinator) BroadcastMany(ctx context.Context, keys []kadt.Key, seed []kadt.PeerID, msgFn func(k kadt.Key) *pb.Message) error { + return c.broadcast(ctx, msgFn, seed, brdcst.DefaultConfigManyToMany(keys)) } -func (c *Coordinator) broadcast(ctx context.Context, msg *pb.Message, seeds []kadt.PeerID, cfg brdcst.Config) error { +func (c *Coordinator) 
broadcast(ctx context.Context, msgFunc func(k kadt.Key) *pb.Message, seeds []kadt.PeerID, cfg brdcst.Config) error { ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.broadcast") defer span.End() @@ -439,8 +429,7 @@ func (c *Coordinator) broadcast(ctx context.Context, msg *pb.Message, seeds []ka cmd := &EventStartBroadcast{ QueryID: queryID, - Target: msg.Target(), - Message: msg, + MsgFunc: msgFunc, Seed: seeds, Notify: waiter, Config: cfg, diff --git a/internal/coord/event.go b/internal/coord/event.go index e661247..7057cfe 100644 --- a/internal/coord/event.go +++ b/internal/coord/event.go @@ -1,6 +1,7 @@ package coord import ( + "github.com/plprobelab/zikade/internal/coord/brdcst" "github.com/plprobelab/zikade/internal/coord/coordt" "github.com/plprobelab/zikade/internal/coord/query" "github.com/plprobelab/zikade/kadt" @@ -29,12 +30,6 @@ type QueryCommand interface { queryCommand() } -// BrdcstCommand is a type of [BehaviourEvent] that instructs a [BrdcstBehaviour] to perform an action. -type BrdcstCommand interface { - BehaviourEvent - brdcstCommand() -} - type NodeHandlerRequest interface { BehaviourEvent nodeHandlerRequest() @@ -77,6 +72,7 @@ func (*EventOutboundGetCloserNodes) networkCommand() {} type EventOutboundSendMessage struct { QueryID coordt.QueryID To kadt.PeerID + Target kadt.Key Message *pb.Message Notify Notify[BehaviourEvent] } @@ -86,25 +82,25 @@ func (*EventOutboundSendMessage) nodeHandlerRequest() {} func (*EventOutboundSendMessage) networkCommand() {} type EventStartMessageQuery struct { - QueryID coordt.QueryID - Target kadt.Key - Message *pb.Message - KnownClosestNodes []kadt.PeerID - Notify QueryMonitor[*EventQueryFinished] - NumResults int // the minimum number of nodes to successfully contact before considering iteration complete - Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. + QueryID coordt.QueryID + Target kadt.Key + Message *pb.Message + Seed []kadt.PeerID + Notify QueryMonitor[*EventQueryFinished] + NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. } func (*EventStartMessageQuery) behaviourEvent() {} func (*EventStartMessageQuery) queryCommand() {} type EventStartFindCloserQuery struct { - QueryID coordt.QueryID - Target kadt.Key - KnownClosestNodes []kadt.PeerID - Notify QueryMonitor[*EventQueryFinished] - NumResults int // the minimum number of nodes to successfully contact before considering iteration complete - Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. + QueryID coordt.QueryID + Target kadt.Key + Seed []kadt.PeerID + Notify QueryMonitor[*EventQueryFinished] + NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. } func (*EventStartFindCloserQuery) behaviourEvent() {} @@ -157,6 +153,7 @@ type EventSendMessageSuccess struct { Request *pb.Message To kadt.PeerID // To is the peer that the SendMessage request was sent to. 
Response *pb.Message + Target kadt.Key CloserNodes []kadt.PeerID } @@ -254,3 +251,35 @@ type EventStartCrawl struct { } func (*EventStartCrawl) behaviourEvent() {} + +// BrdcstCommand is a type of [BehaviourEvent] that instructs a [BrdcstBehaviour] to perform an action. +type BrdcstCommand interface { + BehaviourEvent + brdcstCommand() +} + +// EventStartBroadcast starts a new +type EventStartBroadcast struct { + QueryID coordt.QueryID + MsgFunc func(k kadt.Key) *pb.Message + Seed []kadt.PeerID + Config brdcst.Config + Notify QueryMonitor[*EventBroadcastFinished] +} + +func (*EventStartBroadcast) behaviourEvent() {} + +// EventBroadcastFinished is emitted by the coordinator when a broadcasting +// a record to the network has finished, either through running to completion or +// by being canceled. +type EventBroadcastFinished struct { + QueryID coordt.QueryID + Contacted []kadt.PeerID + Errors map[string]struct { + Node kadt.PeerID + Err error + } +} + +func (*EventBroadcastFinished) behaviourEvent() {} +func (*EventBroadcastFinished) terminalQueryEvent() {} diff --git a/internal/coord/network.go b/internal/coord/network.go index 4dc81a7..f70f3bd 100644 --- a/internal/coord/network.go +++ b/internal/coord/network.go @@ -166,6 +166,7 @@ func (h *NodeHandler) send(ctx context.Context, ev NodeHandlerRequest) bool { QueryID: cmd.QueryID, To: h.self, Request: cmd.Message, + Target: cmd.Target, Err: fmt.Errorf("NodeHandler: %w", err), }) return false @@ -176,6 +177,7 @@ func (h *NodeHandler) send(ctx context.Context, ev NodeHandlerRequest) bool { To: h.self, Request: cmd.Message, Response: resp, + Target: cmd.Target, CloserNodes: resp.CloserNodes(), }) default: diff --git a/internal/coord/query.go b/internal/coord/query.go index 62eb82c..8244c45 100644 --- a/internal/coord/query.go +++ b/internal/coord/query.go @@ -391,6 +391,7 @@ func (p *PooledQueryBehaviour) advancePool(ctx context.Context, ev query.PoolEve return &EventOutboundSendMessage{ QueryID: st.QueryID, To: st.NodeID, + Target: st.Target, Message: st.Message, Notify: p, }, true diff --git a/internal/coord/query/pool.go b/internal/coord/query/pool.go index 9f891cf..1821c8a 100644 --- a/internal/coord/query/pool.go +++ b/internal/coord/query/pool.go @@ -209,6 +209,7 @@ func (p *Pool[K, N, M]) advanceQuery(ctx context.Context, qry *Query[K, N, M], q QueryID: st.QueryID, Stats: st.Stats, NodeID: st.NodeID, + Target: st.Target, Message: st.Message, }, true case *StateQueryFinished[K, N]: @@ -350,6 +351,7 @@ type StatePoolFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { type StatePoolSendMessage[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID NodeID N // the node to send the message to + Target K Message M Stats QueryStats } diff --git a/internal/coord/query/query.go b/internal/coord/query/query.go index 7f0348d..f82803c 100644 --- a/internal/coord/query/query.go +++ b/internal/coord/query/query.go @@ -239,6 +239,7 @@ func (q *Query[K, N, M]) Advance(ctx context.Context, ev QueryEvent) (out QueryS returnState = &StateQuerySendMessage[K, N, M]{ NodeID: ni.NodeID, QueryID: q.id, + Target: q.target, Stats: q.stats, Message: q.msg, } @@ -399,6 +400,7 @@ type StateQueryFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { type StateQuerySendMessage[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID NodeID N // the node to send the message to + Target K Message M Stats QueryStats } diff --git a/internal/coord/query_test.go b/internal/coord/query_test.go index 40d285b..83afe34 100644 --- 
a/internal/coord/query_test.go +++ b/internal/coord/query_test.go @@ -116,11 +116,11 @@ func (ts *QueryBehaviourBaseTestSuite) TestNotifiesNoProgress() { waiter := NewQueryWaiter(5) cmd := &EventStartFindCloserQuery{ - QueryID: "test", - Target: target, - KnownClosestNodes: seeds, - Notify: waiter, - NumResults: 10, + QueryID: "test", + Target: target, + Seed: seeds, + Notify: waiter, + NumResults: 10, } // queue the start of the query @@ -163,11 +163,11 @@ func (ts *QueryBehaviourBaseTestSuite) TestNotifiesQueryProgressed() { waiter := NewQueryWaiter(5) cmd := &EventStartFindCloserQuery{ - QueryID: "test", - Target: target, - KnownClosestNodes: seeds, - Notify: waiter, - NumResults: 10, + QueryID: "test", + Target: target, + Seed: seeds, + Notify: waiter, + NumResults: 10, } // queue the start of the query @@ -211,11 +211,11 @@ func (ts *QueryBehaviourBaseTestSuite) TestNotifiesQueryFinished() { waiter := NewQueryWaiter(5) cmd := &EventStartFindCloserQuery{ - QueryID: "test", - Target: target, - KnownClosestNodes: seeds, - Notify: waiter, - NumResults: 10, + QueryID: "test", + Target: target, + Seed: seeds, + Notify: waiter, + NumResults: 10, } // queue the start of the query @@ -297,6 +297,7 @@ func TestPooledQuery_deadlock_regression(t *testing.T) { QueryID: queryID, Request: msg, To: to, + Target: msg.Target(), Response: nil, CloserNodes: closer, } @@ -319,12 +320,12 @@ func TestPooledQuery_deadlock_regression(t *testing.T) { // start the message query c.queryBehaviour.Notify(ctx, &EventStartMessageQuery{ - QueryID: queryID, - Target: msg.Target(), - Message: msg, - KnownClosestNodes: []kadt.PeerID{nodes[1].NodeID}, - Notify: wrappedWaiter, - NumResults: 0, + QueryID: queryID, + Target: msg.Target(), + Message: msg, + Seed: []kadt.PeerID{nodes[1].NodeID}, + Notify: wrappedWaiter, + NumResults: 0, }) // advance state machines and assert that the state machine diff --git a/routing.go b/routing.go index 251b686..eb0a874 100644 --- a/routing.go +++ b/routing.go @@ -106,7 +106,7 @@ func (d *DHT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { } // construct message - addrInfo := peer.AddrInfo{ + self := peer.AddrInfo{ ID: d.host.ID(), Addrs: d.host.Addrs(), } @@ -115,12 +115,17 @@ func (d *DHT) Provide(ctx context.Context, c cid.Cid, brdcst bool) error { Type: pb.Message_ADD_PROVIDER, Key: c.Hash(), ProviderPeers: []*pb.Message_Peer{ - pb.FromAddrInfo(addrInfo), + pb.FromAddrInfo(self), }, } + seed, err := d.kad.GetClosestNodes(ctx, msg.Target(), d.cfg.BucketSize) + if err != nil { + return fmt.Errorf("getting closest nodes: %w", err) + } + // finally, find the closest peers to the target key. - return d.kad.BroadcastRecord(ctx, msg) + return d.kad.BroadcastRecord(ctx, msg, seed) } func (d *DHT) FindProvidersAsync(ctx context.Context, c cid.Cid, count int) <-chan peer.AddrInfo { @@ -261,13 +266,13 @@ func (d *DHT) PutValue(ctx context.Context, keyStr string, value []byte, opts .. Record: record.MakePutRecord(keyStr, value), } - // finally, find the closest peers to the target key. - err := d.kad.BroadcastRecord(ctx, msg) + seed, err := d.kad.GetClosestNodes(ctx, msg.Target(), d.cfg.BucketSize) if err != nil { - return fmt.Errorf("query error: %w", err) + return fmt.Errorf("getting closest nodes: %w", err) } - return nil + // finally, find the closest peers to the target key. 
+ return d.kad.BroadcastRecord(ctx, msg, seed) } // putValueLocal stores a value in the local datastore without reaching out to From 16a4aa670e05b69f74d59699ed0cf61d4d9b3c96 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 13 Oct 2023 19:04:42 +0200 Subject: [PATCH 13/23] WIP --- fullrt_test.go | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/fullrt_test.go b/fullrt_test.go index 65187e7..9715d2c 100644 --- a/fullrt_test.go +++ b/fullrt_test.go @@ -1,23 +1,15 @@ package zikade -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -func TestNewFullRT(t *testing.T) { - cfg := &FullRTConfig{ - Config: DefaultConfig(), - CrawlInterval: 60 * time.Minute, - } - h := newTestHost(t) - fullRT, err := NewFullRT(h, cfg) - require.NoError(t, err) - - fullRT.Bootstrap(context.Background()) - - time.Sleep(time.Hour) -} +//func TestNewFullRT(t *testing.T) { +// cfg := &FullRTConfig{ +// Config: DefaultConfig(), +// CrawlInterval: 60 * time.Minute, +// } +// h := newTestHost(t) +// fullRT, err := NewFullRT(h, cfg) +// require.NoError(t, err) +// +// fullRT.Bootstrap(context.Background()) +// +// time.Sleep(time.Hour) +//} From 7d2f3f46330f2c4e01d8d024959d7c6009c17602 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 17 Oct 2023 13:19:58 +0200 Subject: [PATCH 14/23] WIP --- internal/coord/brdcst/mtm.go | 118 ++++++++----- internal/coord/brdcst/mtm_test.go | 267 ++++++++++++++++++++++++++++-- internal/coord/query/query.go | 2 + 3 files changed, 333 insertions(+), 54 deletions(-) diff --git a/internal/coord/brdcst/mtm.go b/internal/coord/brdcst/mtm.go index d1e23a1..9aad9da 100644 --- a/internal/coord/brdcst/mtm.go +++ b/internal/coord/brdcst/mtm.go @@ -24,22 +24,35 @@ type ManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { // a struct holding configuration options cfg *ConfigManyToMany[K] - // TODO + // keyReports tracks for each key this [ManyToMany] state machine should + // broadcast the number of successes and failures. keyReports map[string]*report - // TODO + // unprocessedNodes is a map from a node's ID to its [nodeState]. The + // [nodeState] contains information about all the keys that should be + // stored with that node, as well as, a map of all inflight requests and + // all keys that have already been tried to store with that node. unprocessedNodes map[string]*nodeState[K, N] - // TODO + // inflightWithCapacity holds information about nodes that we are currently + // contacting but still have capacity to receive more requests from us. The + // term capacity refers to the number of concurrent streams we can open to + // a single node based on [ConfigManyToMany.StreamConcurrency]. inflightWithCapacity map[string]*nodeState[K, N] - // TODO + // inflightWithCapacity holds information about nodes that we are currently + // contacting with no capacity to receive more concurrent streams. The + // term capacity refers to the number of concurrent streams we can open + // to a single node based on [ConfigManyToMany.StreamConcurrency]. inflightAtCapacity map[string]*nodeState[K, N] - // TODO + // processedNodes is a map from a node's ID to its [nodeState]. All nodes + // in this map have been fully processed. This means that all keys we wanted + // to store with a node have been attempted to be stored with it. 
processedNodes map[string]*nodeState[K, N] - // TODO + // msgFunc takes a key and returns the corresponding message that we will + // need to send to the remote node to store said key. msgFunc func(K) M } @@ -117,29 +130,38 @@ func NewManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.Q } // Advance advances the state of the [ManyToMany] [Broadcast] state machine. -func (otm *ManyToMany[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) (out BroadcastState) { +func (mtm *ManyToMany[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) (out BroadcastState) { _, span := tele.StartSpan(ctx, "ManyToMany.Advance", trace.WithAttributes(tele.AttrInEvent(ev))) - defer span.End() + defer func() { + span.SetAttributes(tele.AttrOutEvent(out)) + span.End() + }() switch ev := ev.(type) { case *EventBroadcastStop: case *EventBroadcastStoreRecordSuccess[K, N, M]: - if nstate, found := otm.inflightAtCapacity[ev.NodeID.String()]; found { + mapKey := ev.NodeID.String() + if nstate, found := mtm.inflightAtCapacity[mapKey]; found { + delete(mtm.inflightAtCapacity, mapKey) delete(nstate.inflight, key.HexString(ev.Target)) nstate.done = append(nstate.done, ev.Target) - delete(otm.inflightAtCapacity, ev.NodeID.String()) - if len(nstate.todo) == 0 && len(nstate.inflight) == 0 { - otm.processedNodes[ev.NodeID.String()] = nstate - } else { - otm.inflightWithCapacity[ev.NodeID.String()] = nstate + if len(nstate.todo) == 0 { + if len(nstate.inflight) == 0 { + mtm.processedNodes[mapKey] = nstate + } else { + mtm.inflightAtCapacity[mapKey] = nstate + } + } else if len(nstate.inflight) != 0 { + mtm.inflightWithCapacity[mapKey] = nstate } - } else if nstate, found := otm.inflightWithCapacity[ev.NodeID.String()]; found { + } else if nstate, found := mtm.inflightWithCapacity[mapKey]; found { + delete(mtm.inflightWithCapacity, mapKey) delete(nstate.inflight, key.HexString(ev.Target)) nstate.done = append(nstate.done, ev.Target) - if len(nstate.todo) == 0 && len(nstate.inflight) == 0 { - otm.processedNodes[ev.NodeID.String()] = nstate + if len(nstate.todo) != 0 { + mtm.inflightWithCapacity[mapKey] = nstate } } @@ -150,51 +172,67 @@ func (otm *ManyToMany[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) panic(fmt.Sprintf("unexpected event: %T", ev)) } - if len(otm.inflightWithCapacity)+len(otm.inflightAtCapacity) == otm.cfg.NodeConcurrency { - for node, nstate := range otm.inflightWithCapacity { - var popped K - popped, nstate.todo = nstate.todo[0], nstate.todo[1:] + for node, nstate := range mtm.inflightWithCapacity { + var popped K + popped, nstate.todo = nstate.todo[0], nstate.todo[1:] - nstate.inflight[key.HexString(popped)] = popped + nstate.inflight[key.HexString(popped)] = popped - if len(nstate.todo) == 0 || len(nstate.inflight) == otm.cfg.StreamConcurrency { - delete(otm.inflightWithCapacity, node) - otm.inflightAtCapacity[nstate.node.String()] = nstate - } + if len(nstate.todo) == 0 || len(nstate.inflight) == mtm.cfg.StreamConcurrency { + delete(mtm.inflightWithCapacity, node) + mtm.inflightAtCapacity[nstate.node.String()] = nstate + } - return &StateBroadcastStoreRecord[K, N, M]{ - QueryID: otm.queryID, - NodeID: nstate.node, - Target: popped, - Message: otm.msgFunc(popped), - } + return &StateBroadcastStoreRecord[K, N, M]{ + QueryID: mtm.queryID, + NodeID: nstate.node, + Target: popped, + Message: mtm.msgFunc(popped), } + } + // check if we are currently talking to the maximum number of nodes + // concurrently. 
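// (Editor's illustrative note, not part of this patch.) With the configuration
// used by the tests later in this series, NodeConcurrency = 2 and
// StreamConcurrency = 5, this check caps the number of outstanding StoreRecord
// requests at 2 * 5 = 10: at most two nodes sit in the two inflight maps, each
// with at most five inflight keys, before Advance answers further polls with
// StateBroadcastWaiting.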
+ inflightNodes := len(mtm.inflightWithCapacity) + len(mtm.inflightAtCapacity) + if inflightNodes == mtm.cfg.NodeConcurrency || (inflightNodes > 0 && len(mtm.unprocessedNodes) == 0) { return &StateBroadcastWaiting{ - QueryID: otm.queryID, + QueryID: mtm.queryID, } } - for nodeStr, nstate := range otm.unprocessedNodes { - delete(otm.unprocessedNodes, nodeStr) + // we still have the capacity to contact more nodes + for nodeStr, nstate := range mtm.unprocessedNodes { + delete(mtm.unprocessedNodes, nodeStr) var popped K popped, nstate.todo = nstate.todo[0], nstate.todo[1:] nstate.inflight[key.HexString(popped)] = popped if len(nstate.todo) == 0 { - otm.inflightAtCapacity[nstate.node.String()] = nstate + mtm.inflightAtCapacity[nodeStr] = nstate } else { - otm.inflightWithCapacity[nstate.node.String()] = nstate + mtm.inflightWithCapacity[nodeStr] = nstate } return &StateBroadcastStoreRecord[K, N, M]{ - QueryID: otm.queryID, + QueryID: mtm.queryID, NodeID: nstate.node, Target: popped, - Message: otm.msgFunc(popped), + Message: mtm.msgFunc(popped), } } - return &StateBroadcastIdle{} + contacted := make([]N, 0, len(mtm.processedNodes)) + for _, ns := range mtm.processedNodes { + contacted = append(contacted, ns.node) + } + + return &StateBroadcastFinished[K, N]{ + QueryID: mtm.queryID, + Contacted: contacted, + Errors: map[string]struct { + Node N + Err error + }{}, + } } diff --git a/internal/coord/brdcst/mtm_test.go b/internal/coord/brdcst/mtm_test.go index e3ab4ab..5820f92 100644 --- a/internal/coord/brdcst/mtm_test.go +++ b/internal/coord/brdcst/mtm_test.go @@ -14,31 +14,248 @@ import ( func TestNewManyToMany(t *testing.T) { ctx := context.Background() - targets := make([]tiny.Key, 0, 100) - seed := make([]tiny.Node, 0, 100) - for i := 0; i < 100; i++ { + msgFunc := func(k tiny.Key) tiny.Message { + return tiny.Message{ + Content: k.String(), + } + } + + count := 64 + targets := make([]tiny.Key, 0, count) + seed := make([]tiny.Node, 0, count) + for i := 0; i < count; i++ { seed = append(seed, tiny.NewNode(tiny.Key(i+1))) - targets = append(targets, tiny.Key(i+1)) + targets = append(targets, tiny.Key(count+i+1)) + } + + cfg := DefaultConfigManyToMany(targets) + cfg.NodeConcurrency = 2 + cfg.StreamConcurrency = 5 + + qid := coordt.QueryID("test") + + t.Run("no seed", func(t *testing.T) { + sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, []tiny.Node{}, cfg) + require.Equal(t, qid, sm.queryID) + require.Equal(t, cfg, sm.cfg) + require.NotNil(t, sm.unprocessedNodes) + require.Len(t, sm.unprocessedNodes, 0) + require.NotNil(t, sm.inflightWithCapacity) + require.Len(t, sm.inflightWithCapacity, 0) + require.NotNil(t, sm.inflightAtCapacity) + require.Len(t, sm.inflightAtCapacity, 0) + require.NotNil(t, sm.processedNodes) + require.Len(t, sm.processedNodes, 0) + require.NotNil(t, sm.msgFunc) + + state := sm.Advance(ctx, &EventBroadcastPoll{}) + tstate, ok := state.(*StateBroadcastFinished[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + + require.Equal(t, qid, tstate.QueryID) + require.NotNil(t, tstate.Contacted) + require.Len(t, tstate.Contacted, 0) + require.NotNil(t, tstate.Errors) + require.Len(t, tstate.Errors, 0) + }) + + t.Run("no targets", func(t *testing.T) { + cfg := DefaultConfigManyToMany([]tiny.Key{}) + sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, seed, cfg) + require.Equal(t, qid, sm.queryID) + require.Equal(t, cfg, sm.cfg) + require.Len(t, sm.unprocessedNodes, 0) + require.Len(t, sm.inflightWithCapacity, 0) + require.Len(t, 
sm.inflightAtCapacity, 0) + require.Len(t, sm.processedNodes, 0) + require.NotNil(t, sm.msgFunc) + + state := sm.Advance(ctx, &EventBroadcastPoll{}) + tstate, ok := state.(*StateBroadcastFinished[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + + require.Equal(t, qid, tstate.QueryID) + require.NotNil(t, tstate.Contacted) + require.Len(t, tstate.Contacted, 0) + require.NotNil(t, tstate.Errors) + require.Len(t, tstate.Errors, 0) + }) + + t.Run("bucket sized seed and targets", func(t *testing.T) { + // TODO: make "20" based on some configuration + cfg := DefaultConfigManyToMany(targets[:20]) + sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, seed[:20], cfg) + require.Equal(t, qid, sm.queryID) + require.Equal(t, cfg, sm.cfg) + require.Len(t, sm.unprocessedNodes, 20) + for node, nodeStatus := range sm.unprocessedNodes { + require.Equal(t, node, nodeStatus.node.String()) + require.Len(t, nodeStatus.todo, 20) + require.Len(t, nodeStatus.done, 0) + require.Len(t, nodeStatus.inflight, 0) + } + require.Len(t, sm.inflightWithCapacity, 0) + require.Len(t, sm.inflightAtCapacity, 0) + require.Len(t, sm.processedNodes, 0) + require.NotNil(t, sm.msgFunc) + }) + + t.Run("more seeds than targets", func(t *testing.T) { + // because [seed] has incrementing IDs starting with 1, a key of 0 + // will have seeds 1-20 as their closest nodes. This is asserted in this + // test. + cfg := DefaultConfigManyToMany([]tiny.Key{tiny.Key(0)}) + sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, seed[:40], cfg) + require.Len(t, sm.unprocessedNodes, 20) + + for _, s := range seed[:20] { + nodeStatus := sm.unprocessedNodes[s.String()] + require.NotNil(t, nodeStatus) + require.Equal(t, s.String(), nodeStatus.node.String()) + require.Len(t, nodeStatus.todo, 1) + require.Len(t, nodeStatus.done, 0) + require.Len(t, nodeStatus.inflight, 0) + } + + for _, s := range seed[20:40] { + require.Nil(t, sm.unprocessedNodes[s.String()]) + } + }) +} + +func TestManyToMany_Advance_single_target_single_seed(t *testing.T) { + ctx := context.Background() + msgFunc := func(k tiny.Key) tiny.Message { + return tiny.Message{ + Content: k.String(), + } } + targets := []tiny.Key{tiny.Key(0)} + seed := []tiny.Node{tiny.NewNode(tiny.Key(1))} + + qid := coordt.QueryID("test") + + // create new state machine + sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, seed, DefaultConfigManyToMany(targets)) + require.Len(t, sm.unprocessedNodes, 1) + + state := sm.Advance(ctx, &EventBroadcastPoll{}) + tstate, ok := state.(*StateBroadcastStoreRecord[tiny.Key, tiny.Node, tiny.Message]) + require.True(t, ok, "type is %T", state) + + require.Equal(t, tstate.QueryID, qid) + require.NotNil(t, tstate.NodeID) + require.NotNil(t, tstate.Message) + + state = sm.Advance(ctx, &EventBroadcastStoreRecordSuccess[tiny.Key, tiny.Node, tiny.Message]{ + NodeID: tstate.NodeID, + Target: targets[0], + Request: tstate.Message, + Response: tstate.Message, + }) + fstate, ok := state.(*StateBroadcastFinished[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + require.Equal(t, fstate.QueryID, qid) + require.Equal(t, seed[0], fstate.Contacted[0]) +} + +func TestManyToMany_Advance_multi_target_single_seed(t *testing.T) { + ctx := context.Background() msgFunc := func(k tiny.Key) tiny.Message { return tiny.Message{ Content: k.String(), } } + + targets := []tiny.Key{tiny.Key(0), tiny.Key(2)} + seed := []tiny.Node{tiny.NewNode(tiny.Key(1))} + + qid := coordt.QueryID("test") + + // create new state machine + 
cfg := DefaultConfigManyToMany(targets) + cfg.StreamConcurrency = 1 + + sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, seed, cfg) + require.Len(t, sm.unprocessedNodes, 1) + + state := sm.Advance(ctx, &EventBroadcastPoll{}) + tstate, ok := state.(*StateBroadcastStoreRecord[tiny.Key, tiny.Node, tiny.Message]) + require.True(t, ok, "type is %T", state) + + require.Equal(t, tstate.QueryID, qid) + require.NotNil(t, tstate.NodeID) + require.NotNil(t, tstate.Message) + + n := new(big.Int) + n.SetString(tstate.Message.Content, 16) + state = sm.Advance(ctx, &EventBroadcastStoreRecordSuccess[tiny.Key, tiny.Node, tiny.Message]{ + NodeID: tstate.NodeID, + Target: tiny.Key(n.Int64()), + Request: tstate.Message, + Response: tstate.Message, + }) + tstate, ok = state.(*StateBroadcastStoreRecord[tiny.Key, tiny.Node, tiny.Message]) + require.True(t, ok, "type is %T", state) + + require.Equal(t, tstate.QueryID, qid) + require.NotNil(t, tstate.NodeID) + require.NotNil(t, tstate.Message) + + state = sm.Advance(ctx, &EventBroadcastPoll{}) + require.IsType(t, &StateBroadcastWaiting{}, state) + + n.SetString(tstate.Message.Content, 16) + state = sm.Advance(ctx, &EventBroadcastStoreRecordSuccess[tiny.Key, tiny.Node, tiny.Message]{ + NodeID: tstate.NodeID, + Target: tiny.Key(n.Int64()), + Request: tstate.Message, + Response: tstate.Message, + }) + + fstate, ok := state.(*StateBroadcastFinished[tiny.Key, tiny.Node]) + require.True(t, ok, "type is %T", state) + require.Equal(t, fstate.QueryID, qid) + require.Equal(t, seed[0], fstate.Contacted[0]) +} + +func TestManyToMany_Advance_multi_target_multi_seed(t *testing.T) { + ctx := context.Background() + + msgFunc := func(k tiny.Key) tiny.Message { + return tiny.Message{ + Content: k.String(), + } + } + + count := 64 + targets := make([]tiny.Key, 0, count) + seed := make([]tiny.Node, 0, count) + for i := 0; i < count; i += 2 { + seed = append(seed, tiny.NewNode(tiny.Key(i+1))) + targets = append(targets, tiny.Key(i+2)) + } + cfg := DefaultConfigManyToMany(targets) cfg.NodeConcurrency = 2 cfg.StreamConcurrency = 5 qid := coordt.QueryID("test") + + // create new state machine sm := NewManyToMany[tiny.Key, tiny.Node, tiny.Message](qid, msgFunc, seed, cfg) - require.Len(t, sm.unprocessedNodes, 100) + require.Len(t, sm.unprocessedNodes, 32) + storeRecordRequestCount := 0 + // poll as often until we are at capacity pending := map[tiny.Node][]tiny.Message{} for i := 0; i < cfg.NodeConcurrency*cfg.StreamConcurrency; i++ { state := sm.Advance(ctx, &EventBroadcastPoll{}) tstate, ok := state.(*StateBroadcastStoreRecord[tiny.Key, tiny.Node, tiny.Message]) require.True(t, ok, "type is %T", state) + storeRecordRequestCount += 1 + require.Equal(t, tstate.QueryID, qid) require.NotNil(t, tstate.NodeID) require.NotNil(t, tstate.Message) @@ -48,6 +265,7 @@ func TestNewManyToMany(t *testing.T) { pending[tstate.NodeID] = append(pending[tstate.NodeID], tstate.Message) } + // assert that we're at capacity require.Len(t, pending, cfg.NodeConcurrency) require.Len(t, sm.inflightAtCapacity, cfg.NodeConcurrency) for _, messages := range pending { @@ -57,14 +275,32 @@ func TestNewManyToMany(t *testing.T) { require.Len(t, inflight.inflight, cfg.StreamConcurrency) } + // because we're at capactiy another poll will return waiting. 
state := sm.Advance(ctx, &EventBroadcastPoll{}) wstate, ok := state.(*StateBroadcastWaiting) require.True(t, ok, "type is %T", state) require.Equal(t, qid, wstate.QueryID) - for node, messages := range pending { + for { + if len(pending) == 0 { + break + } + + var ( + node tiny.Node + messages []tiny.Message + ) + + for node, messages = range pending { + break + } + var popped tiny.Message - popped, messages = messages[0], messages[1:] + popped, pending[node] = messages[0], messages[1:] + + if len(pending[node]) == 0 { + delete(pending, node) + } n := new(big.Int) n.SetString(popped.Content, 16) @@ -76,12 +312,15 @@ func TestNewManyToMany(t *testing.T) { }) tstate, ok := state.(*StateBroadcastStoreRecord[tiny.Key, tiny.Node, tiny.Message]) - require.True(t, ok, "type is %T", state) - _ = tstate - - state := sm.Advance(ctx, &EventBroadcastPoll{}) - wstate, ok := state.(*StateBroadcastWaiting) - require.True(t, ok, "type is %T", state) - require.Equal(t, qid, wstate.QueryID) + if !ok { + continue + } + storeRecordRequestCount += 1 + if _, found := pending[tstate.NodeID]; !found { + pending[tstate.NodeID] = []tiny.Message{} + } + pending[tstate.NodeID] = append(pending[tstate.NodeID], tstate.Message) } + + require.Equal(t, len(targets)*20, storeRecordRequestCount) } diff --git a/internal/coord/query/query.go b/internal/coord/query/query.go index f82803c..d2d99e9 100644 --- a/internal/coord/query/query.go +++ b/internal/coord/query/query.go @@ -141,6 +141,8 @@ func (q *Query[K, N, M]) Advance(ctx context.Context, ev QueryEvent) (out QueryS span.End() }() + span.SpanContext() + if q.finished { return &StateQueryFinished[K, N]{ QueryID: q.id, From 7b270b50fb57e08ea2b6f00b4607106a21001016 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 17 Oct 2023 16:45:42 +0200 Subject: [PATCH 15/23] WIP --- internal/coord/brdcst/mtm.go | 95 +++++++++++++++++++------------ internal/coord/brdcst/mtm_test.go | 9 +++ internal/coord/coordinator.go | 16 +++++- 3 files changed, 80 insertions(+), 40 deletions(-) diff --git a/internal/coord/brdcst/mtm.go b/internal/coord/brdcst/mtm.go index 9aad9da..89643c2 100644 --- a/internal/coord/brdcst/mtm.go +++ b/internal/coord/brdcst/mtm.go @@ -26,30 +26,31 @@ type ManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { // keyReports tracks for each key this [ManyToMany] state machine should // broadcast the number of successes and failures. + // TODO: perhaps this is better tracked outside of this state machine? keyReports map[string]*report - // unprocessedNodes is a map from a node's ID to its [nodeState]. The - // [nodeState] contains information about all the keys that should be + // unprocessedNodes is a map from a node's ID to its [NodeState]. The + // [NodeState] contains information about all the keys that should be // stored with that node, as well as, a map of all inflight requests and // all keys that have already been tried to store with that node. - unprocessedNodes map[string]*nodeState[K, N] + unprocessedNodes map[string]*NodeState[K, N] // inflightWithCapacity holds information about nodes that we are currently // contacting but still have capacity to receive more requests from us. The // term capacity refers to the number of concurrent streams we can open to // a single node based on [ConfigManyToMany.StreamConcurrency]. 
- inflightWithCapacity map[string]*nodeState[K, N] + inflightWithCapacity map[string]*NodeState[K, N] // inflightWithCapacity holds information about nodes that we are currently // contacting with no capacity to receive more concurrent streams. The // term capacity refers to the number of concurrent streams we can open // to a single node based on [ConfigManyToMany.StreamConcurrency]. - inflightAtCapacity map[string]*nodeState[K, N] + inflightAtCapacity map[string]*NodeState[K, N] - // processedNodes is a map from a node's ID to its [nodeState]. All nodes + // processedNodes is a map from a node's ID to its [NodeState]. All nodes // in this map have been fully processed. This means that all keys we wanted // to store with a node have been attempted to be stored with it. - processedNodes map[string]*nodeState[K, N] + processedNodes map[string]*NodeState[K, N] // msgFunc takes a key and returns the corresponding message that we will // need to send to the remote node to store said key. @@ -61,7 +62,7 @@ type brdcstManyMapVal[K kad.Key[K], N kad.NodeID[K]] struct { node N } -type nodeState[K kad.Key[K], N kad.NodeID[K]] struct { +type NodeState[K kad.Key[K], N kad.NodeID[K]] struct { node N todo []K inflight map[string]K @@ -81,6 +82,8 @@ func NewManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.Q t.Add(s.Key(), s) } + // TODO: the below is quite expensive for many keys. It's probably worth doing this outside of the event loop + // find out which seed nodes are responsible to hold the provider/put // record for which target key. keyReports := make(map[string]*report, len(cfg.Targets)) @@ -88,7 +91,11 @@ func NewManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.Q for _, target := range cfg.Targets { entries := trie.Closest(t, target, 20) // TODO: make configurable targetMapKey := key.HexString(target) - keyReports[targetMapKey] = &report{} + + if len(entries) > 0 { + keyReports[targetMapKey] = &report{} + } + for _, entry := range entries { node := entry.Data nodeMapKey := node.String() @@ -100,13 +107,13 @@ func NewManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.Q } } - unprocessedNodes := make(map[string]*nodeState[K, N], len(mappings)) + unprocessedNodes := make(map[string]*NodeState[K, N], len(mappings)) for node, mapVals := range mappings { if len(mapVals) == 0 { continue } - unprocessedNodes[node] = &nodeState[K, N]{ + unprocessedNodes[node] = &NodeState[K, N]{ todo: make([]K, 0, len(mapVals)), done: make([]K, 0, len(mapVals)), inflight: map[string]K{}, @@ -122,9 +129,9 @@ func NewManyToMany[K kad.Key[K], N kad.NodeID[K], M coordt.Message](qid coordt.Q cfg: cfg, keyReports: keyReports, unprocessedNodes: unprocessedNodes, - inflightWithCapacity: map[string]*nodeState[K, N]{}, - inflightAtCapacity: map[string]*nodeState[K, N]{}, - processedNodes: map[string]*nodeState[K, N]{}, + inflightWithCapacity: map[string]*NodeState[K, N]{}, + inflightAtCapacity: map[string]*NodeState[K, N]{}, + processedNodes: map[string]*NodeState[K, N]{}, msgFunc: msgFunc, } } @@ -140,32 +147,18 @@ func (mtm *ManyToMany[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) switch ev := ev.(type) { case *EventBroadcastStop: case *EventBroadcastStoreRecordSuccess[K, N, M]: - mapKey := ev.NodeID.String() - if nstate, found := mtm.inflightAtCapacity[mapKey]; found { - delete(mtm.inflightAtCapacity, mapKey) - delete(nstate.inflight, key.HexString(ev.Target)) - nstate.done = append(nstate.done, ev.Target) - - if len(nstate.todo) == 0 { - if 
len(nstate.inflight) == 0 { - mtm.processedNodes[mapKey] = nstate - } else { - mtm.inflightAtCapacity[mapKey] = nstate - } - } else if len(nstate.inflight) != 0 { - mtm.inflightWithCapacity[mapKey] = nstate - } - } else if nstate, found := mtm.inflightWithCapacity[mapKey]; found { - delete(mtm.inflightWithCapacity, mapKey) - delete(nstate.inflight, key.HexString(ev.Target)) - nstate.done = append(nstate.done, ev.Target) + mtm.handleStoreRecordResult(ev.NodeID, ev.Target) - if len(nstate.todo) != 0 { - mtm.inflightWithCapacity[mapKey] = nstate - } - } + targetMapKey := key.HexString(ev.Target) + mtm.keyReports[targetMapKey].successes += 1 + mtm.keyReports[targetMapKey].lastSuccess = time.Now() case *EventBroadcastStoreRecordFailure[K, N, M]: + mtm.handleStoreRecordResult(ev.NodeID, ev.Target) + + targetMapKey := key.HexString(ev.Target) + mtm.keyReports[targetMapKey].failures += 1 + case *EventBroadcastPoll: // ignore, nothing to do default: @@ -236,3 +229,31 @@ func (mtm *ManyToMany[K, N, M]) Advance(ctx context.Context, ev BroadcastEvent) }{}, } } + +func (mtm *ManyToMany[K, N, M]) handleStoreRecordResult(node N, target K) { + nodeMapKey := node.String() + targetMapKey := key.HexString(target) + if nstate, found := mtm.inflightAtCapacity[nodeMapKey]; found { + delete(mtm.inflightAtCapacity, nodeMapKey) + delete(nstate.inflight, targetMapKey) + nstate.done = append(nstate.done, target) + + if len(nstate.todo) == 0 { + if len(nstate.inflight) == 0 { + mtm.processedNodes[nodeMapKey] = nstate + } else { + mtm.inflightAtCapacity[nodeMapKey] = nstate + } + } else if len(nstate.inflight) != 0 { + mtm.inflightWithCapacity[nodeMapKey] = nstate + } + } else if nstate, found := mtm.inflightWithCapacity[nodeMapKey]; found { + delete(mtm.inflightWithCapacity, nodeMapKey) + delete(nstate.inflight, targetMapKey) + nstate.done = append(nstate.done, target) + + if len(nstate.todo) != 0 { + mtm.inflightWithCapacity[nodeMapKey] = nstate + } + } +} diff --git a/internal/coord/brdcst/mtm_test.go b/internal/coord/brdcst/mtm_test.go index 5820f92..a908df5 100644 --- a/internal/coord/brdcst/mtm_test.go +++ b/internal/coord/brdcst/mtm_test.go @@ -5,6 +5,7 @@ import ( "math/big" "testing" + "github.com/plprobelab/go-libdht/kad/key" "github.com/stretchr/testify/require" "github.com/plprobelab/zikade/internal/coord/coordt" @@ -46,6 +47,8 @@ func TestNewManyToMany(t *testing.T) { require.Len(t, sm.inflightAtCapacity, 0) require.NotNil(t, sm.processedNodes) require.Len(t, sm.processedNodes, 0) + require.Len(t, sm.keyReports, 0) + require.NotNil(t, sm.keyReports) require.NotNil(t, sm.msgFunc) state := sm.Advance(ctx, &EventBroadcastPoll{}) @@ -68,6 +71,7 @@ func TestNewManyToMany(t *testing.T) { require.Len(t, sm.inflightWithCapacity, 0) require.Len(t, sm.inflightAtCapacity, 0) require.Len(t, sm.processedNodes, 0) + require.Len(t, sm.keyReports, 0) require.NotNil(t, sm.msgFunc) state := sm.Advance(ctx, &EventBroadcastPoll{}) @@ -97,6 +101,7 @@ func TestNewManyToMany(t *testing.T) { require.Len(t, sm.inflightWithCapacity, 0) require.Len(t, sm.inflightAtCapacity, 0) require.Len(t, sm.processedNodes, 0) + require.Len(t, sm.keyReports, 20) require.NotNil(t, sm.msgFunc) }) @@ -158,6 +163,10 @@ func TestManyToMany_Advance_single_target_single_seed(t *testing.T) { require.True(t, ok, "type is %T", state) require.Equal(t, fstate.QueryID, qid) require.Equal(t, seed[0], fstate.Contacted[0]) + + require.Equal(t, 1, sm.keyReports[key.HexString(targets[0])].successes) + require.Equal(t, 0, 
sm.keyReports[key.HexString(targets[0])].failures) + require.False(t, sm.keyReports[key.HexString(targets[0])].lastSuccess.IsZero()) } func TestManyToMany_Advance_multi_target_single_seed(t *testing.T) { diff --git a/internal/coord/coordinator.go b/internal/coord/coordinator.go index dad36d8..0f28a8e 100644 --- a/internal/coord/coordinator.go +++ b/internal/coord/coordinator.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "math" "reflect" "sync" "sync/atomic" @@ -413,11 +414,20 @@ func (c *Coordinator) BroadcastStatic(ctx context.Context, msg *pb.Message, seed return c.broadcast(ctx, msgFunc, seed, brdcst.DefaultConfigOneToMany(msg.Target())) } -func (c *Coordinator) BroadcastMany(ctx context.Context, keys []kadt.Key, seed []kadt.PeerID, msgFn func(k kadt.Key) *pb.Message) error { +func (c *Coordinator) BroadcastMany(ctx context.Context, keys []kadt.Key, msgFn func(k kadt.Key) *pb.Message) error { + // verify that we have keys to push into the network + if len(keys) == 0 { + return fmt.Errorf("no keys to broadcast") + } + + // grab the entire routing table contents + seed := c.rt.NearestNodes(keys[0], math.MaxInt) + + // start broadcasting return c.broadcast(ctx, msgFn, seed, brdcst.DefaultConfigManyToMany(keys)) } -func (c *Coordinator) broadcast(ctx context.Context, msgFunc func(k kadt.Key) *pb.Message, seeds []kadt.PeerID, cfg brdcst.Config) error { +func (c *Coordinator) broadcast(ctx context.Context, msgFunc func(k kadt.Key) *pb.Message, seed []kadt.PeerID, cfg brdcst.Config) error { ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.broadcast") defer span.End() @@ -430,7 +440,7 @@ func (c *Coordinator) broadcast(ctx context.Context, msgFunc func(k kadt.Key) *p cmd := &EventStartBroadcast{ QueryID: queryID, MsgFunc: msgFunc, - Seed: seeds, + Seed: seed, Notify: waiter, Config: cfg, } From 50a19a6dde01a95da4145fae36abbf8bfe288f73 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 17 Oct 2023 16:59:58 +0200 Subject: [PATCH 16/23] WIP --- internal/coord/coordinator.go | 14 -------------- internal/coord/query.go | 7 ++++++- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/internal/coord/coordinator.go b/internal/coord/coordinator.go index f975928..d7d310a 100644 --- a/internal/coord/coordinator.go +++ b/internal/coord/coordinator.go @@ -9,8 +9,6 @@ import ( "sync" "sync/atomic" - "github.com/plprobelab/zikade/internal/coord/query" - "github.com/benbjohnson/clock" "github.com/plprobelab/go-libdht/kad" "go.opentelemetry.io/otel" @@ -293,18 +291,6 @@ func (c *Coordinator) GetClosestNodes(ctx context.Context, k kadt.Key, n int) ([ return c.rt.NearestNodes(k, n), nil } -type QueryConfig struct { - NumResults int - Strategy query.QueryStrategy -} - -func DefaultQueryConfig() *QueryConfig { - return &QueryConfig{ - NumResults: 20, - Strategy: &query.QueryStrategyConverge{}, - } -} - // QueryClosest starts a query that attempts to find the closest nodes to the target key. // It returns the closest nodes found to the target key and statistics on the actions of the query. // diff --git a/internal/coord/query.go b/internal/coord/query.go index 9fe117a..b20b5e2 100644 --- a/internal/coord/query.go +++ b/internal/coord/query.go @@ -39,6 +39,10 @@ type QueryConfig struct { // RequestTimeout is the timeout queries should use for contacting a single node RequestTimeout time.Duration + + NumResults int + + Strategy query.QueryStrategy } // Validate checks the configuration options and returns an error if any have invalid values. 
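(Editor's aside, illustrative only.) With NumResults and Strategy folded into this QueryConfig, a caller that needs non-default behaviour would build on the defaults extended in the following hunk roughly like this; the field values are examples, not recommendations:

	cfg := DefaultQueryConfig()
	cfg.NumResults = 30                           // wait for more than the default 20 contacted nodes
	cfg.Strategy = &query.QueryStrategyConverge{} // the default strategy, spelled out explicitly
	if err := cfg.Validate(); err != nil {
		// reject the configuration before starting the query
	}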
@@ -103,7 +107,8 @@ func DefaultQueryConfig() *QueryConfig { Timeout: 5 * time.Minute, // MAGIC RequestConcurrency: 3, // MAGIC RequestTimeout: time.Minute, // MAGIC - + NumResults: 20, // MAGIC + Strategy: &query.QueryStrategyConverge{}, } } From ea58ea15c87831a147a59a9691f7063841e46c09 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 17 Oct 2023 17:10:48 +0200 Subject: [PATCH 17/23] WIP --- fullrt.go | 86 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/fullrt.go b/fullrt.go index 04afcaf..c029b14 100644 --- a/fullrt.go +++ b/fullrt.go @@ -307,8 +307,7 @@ func (f *FullRT) PutValue(ctx context.Context, keyStr string, value []byte, opts return nil } - // construct Kademlia-key. Yes, we hash the complete key string which - // includes the namespace prefix. + // construct message that we will send to other peer msg := &pb.Message{ Type: pb.Message_PUT_VALUE, Key: []byte(keyStr), @@ -569,38 +568,55 @@ func (f *FullRT) ProvideMany(ctx context.Context, mhashes []mh.Multihash) error keys = append(keys, kadt.NewKey(mhash)) } - // TODO: get seed set of peers - return f.kad.BroadcastMany(ctx, keys, nil, msgFn) + return f.kad.BroadcastMany(ctx, keys, msgFn) } -//func (f *FullRT) PutMany(ctx context.Context, keys []string, values [][]byte) error { -// _, span := f.tele.Tracer.Start(ctx, "FullRT.PutMany") -// defer span.End() -// -// -// if !dht.enableValues { -// return routing.ErrNotSupported -// } -// -// if len(keys) != len(values) { -// return fmt.Errorf("number of keys does not match the number of values") -// } -// -// keysAsPeerIDs := make([]peer.ID, 0, len(keys)) -// keyRecMap := make(map[string][]byte) -// for i, k := range keys { -// keysAsPeerIDs = append(keysAsPeerIDs, peer.ID(k)) -// keyRecMap[k] = values[i] -// } -// -// if len(keys) != len(keyRecMap) { -// return fmt.Errorf("does not support duplicate keys") -// } -// -// fn := func(ctx context.Context, p, k peer.ID) error { -// keyStr := string(k) -// return dht.protoMessenger.PutValue(ctx, p, record.MakePutRecord(keyStr, keyRecMap[keyStr])) -// } -// -// return dht.bulkMessageSend(ctx, keysAsPeerIDs, fn, false) -//} +func (f *FullRT) PutMany(ctx context.Context, keySlice []string, valueSlice [][]byte) error { + _, span := f.tele.Tracer.Start(ctx, "FullRT.PutMany") + defer span.End() + + if len(keySlice) == 0 { + return fmt.Errorf("no keys") + } + + ns, _, err := record.SplitKey(keySlice[0]) + if err != nil { + return fmt.Errorf("splitting key: %w", err) + } + + _, found := f.backends[ns] + if !found { + return routing.ErrNotSupported + } + + if len(keySlice) != len(valueSlice) { + return fmt.Errorf("number of keys does not match the number of values") + } + + kadKeys := make([]kadt.Key, 0, len(keySlice)) + valueMap := make(map[string][]byte, len(valueSlice)) + for i, preimage := range keySlice { + valueMap[preimage] = valueSlice[i] + kadKeys = append(kadKeys, kadt.NewKey([]byte(preimage))) + } + + // Compute addresses once for all provides + self := peer.AddrInfo{ + ID: f.host.ID(), + Addrs: f.host.Addrs(), + } + if len(self.Addrs) < 1 { + return fmt.Errorf("no known addresses for self, cannot put provider") + } + + msgFn := func(k kadt.Key) *pb.Message { + strKey := string(k.MsgKey()) + return &pb.Message{ + Type: pb.Message_PUT_VALUE, + Key: k.MsgKey(), + Record: record.MakePutRecord(strKey, valueMap[strKey]), + } + } + + return f.kad.BroadcastMany(ctx, kadKeys, msgFn) +} From 6583292147710827be66620ffc66f734b7613df9 Mon Sep 17 00:00:00 2001 From: 
Dennis Trautwein Date: Tue, 17 Oct 2023 17:36:40 +0200 Subject: [PATCH 18/23] fix test --- internal/coord/routing_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/coord/routing_test.go b/internal/coord/routing_test.go index 89ea2ff..5b236cc 100644 --- a/internal/coord/routing_test.go +++ b/internal/coord/routing_test.go @@ -635,10 +635,11 @@ func TestRoutingStartCrawlSendsEvent(t *testing.T) { } routingBehaviour.Notify(ctx, ev) + routingBehaviour.Perform(ctx) // the event that should be passed to the bootstrap state machine expected := &routing.EventCrawlStart[kadt.Key, kadt.PeerID]{ Seed: ev.Seed, } - require.Equal(t, expected, crawl.Received) + require.Equal(t, expected, crawl.first()) } From 1877f061569329fc77cbaa5bffaa515a74d1355b Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 17 Oct 2023 17:52:04 +0200 Subject: [PATCH 19/23] WIP --- internal/coord/brdcst/brdcst.go | 10 +-- internal/coord/brdcst/config.go | 97 -------------------------- internal/coord/brdcst/config_test.go | 40 ----------- internal/coord/brdcst/followup.go | 19 +++++ internal/coord/brdcst/followup_test.go | 15 ++++ internal/coord/brdcst/mtm.go | 27 +++++++ internal/coord/brdcst/otm.go | 20 ++++++ internal/coord/brdcst/pool.go | 23 ++++++ internal/coord/brdcst/pool_test.go | 24 +++++++ 9 files changed, 133 insertions(+), 142 deletions(-) delete mode 100644 internal/coord/brdcst/config_test.go create mode 100644 internal/coord/brdcst/followup_test.go diff --git a/internal/coord/brdcst/brdcst.go b/internal/coord/brdcst/brdcst.go index bcfc80b..c10f15b 100644 --- a/internal/coord/brdcst/brdcst.go +++ b/internal/coord/brdcst/brdcst.go @@ -30,8 +30,8 @@ type StateBroadcastFindCloser[K kad.Key[K], N kad.NodeID[K]] struct { type StateBroadcastStoreRecord[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the broadcast operation that wants to send the message NodeID N // the node to send the message to - Target K - Message M // the message the broadcast behaviour wants to send + Target K // the key that we want to store the record for + Message M // the message the broadcast behaviour wants to send } // StateBroadcastWaiting indicates that a [Broadcast] state machine is waiting @@ -113,7 +113,7 @@ type EventBroadcastNodeFailure[K kad.Key[K], N kad.NodeID[K]] struct { // receive a response. type EventBroadcastStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { NodeID N // the node the message was sent to - Target K + Target K // the key that we successfully stored the record for Request M // the message that was sent to the remote node Response M // the reply we got from the remote node (nil in many cases of the Amino DHT) } @@ -122,8 +122,8 @@ type EventBroadcastStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Me // machine that storing a record with a remote node (NodeID) has failed. The // message that was sent is held in Request, and the error will be in Error. 
type EventBroadcastStoreRecordFailure[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { - NodeID N // the node the message was sent to - Target K + NodeID N // the node the message was sent to + Target K // the key that we failed to store the record for Request M // the message that was sent to the remote node Error error // the error that caused the failure, if any } diff --git a/internal/coord/brdcst/config.go b/internal/coord/brdcst/config.go index fd709c9..9d4e132 100644 --- a/internal/coord/brdcst/config.go +++ b/internal/coord/brdcst/config.go @@ -1,36 +1,5 @@ package brdcst -import ( - "fmt" - - "github.com/plprobelab/go-libdht/kad" - - "github.com/plprobelab/zikade/internal/coord/query" -) - -// ConfigPool specifies the configuration for a broadcast [Pool]. -type ConfigPool struct { - pCfg *query.PoolConfig -} - -// Validate checks the configuration options and returns an error if any have -// invalid values. -func (cfg *ConfigPool) Validate() error { - if cfg.pCfg == nil { - return fmt.Errorf("query pool config must not be nil") - } - - return nil -} - -// DefaultConfigPool returns the default configuration options for a Pool. -// Options may be overridden before passing to NewPool -func DefaultConfigPool() *ConfigPool { - return &ConfigPool{ - pCfg: query.DefaultPoolConfig(), - } -} - // Config is an interface that all broadcast configurations must implement. // Because we have multiple ways of broadcasting records to the network, like // [FollowUp] or [OneToMany], the [EventPoolStartBroadcast] has a configuration @@ -44,69 +13,3 @@ type Config interface { func (c *ConfigFollowUp[K]) broadcastConfig() {} func (c *ConfigOneToMany[K]) broadcastConfig() {} func (c *ConfigManyToMany[K]) broadcastConfig() {} - -// ConfigFollowUp specifies the configuration for the [FollowUp] state machine. -type ConfigFollowUp[K kad.Key[K]] struct { - Target K -} - -// Validate checks the configuration options and returns an error if any have -// invalid values. -func (c *ConfigFollowUp[K]) Validate() error { - return nil -} - -// DefaultConfigFollowUp returns the default configuration options for the -// [FollowUp] state machine. -func DefaultConfigFollowUp[K kad.Key[K]](target K) *ConfigFollowUp[K] { - return &ConfigFollowUp[K]{ - Target: target, - } -} - -// ConfigOneToMany specifies the configuration for the [OneToMany] state -// machine. -type ConfigOneToMany[K kad.Key[K]] struct { - Target K -} - -// Validate checks the configuration options and returns an error if any have -// invalid values. -func (c *ConfigOneToMany[K]) Validate() error { - return nil -} - -// DefaultConfigOneToMany returns the default configuration options for the -// [OneToMany] state machine. -func DefaultConfigOneToMany[K kad.Key[K]](target K) *ConfigOneToMany[K] { - return &ConfigOneToMany[K]{ - Target: target, - } -} - -// ConfigManyToMany specifies the configuration for the [ManyToMany] state -// machine. -type ConfigManyToMany[K kad.Key[K]] struct { - NodeConcurrency int - StreamConcurrency int - Targets []K -} - -// Validate checks the configuration options and returns an error if any have -// invalid values. -func (c *ConfigManyToMany[K]) Validate() error { - if len(c.Targets) == 0 { - return fmt.Errorf("targets must not be empty") - } - return nil -} - -// DefaultConfigManyToMany returns the default configuration options for the -// [ManyToMany] state machine. 
-func DefaultConfigManyToMany[K kad.Key[K]](targets []K) *ConfigManyToMany[K] { - return &ConfigManyToMany[K]{ - NodeConcurrency: 100, // MAGIC - StreamConcurrency: 10, // MAGIC - Targets: targets, - } -} diff --git a/internal/coord/brdcst/config_test.go b/internal/coord/brdcst/config_test.go deleted file mode 100644 index 16654fe..0000000 --- a/internal/coord/brdcst/config_test.go +++ /dev/null @@ -1,40 +0,0 @@ -package brdcst - -import ( - "testing" - - "github.com/plprobelab/zikade/internal/tiny" - - "github.com/stretchr/testify/assert" -) - -func TestConfigPool_Validate(t *testing.T) { - t.Run("default is valid", func(t *testing.T) { - cfg := DefaultConfigPool() - assert.NoError(t, cfg.Validate()) - }) - - t.Run("nil pool config", func(t *testing.T) { - cfg := DefaultConfigPool() - cfg.pCfg = nil - assert.Error(t, cfg.Validate()) - }) -} - -func TestConfigFollowUp_Validate(t *testing.T) { - t.Run("default is valid", func(t *testing.T) { - cfg := DefaultConfigFollowUp[tiny.Key](tiny.Key(0)) - assert.NoError(t, cfg.Validate()) - }) -} - -func TestConfig_interface_conformance(t *testing.T) { - configs := []Config{ - &ConfigFollowUp[tiny.Key]{}, - &ConfigOneToMany[tiny.Key]{}, - &ConfigManyToMany[tiny.Key]{}, - } - for _, c := range configs { - c.broadcastConfig() // drives test coverage - } -} diff --git a/internal/coord/brdcst/followup.go b/internal/coord/brdcst/followup.go index 56df355..e8a3aa6 100644 --- a/internal/coord/brdcst/followup.go +++ b/internal/coord/brdcst/followup.go @@ -12,6 +12,25 @@ import ( "github.com/plprobelab/zikade/tele" ) +// ConfigFollowUp specifies the configuration for the [FollowUp] state machine. +type ConfigFollowUp[K kad.Key[K]] struct { + Target K +} + +// Validate checks the configuration options and returns an error if any have +// invalid values. +func (c *ConfigFollowUp[K]) Validate() error { + return nil +} + +// DefaultConfigFollowUp returns the default configuration options for the +// [FollowUp] state machine. +func DefaultConfigFollowUp[K kad.Key[K]](target K) *ConfigFollowUp[K] { + return &ConfigFollowUp[K]{ + Target: target, + } +} + // FollowUp is a [Broadcast] state machine and encapsulates the logic around // doing a "classic" put operation. This mimics the algorithm employed in the // original go-libp2p-kad-dht v1 code base. It first queries the closest nodes diff --git a/internal/coord/brdcst/followup_test.go b/internal/coord/brdcst/followup_test.go new file mode 100644 index 0000000..62784de --- /dev/null +++ b/internal/coord/brdcst/followup_test.go @@ -0,0 +1,15 @@ +package brdcst + +import ( + "testing" + + "github.com/plprobelab/zikade/internal/tiny" + "github.com/stretchr/testify/require" +) + +func TestConfigFollowUp_Validate(t *testing.T) { + t.Run("default is valid", func(t *testing.T) { + cfg := DefaultConfigFollowUp[tiny.Key](tiny.Key(0)) + require.NoError(t, cfg.Validate()) + }) +} diff --git a/internal/coord/brdcst/mtm.go b/internal/coord/brdcst/mtm.go index 89643c2..f8799ea 100644 --- a/internal/coord/brdcst/mtm.go +++ b/internal/coord/brdcst/mtm.go @@ -14,6 +14,33 @@ import ( "github.com/plprobelab/zikade/tele" ) +// ConfigManyToMany specifies the configuration for the [ManyToMany] state +// machine. +type ConfigManyToMany[K kad.Key[K]] struct { + NodeConcurrency int + StreamConcurrency int + Targets []K +} + +// Validate checks the configuration options and returns an error if any have +// invalid values. 
+func (c *ConfigManyToMany[K]) Validate() error { + if len(c.Targets) == 0 { + return fmt.Errorf("targets must not be empty") + } + return nil +} + +// DefaultConfigManyToMany returns the default configuration options for the +// [ManyToMany] state machine. +func DefaultConfigManyToMany[K kad.Key[K]](targets []K) *ConfigManyToMany[K] { + return &ConfigManyToMany[K]{ + NodeConcurrency: 100, // MAGIC + StreamConcurrency: 10, // MAGIC + Targets: targets, + } +} + // ManyToMany is a [Broadcast] state machine and encapsulates the logic around // doing a put operation to a static set of nodes. That static set of nodes // is given by the list of seed nodes in the [EventBroadcastStart] event. diff --git a/internal/coord/brdcst/otm.go b/internal/coord/brdcst/otm.go index 4f1a4e7..3cade77 100644 --- a/internal/coord/brdcst/otm.go +++ b/internal/coord/brdcst/otm.go @@ -12,6 +12,26 @@ import ( "github.com/plprobelab/zikade/tele" ) +// ConfigOneToMany specifies the configuration for the [OneToMany] state +// machine. +type ConfigOneToMany[K kad.Key[K]] struct { + Target K +} + +// Validate checks the configuration options and returns an error if any have +// invalid values. +func (c *ConfigOneToMany[K]) Validate() error { + return nil +} + +// DefaultConfigOneToMany returns the default configuration options for the +// [OneToMany] state machine. +func DefaultConfigOneToMany[K kad.Key[K]](target K) *ConfigOneToMany[K] { + return &ConfigOneToMany[K]{ + Target: target, + } +} + // OneToMany is a [Broadcast] state machine and encapsulates the logic around // doing a ONE put operation to MANY preconfigured nodes. That static set of // nodes is given by the list of seed nodes in the [EventBroadcastStart] event. diff --git a/internal/coord/brdcst/pool.go b/internal/coord/brdcst/pool.go index af78734..ae3497c 100644 --- a/internal/coord/brdcst/pool.go +++ b/internal/coord/brdcst/pool.go @@ -17,6 +17,29 @@ import ( // are the [FollowUp] and [Static] state machines. type Broadcast = coordt.StateMachine[BroadcastEvent, BroadcastState] +// ConfigPool specifies the configuration for a broadcast [Pool]. +type ConfigPool struct { + pCfg *query.PoolConfig +} + +// Validate checks the configuration options and returns an error if any have +// invalid values. +func (cfg *ConfigPool) Validate() error { + if cfg.pCfg == nil { + return fmt.Errorf("query pool config must not be nil") + } + + return nil +} + +// DefaultConfigPool returns the default configuration options for a Pool. +// Options may be overridden before passing to NewPool +func DefaultConfigPool() *ConfigPool { + return &ConfigPool{ + pCfg: query.DefaultPoolConfig(), + } +} + // Pool is a [coordt.StateMachine] that manages all running broadcast // operations. In the future it could limit the number of concurrent operations, // but right now it is just keeping track of all running broadcasts. 
The diff --git a/internal/coord/brdcst/pool_test.go b/internal/coord/brdcst/pool_test.go index 0ac0bcd..abf3934 100644 --- a/internal/coord/brdcst/pool_test.go +++ b/internal/coord/brdcst/pool_test.go @@ -15,6 +15,30 @@ import ( // Assert that Pool implements the common state machine interface var _ coordt.StateMachine[PoolEvent, PoolState] = (*Pool[tiny.Key, tiny.Node, tiny.Message])(nil) +func TestConfigPool_Validate(t *testing.T) { + t.Run("default is valid", func(t *testing.T) { + cfg := DefaultConfigPool() + require.NoError(t, cfg.Validate()) + }) + + t.Run("nil pool config", func(t *testing.T) { + cfg := DefaultConfigPool() + cfg.pCfg = nil + require.Error(t, cfg.Validate()) + }) +} + +func TestConfig_interface_conformance(t *testing.T) { + configs := []Config{ + &ConfigFollowUp[tiny.Key]{}, + &ConfigOneToMany[tiny.Key]{}, + &ConfigManyToMany[tiny.Key]{}, + } + for _, c := range configs { + c.broadcastConfig() // drives test coverage + } +} + func TestPoolStopWhenNoQueries(t *testing.T) { ctx := context.Background() cfg := DefaultConfigPool() From 8173c35570016aac5c118d0552cdb5f03d6a2254 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 17 Oct 2023 18:01:54 +0200 Subject: [PATCH 20/23] WIP --- fullrt.go | 2 +- internal/coord/event.go | 8 ++++---- internal/coord/query.go | 10 ++++++++-- internal/coord/query/pool.go | 12 ++++++------ internal/coord/query/query.go | 2 -- internal/coord/query/strategy.go | 18 ++++++++++++------ 6 files changed, 31 insertions(+), 21 deletions(-) diff --git a/fullrt.go b/fullrt.go index c029b14..60805bc 100644 --- a/fullrt.go +++ b/fullrt.go @@ -343,7 +343,7 @@ func (f *FullRT) Bootstrap(ctx context.Context) error { func (f *FullRT) queryConfig() *coord.QueryConfig { cfg := coord.DefaultQueryConfig() cfg.NumResults = f.cfg.BucketSize - cfg.Strategy = &query.QueryStrategyStatic{} + cfg.Strategy = &query.StrategyStatic{} return cfg } diff --git a/internal/coord/event.go b/internal/coord/event.go index 7057cfe..cecfa35 100644 --- a/internal/coord/event.go +++ b/internal/coord/event.go @@ -87,8 +87,8 @@ type EventStartMessageQuery struct { Message *pb.Message Seed []kadt.PeerID Notify QueryMonitor[*EventQueryFinished] - NumResults int // the minimum number of nodes to successfully contact before considering iteration complete - Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. + NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy query.Strategy // the way the query should be performed - [query.StrategyConverge] will be used by default. } func (*EventStartMessageQuery) behaviourEvent() {} @@ -99,8 +99,8 @@ type EventStartFindCloserQuery struct { Target kadt.Key Seed []kadt.PeerID Notify QueryMonitor[*EventQueryFinished] - NumResults int // the minimum number of nodes to successfully contact before considering iteration complete - Strategy query.QueryStrategy // the way the query should be performed - [query.QueryStrategyConverge] will be used by default. + NumResults int // the minimum number of nodes to successfully contact before considering iteration complete + Strategy query.Strategy // the way the query should be performed - [query.StrategyConverge] will be used by default. 
} func (*EventStartFindCloserQuery) behaviourEvent() {} diff --git a/internal/coord/query.go b/internal/coord/query.go index b20b5e2..4b60429 100644 --- a/internal/coord/query.go +++ b/internal/coord/query.go @@ -40,9 +40,15 @@ type QueryConfig struct { // RequestTimeout is the timeout queries should use for contacting a single node RequestTimeout time.Duration + // NumResults specifies the number of results (nodes) we are searching for NumResults int - Strategy query.QueryStrategy + // Strategy specifies the query strategy that should be used. By default, + // we are using the [query.StrategyConverge] which searches for ever + // closer nodes to a certain key and hence converging in the key space. + // Alternatively, there's also [query.StrategyStatic] which just + // contacts a static list of preconfigured peers. + Strategy query.Strategy } // Validate checks the configuration options and returns an error if any have invalid values. @@ -108,7 +114,7 @@ func DefaultQueryConfig() *QueryConfig { RequestConcurrency: 3, // MAGIC RequestTimeout: time.Minute, // MAGIC NumResults: 20, // MAGIC - Strategy: &query.QueryStrategyConverge{}, + Strategy: &query.StrategyConverge{}, } } diff --git a/internal/coord/query/pool.go b/internal/coord/query/pool.go index 1821c8a..ee5c0e5 100644 --- a/internal/coord/query/pool.go +++ b/internal/coord/query/pool.go @@ -266,9 +266,9 @@ func (p *Pool[K, N, M]) addQuery(ctx context.Context, evt *EventPoolAddQuery[K, var iter NodeIter[K, N] switch evt.Strategy.(type) { - case *QueryStrategyConverge: + case *StrategyConverge: iter = NewClosestNodesIter[K, N](evt.Target) - case *QueryStrategyStatic: + case *StrategyStatic: iter = NewStaticIter[K, N](evt.Seed) default: iter = NewClosestNodesIter[K, N](evt.Target) // default if unset @@ -302,9 +302,9 @@ func (p *Pool[K, N, M]) addFindCloserQuery(ctx context.Context, evt *EventPoolAd var iter NodeIter[K, N] switch evt.Strategy.(type) { - case *QueryStrategyConverge: + case *StrategyConverge: iter = NewClosestNodesIter[K, N](evt.Target) - case *QueryStrategyStatic: + case *StrategyStatic: iter = NewStaticIter[K, N](evt.Seed) default: iter = NewClosestNodesIter[K, N](evt.Target) // default if unset @@ -397,7 +397,7 @@ type EventPoolAddFindCloserQuery[K kad.Key[K], N kad.NodeID[K]] struct { Target K // the target key for the query Seed []N // an initial set of close nodes the query should use NumResults int // the minimum number of nodes to successfully contact before considering iteration complete - Strategy QueryStrategy // the way the query should be performed - [QueryStrategyConverge] will be used by default. + Strategy Strategy // the way the query should be performed - [StrategyConverge] will be used by default. } // EventPoolAddQuery is an event that attempts to add a new query that sends a message. @@ -407,7 +407,7 @@ type EventPoolAddQuery[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { Message M // message to be sent to each node Seed []N // an initial set of close nodes the query should use NumResults int // the minimum number of nodes to successfully contact before considering iteration complete - Strategy QueryStrategy // the way the query should be performed - [QueryStrategyConverge] will be used by default. + Strategy Strategy // the way the query should be performed - [StrategyConverge] will be used by default. } // EventPoolStopQuery notifies a [Pool] to stop a query. 
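The pool hunks above pick a node iterator by switching on the concrete strategy type, and the strategies themselves carry no data; they only implement an unexported marker method. The sketch below mirrors that pattern with hypothetical iterator stand-ins in place of the query package's closest-nodes and static iterators.

package main

import "fmt"

// Strategy mirrors the marker-interface pattern from the patch: a strategy is
// any type that implements the unexported marker method.
type Strategy interface{ queryStrategy() }

// StrategyConverge searches for ever closer nodes to a key; StrategyStatic
// works through a fixed, preconfigured list of nodes.
type StrategyConverge struct{}
type StrategyStatic struct{}

func (q *StrategyConverge) queryStrategy() {}
func (q *StrategyStatic) queryStrategy()   {}

// nodeIter is a stand-in for the query package's NodeIter; the names below
// are hypothetical and only illustrate which iterator each strategy maps to.
type nodeIter interface{ name() string }

type closestNodesIter struct{}

func (closestNodesIter) name() string { return "closest-nodes iterator" }

type staticIter struct{}

func (staticIter) name() string { return "static iterator" }

// iterForStrategy mirrors the type switch in the pool: the converge strategy
// (and the unset default) gets a distance-ordered iterator, while the static
// strategy simply walks the seed list.
func iterForStrategy(s Strategy) nodeIter {
	switch s.(type) {
	case *StrategyStatic:
		return staticIter{}
	case *StrategyConverge:
		return closestNodesIter{}
	default:
		return closestNodesIter{}
	}
}

func main() {
	fmt.Println(iterForStrategy(&StrategyStatic{}).name())   // static iterator
	fmt.Println(iterForStrategy(&StrategyConverge{}).name()) // closest-nodes iterator
	fmt.Println(iterForStrategy(nil).name())                 // closest-nodes iterator
}

Because the marker method is unexported in the real code, only types declared inside the query package can satisfy the Strategy interface, which keeps the set of accepted strategies closed.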
diff --git a/internal/coord/query/query.go b/internal/coord/query/query.go index d2d99e9..f82803c 100644 --- a/internal/coord/query/query.go +++ b/internal/coord/query/query.go @@ -141,8 +141,6 @@ func (q *Query[K, N, M]) Advance(ctx context.Context, ev QueryEvent) (out QueryS span.End() }() - span.SpanContext() - if q.finished { return &StateQueryFinished[K, N]{ QueryID: q.id, diff --git a/internal/coord/query/strategy.go b/internal/coord/query/strategy.go index d3b1312..4af9ae4 100644 --- a/internal/coord/query/strategy.go +++ b/internal/coord/query/strategy.go @@ -1,13 +1,19 @@ package query -type QueryStrategy interface { +// Strategy is an interface that all query strategies need to implement. +// This ensures that only valid and supported strategies can be passed into +// the query behaviour/state machines. +type Strategy interface { queryStrategy() } -type QueryStrategyConverge struct{} +func (q *StrategyConverge) queryStrategy() {} +func (q *StrategyStatic) queryStrategy() {} -func (q *QueryStrategyConverge) queryStrategy() {} +// StrategyConverge is used by default. In this case we are searching for ever +// closer nodes to a certain key and hence converging in the key space. +type StrategyConverge struct{} -type QueryStrategyStatic struct{} - -func (q *QueryStrategyStatic) queryStrategy() {} +// StrategyStatic is the alternative query strategy in which we just contact +// a static list of preconfigured nodes. +type StrategyStatic struct{} From e6118a3f1564048e976cbdf8f4feb5dab6026fd2 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 18 Oct 2023 11:02:24 +0200 Subject: [PATCH 21/23] WIP --- fullrt.go | 51 +++++++++++++++++------------- go.mod | 6 ++++ go.sum | 20 ++++++++++++ internal/coord/behaviour.go | 6 ++-- internal/coord/brdcst/followup.go | 7 ++-- internal/coord/brdcst/pool.go | 16 +++++----- internal/coord/coordinator.go | 24 +++++++++----- internal/coord/coordt/coretypes.go | 7 +++- internal/coord/event.go | 34 +++++++++++++++----- internal/coord/query.go | 2 +- internal/coord/routing.go | 1 - 11 files changed, 120 insertions(+), 54 deletions(-) diff --git a/fullrt.go b/fullrt.go index 60805bc..493118d 100644 --- a/fullrt.go +++ b/fullrt.go @@ -7,6 +7,7 @@ import ( "fmt" "time" + "github.com/ipfs/boxo/provider" "github.com/ipfs/go-cid" ds "github.com/ipfs/go-datastore" record "github.com/libp2p/go-libp2p-record" @@ -31,20 +32,23 @@ import ( type FullRT struct { *DHT - cfg *FullRTConfig + cfg *FullRTConfig + queryConfig *coord.QueryConfig } type FullRTConfig struct { *Config - CrawlInterval time.Duration - QuorumFrac float64 + CrawlInterval time.Duration + QuorumFrac float64 + FindPeerConnectTimeout time.Duration } func DefaultFullRTConfig() *FullRTConfig { return &FullRTConfig{ - Config: DefaultConfig(), - CrawlInterval: time.Hour, // MAGIC - QuorumFrac: 0.25, // MAGIC + Config: DefaultConfig(), + CrawlInterval: time.Hour, // MAGIC + QuorumFrac: 0.25, // MAGIC + FindPeerConnectTimeout: 5 * time.Second, } } @@ -58,15 +62,20 @@ func NewFullRT(h host.Host, cfg *FullRTConfig) (*FullRT, error) { cfg.Query.DefaultQuorum = int(float64(cfg.BucketSize) * cfg.QuorumFrac) } + qcfg := coord.DefaultQueryConfig() + qcfg.NumResults = cfg.BucketSize + qcfg.Strategy = &query.StrategyStatic{} + frt := &FullRT{ - DHT: d, - cfg: cfg, + DHT: d, + cfg: cfg, + queryConfig: qcfg, } return frt, nil } -var _ routing.Routing = (*FullRT)(nil) +var _ provider.ProvideMany = (*FullRT)(nil) func (f *FullRT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, error) { ctx, span := 
f.tele.Tracer.Start(ctx, "FullRT.FindPeer") @@ -102,7 +111,7 @@ func (f *FullRT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, erro } // start the query with a static set of peers (see queryConfig) - _, _, err = f.kad.QueryClosest(ctx, kadt.PeerID(pid).Key(), fn, f.queryConfig()) + _, _, err = f.kad.QueryClosest(ctx, kadt.PeerID(pid).Key(), fn, f.queryConfig) if err != nil { return peer.AddrInfo{}, fmt.Errorf("failed to run query: %w", err) } @@ -119,7 +128,7 @@ func (f *FullRT) FindPeer(ctx context.Context, pid peer.ID) (peer.AddrInfo, erro } // connect to peer (this also happens in the non-fullrt case) - connCtx, cancel := context.WithTimeout(ctx, 5*time.Second) // TODO: put timeout in config + connCtx, cancel := context.WithTimeout(ctx, f.cfg.FindPeerConnectTimeout) defer cancel() _ = f.host.Connect(connCtx, peer.AddrInfo{ ID: pid, @@ -274,7 +283,7 @@ func (f *FullRT) findProvidersAsyncRoutine(ctx context.Context, c cid.Cid, count return nil } - _, _, err = f.kad.QueryMessage(ctx, msg, fn, f.queryConfig()) + _, _, err = f.kad.QueryMessage(ctx, msg, fn, f.queryConfig) if err != nil { span.RecordError(err) f.log.Warn("Failed querying", slog.String("cid", c.String()), slog.String("err", err.Error())) @@ -340,13 +349,6 @@ func (f *FullRT) Bootstrap(ctx context.Context) error { return f.kad.Crawl(ctx, seed) } -func (f *FullRT) queryConfig() *coord.QueryConfig { - cfg := coord.DefaultQueryConfig() - cfg.NumResults = f.cfg.BucketSize - cfg.Strategy = &query.StrategyStatic{} - return cfg -} - func (f *FullRT) GetValue(ctx context.Context, key string, opts ...routing.Option) ([]byte, error) { ctx, span := f.tele.Tracer.Start(ctx, "FullRT.GetValue") defer span.End() @@ -510,7 +512,7 @@ func (f *FullRT) searchValueRoutine(ctx context.Context, backend Backend, ns str return nil } - _, _, err := f.kad.QueryMessage(ctx, req, fn, f.queryConfig()) + _, _, err := f.kad.QueryMessage(ctx, req, fn, f.queryConfig) if err != nil { f.warnErr(err, "Search value query failed") return @@ -568,7 +570,12 @@ func (f *FullRT) ProvideMany(ctx context.Context, mhashes []mh.Multihash) error keys = append(keys, kadt.NewKey(mhash)) } - return f.kad.BroadcastMany(ctx, keys, msgFn) + // track successes + fn := func(ctx context.Context, id kadt.PeerID, resp *pb.Message) { + // TODO + } + + return f.kad.BroadcastMany(ctx, keys, fn, msgFn) } func (f *FullRT) PutMany(ctx context.Context, keySlice []string, valueSlice [][]byte) error { @@ -618,5 +625,5 @@ func (f *FullRT) PutMany(ctx context.Context, keySlice []string, valueSlice [][] } } - return f.kad.BroadcastMany(ctx, kadKeys, msgFn) + return f.kad.BroadcastMany(ctx, kadKeys, nil, msgFn) } diff --git a/go.mod b/go.mod index b909bbf..2550395 100644 --- a/go.mod +++ b/go.mod @@ -59,7 +59,13 @@ require ( github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect github.com/huin/goupnp v1.2.0 // indirect + github.com/ipfs/bbloom v0.0.4 // indirect + github.com/ipfs/go-block-format v0.1.2 // indirect + github.com/ipfs/go-cidutil v0.1.0 // indirect + github.com/ipfs/go-ipfs-util v0.0.2 // indirect + github.com/ipfs/go-ipld-format v0.5.0 // indirect github.com/ipfs/go-log v1.0.5 // indirect + github.com/ipfs/go-metrics-interface v0.0.1 // indirect github.com/ipld/go-ipld-prime v0.21.0 // indirect github.com/jackpal/go-nat-pmp v1.0.2 // indirect github.com/jbenet/go-temp-err-catcher v0.1.0 // indirect diff --git a/go.sum b/go.sum index 0a21e7b..25b6a2e 100644 --- a/go.sum +++ b/go.sum @@ -33,6 +33,7 @@ 
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8 github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/cskr/pubsub v1.0.2 h1:vlOzMhl6PFn60gRlTQQsIfVwaPB/B/8MziK8FhEPt/0= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -122,10 +123,16 @@ github.com/hashicorp/golang-lru/v2 v2.0.5/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyf github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/huin/goupnp v1.2.0 h1:uOKW26NG1hsSSbXIZ1IR7XP9Gjd1U8pnLaCMgntmkmY= github.com/huin/goupnp v1.2.0/go.mod h1:gnGPsThkYa7bFi/KWmEysQRf48l2dvR5bxr2OFckNX8= +github.com/ipfs/bbloom v0.0.4 h1:Gi+8EGJ2y5qiD5FbsbpX/TMNcJw8gSqr7eyjHa4Fhvs= +github.com/ipfs/bbloom v0.0.4/go.mod h1:cS9YprKXpoZ9lT0n/Mw/a6/aFV6DTjTLYHeA+gyqMG0= github.com/ipfs/boxo v0.12.0 h1:AXHg/1ONZdRQHQLgG5JHsSC3XoE4DjCAMgK+asZvUcQ= github.com/ipfs/boxo v0.12.0/go.mod h1:xAnfiU6PtxWCnRqu7dcXQ10bB5/kvI1kXRotuGqGBhg= +github.com/ipfs/go-block-format v0.1.2 h1:GAjkfhVx1f4YTODS6Esrj1wt2HhrtwTnhEr+DyPUaJo= +github.com/ipfs/go-block-format v0.1.2/go.mod h1:mACVcrxarQKstUU3Yf/RdwbC4DzPV6++rO2a3d+a/KE= github.com/ipfs/go-cid v0.4.1 h1:A/T3qGvxi4kpKWWcPC/PgbvDA2bjVLO7n4UeVwnbs/s= github.com/ipfs/go-cid v0.4.1/go.mod h1:uQHwDeX4c6CtyrFwdqyhpNcxVewur1M7l7fNU7LKwZk= +github.com/ipfs/go-cidutil v0.1.0 h1:RW5hO7Vcf16dplUU60Hs0AKDkQAVPVplr7lk97CFL+Q= +github.com/ipfs/go-cidutil v0.1.0/go.mod h1:e7OEVBMIv9JaOxt9zaGEmAoSlXW9jdFZ5lP/0PwcfpA= github.com/ipfs/go-datastore v0.5.0/go.mod h1:9zhEApYMTl17C8YDp7JmU7sQZi2/wqiYh73hakZ90Bk= github.com/ipfs/go-datastore v0.6.1-0.20230901172804-1caa2449ed7c h1:iSyhKydtSJiEkmf5O3KizuySDB0zgyWPth76NACTMVI= github.com/ipfs/go-datastore v0.6.1-0.20230901172804-1caa2449ed7c/go.mod h1:3Et7HSjOA8tPu9OjYuDZxLAgBLfvlNMD4r8BIuri9eo= @@ -133,13 +140,22 @@ github.com/ipfs/go-detect-race v0.0.1 h1:qX/xay2W3E4Q1U7d9lNs1sU9nvguX0a7319XbyQ github.com/ipfs/go-detect-race v0.0.1/go.mod h1:8BNT7shDZPo99Q74BpGMK+4D8Mn4j46UU0LZ723meps= github.com/ipfs/go-ds-leveldb v0.5.0 h1:s++MEBbD3ZKc9/8/njrn4flZLnCuY9I79v94gBUNumo= github.com/ipfs/go-ds-leveldb v0.5.0/go.mod h1:d3XG9RUDzQ6V4SHi8+Xgj9j1XuEk1z82lquxrVbml/Q= +github.com/ipfs/go-ipfs-blocksutil v0.0.1 h1:Eh/H4pc1hsvhzsQoMEP3Bke/aW5P5rVM1IWFJMcGIPQ= github.com/ipfs/go-ipfs-delay v0.0.0-20181109222059-70721b86a9a8/go.mod h1:8SP1YXK1M1kXuc4KJZINY3TQQ03J2rwBG9QfXmbRPrw= +github.com/ipfs/go-ipfs-delay v0.0.1 h1:r/UXYyRcddO6thwOnhiznIAiSvxMECGgtv35Xs1IeRQ= +github.com/ipfs/go-ipfs-pq v0.0.3 h1:YpoHVJB+jzK15mr/xsWC574tyDLkezVrDNeaalQBsTE= github.com/ipfs/go-ipfs-util v0.0.2 h1:59Sswnk1MFaiq+VcaknX7aYEyGyGDAA73ilhEK2POp8= +github.com/ipfs/go-ipfs-util v0.0.2/go.mod h1:CbPtkWJzjLdEcezDns2XYaehFVNXG9zrdrtMecczcsQ= +github.com/ipfs/go-ipld-format v0.5.0 h1:WyEle9K96MSrvr47zZHKKcDxJ/vlpET6PSiQsAFO+Ds= +github.com/ipfs/go-ipld-format v0.5.0/go.mod h1:ImdZqJQaEouMjCvqCe0ORUS+uoBmf7Hf+EO/jh+nk3M= github.com/ipfs/go-log v1.0.5 h1:2dOuUCB1Z7uoczMWgAyDck5JLb72zHzrMnGnCNNbvY8= github.com/ipfs/go-log v1.0.5/go.mod h1:j0b8ZoR+7+R99LD9jZ6+AJsrzkPbSXbZfGakb5JPtIo= 
github.com/ipfs/go-log/v2 v2.1.3/go.mod h1:/8d0SH3Su5Ooc31QlL1WysJhvyOTDCjcCZ9Axpmri6g= github.com/ipfs/go-log/v2 v2.5.1 h1:1XdUzF7048prq4aBjDQQ4SL5RxftpRGdXhNRwKSAlcY= github.com/ipfs/go-log/v2 v2.5.1/go.mod h1:prSpmC1Gpllc9UYWxDiZDreBYw7zp4Iqp1kOLU9U5UI= +github.com/ipfs/go-metrics-interface v0.0.1 h1:j+cpbjYvu4R8zbleSs36gvB7jR+wsL2fGD6n0jO4kdg= +github.com/ipfs/go-metrics-interface v0.0.1/go.mod h1:6s6euYU4zowdslK0GKHmqaIZ3j/b/tL7HTWtJ4VPgWY= +github.com/ipfs/go-peertaskqueue v0.8.1 h1:YhxAs1+wxb5jk7RvS0LHdyiILpNmRIRnZVztekOF0pg= github.com/ipld/go-ipld-prime v0.21.0 h1:n4JmcpOlPDIxBcY037SVfpd1G+Sj1nKZah0m6QH9C2E= github.com/ipld/go-ipld-prime v0.21.0/go.mod h1:3RLqy//ERg/y5oShXXdx5YIp50cFGOanyMctpPjsvxQ= github.com/jackpal/go-nat-pmp v1.0.2 h1:KzKSgb7qkJvOUTqYl9/Hg/me3pWgBmERKrTGD7BdWus= @@ -223,6 +239,7 @@ github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/mr-tron/base58 v1.1.2/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= +github.com/mr-tron/base58 v1.1.3/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o= github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= github.com/multiformats/go-base32 v0.1.0 h1:pVx9xoSPqEIQG8o+UbAe7DNi51oej1NtK+aGkbLYxPE= @@ -242,11 +259,13 @@ github.com/multiformats/go-multibase v0.2.0/go.mod h1:bFBZX4lKCA/2lyOFSAoKH5SS6o github.com/multiformats/go-multicodec v0.9.0 h1:pb/dlPnzee/Sxv/j4PmkDRxCOi3hXTz3IbPKOXWJkmg= github.com/multiformats/go-multicodec v0.9.0/go.mod h1:L3QTQvMIaVBkXOXXtVmYE+LI16i14xuaojr/H7Ai54k= github.com/multiformats/go-multihash v0.0.8/go.mod h1:YSLudS+Pi8NHE7o6tb3D8vrpKa63epEDmG8nTduyAew= +github.com/multiformats/go-multihash v0.0.13/go.mod h1:VdAWLKTwram9oKAatUcLxBNUjdtcVwxObEQBtRfuyjc= github.com/multiformats/go-multihash v0.2.3 h1:7Lyc8XfX/IY2jWb/gI7JP+o7JEq9hOa7BFvVU9RSh+U= github.com/multiformats/go-multihash v0.2.3/go.mod h1:dXgKXCXjBzdscBLk9JkjINiEsCKRVch90MdaGiKsvSM= github.com/multiformats/go-multistream v0.4.1 h1:rFy0Iiyn3YT0asivDUIR05leAdwZq3de4741sbiSdfo= github.com/multiformats/go-multistream v0.4.1/go.mod h1:Mz5eykRVAjJWckE2U78c6xqdtyNUEhKSM0Lwar2p77Q= github.com/multiformats/go-varint v0.0.1/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= +github.com/multiformats/go-varint v0.0.5/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= github.com/multiformats/go-varint v0.0.7 h1:sWSGR+f/eu5ABZA2ZpYKBILXTTs9JWpdEM/nEGOHFS8= github.com/multiformats/go-varint v0.0.7/go.mod h1:r8PUYw/fD/SjBCiKOoDlGF6QawOELpZAu9eioSos/OU= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJEU3ofeGjhHklVoIGuVj85JJwZ6kWPaJwCIxgnFmo= @@ -351,6 +370,7 @@ github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtX github.com/urfave/cli v1.22.10/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/viant/assertly v0.4.8/go.mod h1:aGifi++jvCrUaklKEKT0BU95igDNaqkvz+49uaYMPRU= github.com/viant/toolbox v0.24.0/go.mod h1:OxMCG57V0PXuIP2HNQrtJf2CjqdmbrOx5EkMILuUhzM= +github.com/warpfork/go-testmark v0.12.1 h1:rMgCpJfwy1sJ50x0M0NgyphxYYPMOODIJHhsXyEHU0s= github.com/warpfork/go-wish v0.0.0-20220906213052-39a1cc7a02d0 h1:GDDkbFiaK8jsSDJfjId/PEGEShv6ugrt4kYsC5UIDaQ= github.com/warpfork/go-wish 
v0.0.0-20220906213052-39a1cc7a02d0/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= diff --git a/internal/coord/behaviour.go b/internal/coord/behaviour.go index 28819aa..0fa8228 100644 --- a/internal/coord/behaviour.go +++ b/internal/coord/behaviour.go @@ -150,7 +150,7 @@ func (w *Waiter[E]) Chan() <-chan WaiterEvent[E] { } // A QueryMonitor receives event notifications on the progress of a query -type QueryMonitor[E TerminalQueryEvent] interface { +type QueryMonitor[E TerminalBehaviourEvent] interface { // NotifyProgressed returns a channel that can be used to send notification that a // query has made progress. If the notification cannot be sent then it will be // queued and retried at a later time. If the query completes before the progress @@ -168,7 +168,7 @@ type QueryMonitor[E TerminalQueryEvent] interface { // QueryMonitorHook wraps a [QueryMonitor] interface and provides hooks // that are invoked before calls to the QueryMonitor methods are forwarded. -type QueryMonitorHook[E TerminalQueryEvent] struct { +type QueryMonitorHook[E TerminalBehaviourEvent] struct { qm QueryMonitor[E] BeforeProgressed func() BeforeFinished func() @@ -176,7 +176,7 @@ type QueryMonitorHook[E TerminalQueryEvent] struct { var _ QueryMonitor[*EventQueryFinished] = (*QueryMonitorHook[*EventQueryFinished])(nil) -func NewQueryMonitorHook[E TerminalQueryEvent](qm QueryMonitor[E]) *QueryMonitorHook[E] { +func NewQueryMonitorHook[E TerminalBehaviourEvent](qm QueryMonitor[E]) *QueryMonitorHook[E] { return &QueryMonitorHook[E]{ qm: qm, BeforeProgressed: func() {}, diff --git a/internal/coord/brdcst/followup.go b/internal/coord/brdcst/followup.go index e8a3aa6..6790de0 100644 --- a/internal/coord/brdcst/followup.go +++ b/internal/coord/brdcst/followup.go @@ -49,14 +49,17 @@ type FollowUp[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { // the logic much easier to implement. pool *query.Pool[K, N, M] - // TODO: ... + // started indicates that this state machine has sent out the first + // message to a node. Even after this state machine has returned a finished + // state, this flag will stay true. started bool // the message generator that takes a target key and will return the message // that we will send to the closest nodes in the follow-up phase msgFunc func(K) M - // TODO: + // seed holds the nodes from where we should start our query to find closer + // nodes to the target key (held by [ConfigFollowUp]). seed []N // the closest nodes to the target key. 
This will be filled after the query diff --git a/internal/coord/brdcst/pool.go b/internal/coord/brdcst/pool.go index ae3497c..38785fe 100644 --- a/internal/coord/brdcst/pool.go +++ b/internal/coord/brdcst/pool.go @@ -258,8 +258,8 @@ type StatePoolWaiting struct{} type StatePoolStoreRecord[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the broadcast operation that wants to send the message NodeID N // the node to send the message to - Target K - Message M // the message that should be sent to the remote node + Target K // the key we want to store a record for + Message M // the message that should be sent to the remote node } // StatePoolBroadcastFinished indicates that the broadcast operation with the @@ -347,9 +347,9 @@ type EventPoolGetCloserNodesFailure[K kad.Key[K], N kad.NodeID[K]] struct { type EventPoolStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the query that sent the message NodeID N // the node the message was sent to - Target K - Request M // the message that was sent to the remote node - Response M // the reply we got from the remote node (nil in many cases of the Amino DHT) + Target K // the key we successfully stored a record for + Request M // the message that was sent to the remote node + Response M // the reply we got from the remote node (nil in many cases of the Amino DHT) } // EventPoolStoreRecordFailure notifies the broadcast [Pool] that storing a record @@ -358,9 +358,9 @@ type EventPoolStoreRecordSuccess[K kad.Key[K], N kad.NodeID[K], M coordt.Message type EventPoolStoreRecordFailure[K kad.Key[K], N kad.NodeID[K], M coordt.Message] struct { QueryID coordt.QueryID // the id of the query that sent the message NodeID N // the node the message was sent to - Target K - Request M // the message that was sent to the remote node - Error error // the error that caused the failure + Target K // the key we failed to store a record for + Request M // the message that was sent to the remote node + Error error // the error that caused the failure } // poolEvent() ensures that only events accepted by a broadcast [Pool] can be diff --git a/internal/coord/coordinator.go b/internal/coord/coordinator.go index d7d310a..091803b 100644 --- a/internal/coord/coordinator.go +++ b/internal/coord/coordinator.go @@ -309,8 +309,9 @@ func (c *Coordinator) QueryClosest(ctx context.Context, target kadt.Key, fn coor if cfg == nil { cfg = DefaultQueryConfig() + } else if err := cfg.Validate(); err != nil { + return nil, coordt.QueryStats{}, fmt.Errorf("validate query config: %w", err) } - // TODO: validate config ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -392,15 +393,15 @@ func (c *Coordinator) QueryMessage(ctx context.Context, msg *pb.Message, fn coor func (c *Coordinator) BroadcastRecord(ctx context.Context, msg *pb.Message, seed []kadt.PeerID) error { msgFunc := func(k kadt.Key) *pb.Message { return msg } - return c.broadcast(ctx, msgFunc, seed, brdcst.DefaultConfigFollowUp(msg.Target())) + return c.broadcast(ctx, msgFunc, seed, coordt.BrdcstFuncNoop, brdcst.DefaultConfigFollowUp(msg.Target())) } func (c *Coordinator) BroadcastStatic(ctx context.Context, msg *pb.Message, seed []kadt.PeerID) error { msgFunc := func(k kadt.Key) *pb.Message { return msg } - return c.broadcast(ctx, msgFunc, seed, brdcst.DefaultConfigOneToMany(msg.Target())) + return c.broadcast(ctx, msgFunc, seed, coordt.BrdcstFuncNoop, brdcst.DefaultConfigOneToMany(msg.Target())) } -func (c *Coordinator)
BroadcastMany(ctx context.Context, keys []kadt.Key, msgFn func(k kadt.Key) *pb.Message) error { +func (c *Coordinator) BroadcastMany(ctx context.Context, keys []kadt.Key, fn coordt.BrdcstFunc, msgFn func(k kadt.Key) *pb.Message) error { // verify that we have keys to push into the network if len(keys) == 0 { return fmt.Errorf("no keys to broadcast") @@ -410,10 +411,10 @@ func (c *Coordinator) BroadcastMany(ctx context.Context, keys []kadt.Key, msgFn seed := c.rt.NearestNodes(keys[0], math.MaxInt) // start broadcasting - return c.broadcast(ctx, msgFn, seed, brdcst.DefaultConfigManyToMany(keys)) + return c.broadcast(ctx, msgFn, seed, fn, brdcst.DefaultConfigManyToMany(keys)) } -func (c *Coordinator) broadcast(ctx context.Context, msgFunc func(k kadt.Key) *pb.Message, seed []kadt.PeerID, cfg brdcst.Config) error { +func (c *Coordinator) broadcast(ctx context.Context, msgFunc func(k kadt.Key) *pb.Message, seed []kadt.PeerID, fn coordt.BrdcstFunc, cfg brdcst.Config) error { ctx, span := c.tele.Tracer.Start(ctx, "Coordinator.broadcast") defer span.End() @@ -434,7 +435,7 @@ func (c *Coordinator) broadcast(ctx context.Context, msgFunc func(k kadt.Key) *p // queue the start of the query c.brdcstBehaviour.Notify(ctx, cmd) - contacted, _, err := c.waitForBroadcast(ctx, waiter) + contacted, _, err := c.waitForBroadcast(ctx, waiter, fn) if err != nil { return err } @@ -507,7 +508,7 @@ func (c *Coordinator) waitForQuery(ctx context.Context, queryID coordt.QueryID, } } -func (c *Coordinator) waitForBroadcast(ctx context.Context, waiter *BroadcastWaiter) ([]kadt.PeerID, map[string]struct { +func (c *Coordinator) waitForBroadcast(ctx context.Context, waiter *BroadcastWaiter, fn coordt.BrdcstFunc) ([]kadt.PeerID, map[string]struct { Node kadt.PeerID Err error }, error, @@ -516,6 +517,13 @@ func (c *Coordinator) waitForBroadcast(ctx context.Context, waiter *BroadcastWai select { case <-ctx.Done(): return nil, nil, ctx.Err() + + case wev, more := <-waiter.Progressed(): + if !more { + return nil, nil, ctx.Err() + } + fn(wev.Ctx, wev.Event.NodeID, wev.Event.Response) + case wev, more := <-waiter.Finished(): if !more { return nil, nil, ctx.Err() diff --git a/internal/coord/coordt/coretypes.go b/internal/coord/coordt/coretypes.go index 640c658..87072b4 100644 --- a/internal/coord/coordt/coretypes.go +++ b/internal/coord/coordt/coretypes.go @@ -40,7 +40,7 @@ var ( // Query stops entirely and returns that error. // // The stats argument contains statistics on the progress of the query so far. -type QueryFunc func(ctx context.Context, id kadt.PeerID, resp *pb.Message, stats QueryStats) error +type QueryFunc func(ctx context.Context, id kadt.PeerID, resp *pb.Message, stats QueryStats) error // TODO: move to query package? type QueryStats struct { Start time.Time // Start is the time the query began executing. @@ -51,6 +51,11 @@ type QueryStats struct { Exhausted bool // Exhausted is true if the query ended after visiting every node it could. } +// BrdcstFunc is the type of the function called when broadcasting to the network after we have received a response from a node. +type BrdcstFunc func(ctx context.Context, id kadt.PeerID, resp *pb.Message) + +func BrdcstFuncNoop(ctx context.Context, id kadt.PeerID, resp *pb.Message) {} + var ( // ErrSkipNode is used as a return value from a QueryFunc to indicate that the node is to be skipped. 
ErrSkipNode = errors.New("skip node") diff --git a/internal/coord/event.go b/internal/coord/event.go index cecfa35..d16ce4e 100644 --- a/internal/coord/event.go +++ b/internal/coord/event.go @@ -45,10 +45,10 @@ type RoutingNotification interface { routingNotification() } -// TerminalQueryEvent is a type of [BehaviourEvent] that indicates a query has completed. -type TerminalQueryEvent interface { +// TerminalBehaviourEvent is a type of [BehaviourEvent] that indicates a query or broadcast operation has completed. +type TerminalBehaviourEvent interface { BehaviourEvent - terminalQueryEvent() + terminalBehaviourEvent() } type EventStartBootstrap struct { @@ -192,8 +192,8 @@ type EventQueryFinished struct { ClosestNodes []kadt.PeerID } -func (*EventQueryFinished) behaviourEvent() {} -func (*EventQueryFinished) terminalQueryEvent() {} +func (*EventQueryFinished) behaviourEvent() {} +func (*EventQueryFinished) terminalBehaviourEvent() {} // EventRoutingUpdated is emitted by the coordinator when a new node has been verified and added to the routing table. type EventRoutingUpdated struct { @@ -258,7 +258,7 @@ type BrdcstCommand interface { brdcstCommand() } -// EventStartBroadcast starts a new +// EventStartBroadcast starts a new broadcast operation type EventStartBroadcast struct { QueryID coordt.QueryID MsgFunc func(k kadt.Key) *pb.Message @@ -269,6 +269,24 @@ type EventStartBroadcast struct { func (*EventStartBroadcast) behaviourEvent() {} +type EventStopBroadcast struct { + QueryID coordt.QueryID +} + +func (*EventStopBroadcast) behaviourEvent() {} +func (*EventStopBroadcast) queryCommand() {} + +// EventBroadcastProgressed is emitted by the coordinator when a broadcast +// operation has progressed. +type EventBroadcastProgressed struct { + QueryID coordt.QueryID + NodeID kadt.PeerID + Response *pb.Message + Stats query.QueryStats +} + +func (*EventBroadcastProgressed) behaviourEvent() {} + // EventBroadcastFinished is emitted by the coordinator when broadcasting // a record to the network has finished, either through running to completion or // by being canceled.
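EventBroadcastProgressed gives the coordinator a per-node notification while a broadcast is still running, and the coordinator changes in this patch thread a coordt.BrdcstFunc callback through waitForBroadcast to consume those notifications. The following is a rough, self-contained sketch of such a consumption loop with simplified stand-in event types; it is not the coordinator's implementation, and disabling the progressed channel after it closes is an extra safeguard added here.

package main

import (
	"context"
	"fmt"
)

// progressEvent and finishedEvent are simplified stand-ins for the
// coordinator's EventBroadcastProgressed and EventBroadcastFinished.
type progressEvent struct{ node string }

type finishedEvent struct{ contacted int }

// brdcstFunc mirrors the shape of coordt.BrdcstFunc: a callback invoked for
// every response received while the broadcast is still in flight.
type brdcstFunc func(ctx context.Context, node string)

// waitForBroadcast drains progress notifications, invoking fn for each one,
// until the broadcast reports completion or the context is cancelled.
func waitForBroadcast(ctx context.Context, progressed <-chan progressEvent, finished <-chan finishedEvent, fn brdcstFunc) (int, error) {
	for {
		select {
		case <-ctx.Done():
			return 0, ctx.Err()
		case ev, more := <-progressed:
			if !more {
				progressed = nil // channel closed: stop selecting on it
				continue
			}
			fn(ctx, ev.node)
		case ev := <-finished:
			return ev.contacted, nil
		}
	}
}

func main() {
	progressed := make(chan progressEvent)
	finished := make(chan finishedEvent)

	go func() {
		progressed <- progressEvent{node: "peer-1"}
		progressed <- progressEvent{node: "peer-2"}
		finished <- finishedEvent{contacted: 2}
	}()

	fn := func(ctx context.Context, node string) { fmt.Println("stored record with", node) }
	contacted, err := waitForBroadcast(context.Background(), progressed, finished, fn)
	fmt.Println(contacted, err) // 2 <nil>
}

The real waitForBroadcast also returns the contacted peers and per-node errors taken from the finished event; the sketch returns only a count to stay short.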
@@ -281,5 +299,5 @@ type EventBroadcastFinished struct { } } -func (*EventBroadcastFinished) behaviourEvent() {} -func (*EventBroadcastFinished) terminalQueryEvent() {} +func (*EventBroadcastFinished) behaviourEvent() {} +func (*EventBroadcastFinished) terminalBehaviourEvent() {} diff --git a/internal/coord/query.go b/internal/coord/query.go index 4b60429..960ba73 100644 --- a/internal/coord/query.go +++ b/internal/coord/query.go @@ -445,7 +445,7 @@ func (p *QueryBehaviour) queueNonConnectivityEvent(nid kadt.PeerID) { }) } -type queryNotifier[E TerminalQueryEvent] struct { +type queryNotifier[E TerminalBehaviourEvent] struct { monitor QueryMonitor[E] pending []CtxEvent[*EventQueryProgressed] stopping bool diff --git a/internal/coord/routing.go b/internal/coord/routing.go index a23489c..47e7721 100644 --- a/internal/coord/routing.go +++ b/internal/coord/routing.go @@ -558,7 +558,6 @@ func (r *RoutingBehaviour) perfomNextInbound() (BehaviourEvent, bool) { NodeID: ev.NodeID, } } else { - // TODO: apply ttl cmd = &routing.EventIncludeAddCandidate[kadt.Key, kadt.PeerID]{ NodeID: ev.NodeID, } From d8a45469aa4cefdbd14c565cdc2caf71ecb1ec23 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 18 Oct 2023 11:23:07 +0200 Subject: [PATCH 22/23] WIP --- internal/coord/coordinator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/coord/coordinator.go b/internal/coord/coordinator.go index 091803b..ae7afbe 100644 --- a/internal/coord/coordinator.go +++ b/internal/coord/coordinator.go @@ -520,7 +520,7 @@ func (c *Coordinator) waitForBroadcast(ctx context.Context, waiter *BroadcastWai case wev, more := <-waiter.Progressed(): if !more { - return nil, nil, ctx.Err() + continue } fn(wev.Ctx, wev.Event.NodeID, wev.Event.Response) From 7edf5d16046bdfb0e6bdbe7112864b1f07585ed7 Mon Sep 17 00:00:00 2001 From: Ian Davis <18375+iand@users.noreply.github.com> Date: Wed, 18 Oct 2023 15:42:07 +0100 Subject: [PATCH 23/23] Add logging to crawl and include state machines --- internal/coord/routing.go | 3 +++ internal/coord/routing/crawl.go | 9 ++++++++- internal/coord/routing/include.go | 7 +++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/internal/coord/routing.go b/internal/coord/routing.go index 47e7721..09e5ad4 100644 --- a/internal/coord/routing.go +++ b/internal/coord/routing.go @@ -350,6 +350,7 @@ func NewRoutingBehaviour(self kadt.PeerID, rt routing.RoutingTableCpl[kadt.Key, includeCfg.Clock = cfg.Clock includeCfg.Tracer = cfg.Tracer includeCfg.Meter = cfg.Meter + includeCfg.Logger = cfg.Logger.With("statemachine", "include") includeCfg.Timeout = cfg.ConnectivityCheckTimeout includeCfg.QueueCapacity = cfg.IncludeQueueCapacity includeCfg.Concurrency = cfg.IncludeRequestConcurrency @@ -391,6 +392,8 @@ func NewRoutingBehaviour(self kadt.PeerID, rt routing.RoutingTableCpl[kadt.Key, } crawlCfg := routing.DefaultCrawlConfig() + crawlCfg.Tracer = cfg.Tracer + crawlCfg.Logger = cfg.Logger.With("statemachine", "crawl") crawl, err := routing.NewCrawl(self, cplutil.GenRandPeerID, crawlCfg) if err != nil { diff --git a/internal/coord/routing/crawl.go b/internal/coord/routing/crawl.go index e7074d5..640eebf 100644 --- a/internal/coord/routing/crawl.go +++ b/internal/coord/routing/crawl.go @@ -9,6 +9,7 @@ import ( "github.com/plprobelab/go-libdht/kad/key" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" + "golang.org/x/exp/slog" "github.com/plprobelab/zikade/errs" "github.com/plprobelab/zikade/internal/coord/coordt" @@ -24,6 +25,7 @@ type CrawlConfig 
struct { Interval time.Duration // the interval in which the network should be crawled (0 means no crawling) Concurrency int // the maximum number of concurrent peers that we may query Tracer trace.Tracer // Tracer is the tracer that should be used to trace execution. + Logger *slog.Logger // Logger is a structured logger that will be used when logging. } // Validate checks the configuration options and returns an error if any have invalid values. @@ -66,6 +68,7 @@ func DefaultCrawlConfig() *CrawlConfig { MaxCPL: 16, Concurrency: 200, Tracer: tele.NoopTracer(), + Logger: tele.DefaultLogger("routing"), } } @@ -146,7 +149,7 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat node: node, target: target.Key(), } - + c.cfg.Logger.Debug("creating crawl job", "node", node.String(), "cpl", j, "key", key.HexString(job.target)) ci.cpls[job.mapKey()] = j ci.todo = append(ci.todo, job) } @@ -165,6 +168,7 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat node: tev.NodeID, target: tev.Target, } + c.cfg.Logger.Debug("received response for crawl job", "node", job.node.String(), "key", key.HexString(job.target)) mapKey := job.mapKey() if _, found := c.info.waiting[mapKey]; !found { @@ -192,6 +196,7 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat continue } + c.cfg.Logger.Debug("creating crawl job", "node", node.String(), "cpl", i, "key", key.HexString(newJob.target)) c.info.cpls[newMapKey] = i c.info.todo = append(c.info.todo, newJob) } @@ -206,6 +211,7 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat node: tev.NodeID, target: tev.Target, } + c.cfg.Logger.Debug("received failure for crawl job", "node", job.node.String(), "key", key.HexString(job.target), "error", tev.Error) mapKey := job.mapKey() if _, found := c.info.waiting[mapKey]; !found { @@ -236,6 +242,7 @@ func (c *Crawl[K, N]) Advance(ctx context.Context, ev CrawlEvent) (out CrawlStat // pop next crawl job from queue var job crawlJob[K, N] job, c.info.todo = c.info.todo[0], c.info.todo[1:] + c.cfg.Logger.Debug("starting crawl job", "node", job.node.String(), "key", key.HexString(job.target)) // mark the job as waiting mapKey := job.mapKey() diff --git a/internal/coord/routing/include.go b/internal/coord/routing/include.go index 3da3c55..e56f48f 100644 --- a/internal/coord/routing/include.go +++ b/internal/coord/routing/include.go @@ -11,6 +11,7 @@ import ( "github.com/plprobelab/go-libdht/kad/key" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" + "golang.org/x/exp/slog" "github.com/plprobelab/zikade/errs" "github.com/plprobelab/zikade/tele" @@ -65,6 +66,9 @@ type IncludeConfig struct { // Tracer is the tracer that should be used to trace execution. Tracer trace.Tracer + // Logger is a structured logger that will be used when logging. + Logger *slog.Logger + // Meter is the meter that should be used to record metrics. 
Meter metric.Meter } @@ -123,6 +127,7 @@ func DefaultIncludeConfig() *IncludeConfig { Clock: clock.New(), // use standard time Tracer: tele.NoopTracer(), Meter: tele.NoopMeter(), + Logger: tele.DefaultLogger("routing"), Concurrency: 3, Timeout: time.Minute, @@ -225,6 +230,7 @@ func (in *Include[K, N]) Advance(ctx context.Context, ev IncludeEvent) (out Incl case *EventIncludeNode[K, N]: delete(in.checks, key.HexString(tev.NodeID.Key())) if in.rt.AddNode(tev.NodeID) { + in.cfg.Logger.Debug("node directly added to routing table", "node", tev.NodeID) return &StateIncludeRoutingUpdated[K, N]{ NodeID: tev.NodeID, } @@ -240,6 +246,7 @@ func (in *Include[K, N]) Advance(ctx context.Context, ev IncludeEvent) (out Incl if ok { delete(in.checks, key.HexString(tev.NodeID.Key())) if in.rt.AddNode(tev.NodeID) { + in.cfg.Logger.Debug("node added to routing table after successful check", "node", tev.NodeID) return &StateIncludeRoutingUpdated[K, N]{ NodeID: ch.NodeID, }
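The logging added to the crawl and include state machines follows one pattern: derive a child logger that carries a fixed "statemachine" attribute, then emit debug records with key/value pairs per crawl job or routing-table update. A small sketch of that pattern using the standard library's log/slog; the patch itself wires golang.org/x/exp/slog through the config structs, which exposes essentially the same API, and the node and cpl values below are made up.

package main

import (
	"log/slog"
	"os"
)

func main() {
	// A debug-level text logger; the patch injects loggers through the config
	// structs instead of constructing them at the call site.
	base := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level: slog.LevelDebug,
	}))

	// Child logger tagged with the state machine name, mirroring
	// cfg.Logger.With("statemachine", "crawl").
	crawlLog := base.With("statemachine", "crawl")

	// Per-job key/value pairs; the node and cpl values here are illustrative.
	crawlLog.Debug("starting crawl job", "node", "peer-1", "cpl", 4)
	crawlLog.Debug("received response for crawl job", "node", "peer-1", "closer", 16)
}

Tagging the logger once means each call site only has to supply the job-specific fields.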