diff --git a/HACKING.md b/HACKING.md
index c0f9a8c7e..74dd9e60c 100644
--- a/HACKING.md
+++ b/HACKING.md
@@ -14,6 +14,7 @@ Run with, eg, `go run ./cmd/bigsky`):
 - `cmd/supercollider`: event stream load generation tool
 - `cmd/sonar`: event stream monitoring tool
 - `cmd/hepa`: auto-moderation rule engine service
+- `cmd/rainbow`: firehose fanout service
 - `gen`: dev tool to run CBOR type codegen
 
 Packages:
diff --git a/cmd/goat/actorgetPreferences.go b/api/agnostic/actorgetPreferences.go
similarity index 90%
rename from cmd/goat/actorgetPreferences.go
rename to api/agnostic/actorgetPreferences.go
index bd6e8a18c..9ee242c09 100644
--- a/cmd/goat/actorgetPreferences.go
+++ b/api/agnostic/actorgetPreferences.go
@@ -1,6 +1,6 @@
-// Copied from indigo:api/atproto/actorgetPreferences.go
+// Copied from indigo:api/bsky/actorgetPreferences.go
 
-package main
+package agnostic
 
 // schema: app.bsky.actor.getPreferences
diff --git a/cmd/goat/actorputPreferences.go b/api/agnostic/actorputPreferences.go
similarity index 90%
rename from cmd/goat/actorputPreferences.go
rename to api/agnostic/actorputPreferences.go
index 24042236d..3648e5bd2 100644
--- a/cmd/goat/actorputPreferences.go
+++ b/api/agnostic/actorputPreferences.go
@@ -1,6 +1,6 @@
-// Copied from indigo:api/atproto/actorputPreferences.go
+// Copied from indigo:api/bsky/actorputPreferences.go
 
-package main
+package agnostic
 
 // schema: app.bsky.actor.putPreferences
diff --git a/api/agnostic/doc.go b/api/agnostic/doc.go
new file mode 100644
index 000000000..1b9cf632c
--- /dev/null
+++ b/api/agnostic/doc.go
@@ -0,0 +1,4 @@
+// Package indigo/api/agnostic provides schema-agnostic helpers for fetching records from the network.
+//
+// These are variants of endpoints in indigo/api/atproto.
+package agnostic
diff --git a/cmd/goat/identitygetRecommendedDidCredentials.go b/api/agnostic/identitygetRecommendedDidCredentials.go
similarity index 97%
rename from cmd/goat/identitygetRecommendedDidCredentials.go
rename to api/agnostic/identitygetRecommendedDidCredentials.go
index 75abcc522..ea0a70a9d 100644
--- a/cmd/goat/identitygetRecommendedDidCredentials.go
+++ b/api/agnostic/identitygetRecommendedDidCredentials.go
@@ -1,6 +1,6 @@
 // Copied from indigo:api/atproto/identitygetRecommendedDidCredentials.go
 
-package main
+package agnostic
 
 // schema: com.atproto.identity.getRecommendedDidCredentials
diff --git a/cmd/goat/identitysignPlcOperation.go b/api/agnostic/identitysignPlcOperation.go
similarity index 98%
rename from cmd/goat/identitysignPlcOperation.go
rename to api/agnostic/identitysignPlcOperation.go
index f9c96bea2..59d83a23c 100644
--- a/cmd/goat/identitysignPlcOperation.go
+++ b/api/agnostic/identitysignPlcOperation.go
@@ -1,6 +1,6 @@
 // Copied from indigo:api/atproto/identitysignPlcOperation.go
 
-package main
+package agnostic
 
 // schema: com.atproto.identity.signPlcOperation
diff --git a/cmd/goat/identitysubmitPlcOperation.go b/api/agnostic/identitysubmitPlcOperation.go
similarity index 97%
rename from cmd/goat/identitysubmitPlcOperation.go
rename to api/agnostic/identitysubmitPlcOperation.go
index 9f15e8adc..91f9cf0cf 100644
--- a/cmd/goat/identitysubmitPlcOperation.go
+++ b/api/agnostic/identitysubmitPlcOperation.go
@@ -1,6 +1,6 @@
 // Copied from indigo:api/atproto/identitysubmitPlcOperation.go
 
-package main
+package agnostic
 
 // schema: com.atproto.identity.submitPlcOperation
diff --git a/api/agnostic/repoapplyWrites.go b/api/agnostic/repoapplyWrites.go
new file mode 100644
index 000000000..de0799ac8
--- /dev/null
+++ b/api/agnostic/repoapplyWrites.go
@@ -0,0 +1,189 @@ +// Copied from indigo:api/atproto/repoapplyWrites.go + +package agnostic + +// schema: com.atproto.repo.applyWrites + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/bluesky-social/indigo/lex/util" + "github.com/bluesky-social/indigo/xrpc" +) + +// RepoApplyWrites_Create is a "create" in the com.atproto.repo.applyWrites schema. +// +// Operation which creates a new record. +// +// RECORDTYPE: RepoApplyWrites_Create +type RepoApplyWrites_Create struct { + LexiconTypeID string `json:"$type,const=com.atproto.repo.applyWrites#create" cborgen:"$type,const=com.atproto.repo.applyWrites#create"` + Collection string `json:"collection" cborgen:"collection"` + Rkey *string `json:"rkey,omitempty" cborgen:"rkey,omitempty"` + Value *json.RawMessage `json:"value" cborgen:"value"` +} + +// RepoApplyWrites_CreateResult is a "createResult" in the com.atproto.repo.applyWrites schema. +// +// RECORDTYPE: RepoApplyWrites_CreateResult +type RepoApplyWrites_CreateResult struct { + LexiconTypeID string `json:"$type,const=com.atproto.repo.applyWrites#createResult" cborgen:"$type,const=com.atproto.repo.applyWrites#createResult"` + Cid string `json:"cid" cborgen:"cid"` + Uri string `json:"uri" cborgen:"uri"` + ValidationStatus *string `json:"validationStatus,omitempty" cborgen:"validationStatus,omitempty"` +} + +// RepoApplyWrites_Delete is a "delete" in the com.atproto.repo.applyWrites schema. +// +// Operation which deletes an existing record. +// +// RECORDTYPE: RepoApplyWrites_Delete +type RepoApplyWrites_Delete struct { + LexiconTypeID string `json:"$type,const=com.atproto.repo.applyWrites#delete" cborgen:"$type,const=com.atproto.repo.applyWrites#delete"` + Collection string `json:"collection" cborgen:"collection"` + Rkey string `json:"rkey" cborgen:"rkey"` +} + +// RepoApplyWrites_DeleteResult is a "deleteResult" in the com.atproto.repo.applyWrites schema. +// +// RECORDTYPE: RepoApplyWrites_DeleteResult +type RepoApplyWrites_DeleteResult struct { + LexiconTypeID string `json:"$type,const=com.atproto.repo.applyWrites#deleteResult" cborgen:"$type,const=com.atproto.repo.applyWrites#deleteResult"` +} + +// RepoApplyWrites_Input is the input argument to a com.atproto.repo.applyWrites call. +type RepoApplyWrites_Input struct { + // repo: The handle or DID of the repo (aka, current account). + Repo string `json:"repo" cborgen:"repo"` + // swapCommit: If provided, the entire operation will fail if the current repo commit CID does not match this value. Used to prevent conflicting repo mutations. + SwapCommit *string `json:"swapCommit,omitempty" cborgen:"swapCommit,omitempty"` + // validate: Can be set to 'false' to skip Lexicon schema validation of record data across all operations, 'true' to require it, or leave unset to validate only for known Lexicons. 
+ Validate *bool `json:"validate,omitempty" cborgen:"validate,omitempty"` + Writes []*RepoApplyWrites_Input_Writes_Elem `json:"writes" cborgen:"writes"` +} + +type RepoApplyWrites_Input_Writes_Elem struct { + RepoApplyWrites_Create *RepoApplyWrites_Create + RepoApplyWrites_Update *RepoApplyWrites_Update + RepoApplyWrites_Delete *RepoApplyWrites_Delete +} + +func (t *RepoApplyWrites_Input_Writes_Elem) MarshalJSON() ([]byte, error) { + if t.RepoApplyWrites_Create != nil { + t.RepoApplyWrites_Create.LexiconTypeID = "com.atproto.repo.applyWrites#create" + return json.Marshal(t.RepoApplyWrites_Create) + } + if t.RepoApplyWrites_Update != nil { + t.RepoApplyWrites_Update.LexiconTypeID = "com.atproto.repo.applyWrites#update" + return json.Marshal(t.RepoApplyWrites_Update) + } + if t.RepoApplyWrites_Delete != nil { + t.RepoApplyWrites_Delete.LexiconTypeID = "com.atproto.repo.applyWrites#delete" + return json.Marshal(t.RepoApplyWrites_Delete) + } + return nil, fmt.Errorf("cannot marshal empty enum") +} +func (t *RepoApplyWrites_Input_Writes_Elem) UnmarshalJSON(b []byte) error { + typ, err := util.TypeExtract(b) + if err != nil { + return err + } + + switch typ { + case "com.atproto.repo.applyWrites#create": + t.RepoApplyWrites_Create = new(RepoApplyWrites_Create) + return json.Unmarshal(b, t.RepoApplyWrites_Create) + case "com.atproto.repo.applyWrites#update": + t.RepoApplyWrites_Update = new(RepoApplyWrites_Update) + return json.Unmarshal(b, t.RepoApplyWrites_Update) + case "com.atproto.repo.applyWrites#delete": + t.RepoApplyWrites_Delete = new(RepoApplyWrites_Delete) + return json.Unmarshal(b, t.RepoApplyWrites_Delete) + + default: + return fmt.Errorf("closed enums must have a matching value") + } +} + +// RepoApplyWrites_Output is the output of a com.atproto.repo.applyWrites call. 
+type RepoApplyWrites_Output struct { + Commit *RepoDefs_CommitMeta `json:"commit,omitempty" cborgen:"commit,omitempty"` + Results []*RepoApplyWrites_Output_Results_Elem `json:"results,omitempty" cborgen:"results,omitempty"` +} + +type RepoApplyWrites_Output_Results_Elem struct { + RepoApplyWrites_CreateResult *RepoApplyWrites_CreateResult + RepoApplyWrites_UpdateResult *RepoApplyWrites_UpdateResult + RepoApplyWrites_DeleteResult *RepoApplyWrites_DeleteResult +} + +func (t *RepoApplyWrites_Output_Results_Elem) MarshalJSON() ([]byte, error) { + if t.RepoApplyWrites_CreateResult != nil { + t.RepoApplyWrites_CreateResult.LexiconTypeID = "com.atproto.repo.applyWrites#createResult" + return json.Marshal(t.RepoApplyWrites_CreateResult) + } + if t.RepoApplyWrites_UpdateResult != nil { + t.RepoApplyWrites_UpdateResult.LexiconTypeID = "com.atproto.repo.applyWrites#updateResult" + return json.Marshal(t.RepoApplyWrites_UpdateResult) + } + if t.RepoApplyWrites_DeleteResult != nil { + t.RepoApplyWrites_DeleteResult.LexiconTypeID = "com.atproto.repo.applyWrites#deleteResult" + return json.Marshal(t.RepoApplyWrites_DeleteResult) + } + return nil, fmt.Errorf("cannot marshal empty enum") +} +func (t *RepoApplyWrites_Output_Results_Elem) UnmarshalJSON(b []byte) error { + typ, err := util.TypeExtract(b) + if err != nil { + return err + } + + switch typ { + case "com.atproto.repo.applyWrites#createResult": + t.RepoApplyWrites_CreateResult = new(RepoApplyWrites_CreateResult) + return json.Unmarshal(b, t.RepoApplyWrites_CreateResult) + case "com.atproto.repo.applyWrites#updateResult": + t.RepoApplyWrites_UpdateResult = new(RepoApplyWrites_UpdateResult) + return json.Unmarshal(b, t.RepoApplyWrites_UpdateResult) + case "com.atproto.repo.applyWrites#deleteResult": + t.RepoApplyWrites_DeleteResult = new(RepoApplyWrites_DeleteResult) + return json.Unmarshal(b, t.RepoApplyWrites_DeleteResult) + + default: + return fmt.Errorf("closed enums must have a matching value") + } +} + +// RepoApplyWrites_Update is a "update" in the com.atproto.repo.applyWrites schema. +// +// Operation which updates an existing record. +// +// RECORDTYPE: RepoApplyWrites_Update +type RepoApplyWrites_Update struct { + LexiconTypeID string `json:"$type,const=com.atproto.repo.applyWrites#update" cborgen:"$type,const=com.atproto.repo.applyWrites#update"` + Collection string `json:"collection" cborgen:"collection"` + Rkey string `json:"rkey" cborgen:"rkey"` + Value *json.RawMessage `json:"value" cborgen:"value"` +} + +// RepoApplyWrites_UpdateResult is a "updateResult" in the com.atproto.repo.applyWrites schema. +// +// RECORDTYPE: RepoApplyWrites_UpdateResult +type RepoApplyWrites_UpdateResult struct { + LexiconTypeID string `json:"$type,const=com.atproto.repo.applyWrites#updateResult" cborgen:"$type,const=com.atproto.repo.applyWrites#updateResult"` + Cid string `json:"cid" cborgen:"cid"` + Uri string `json:"uri" cborgen:"uri"` + ValidationStatus *string `json:"validationStatus,omitempty" cborgen:"validationStatus,omitempty"` +} + +// RepoApplyWrites calls the XRPC method "com.atproto.repo.applyWrites". 
+func RepoApplyWrites(ctx context.Context, c *xrpc.Client, input *RepoApplyWrites_Input) (*RepoApplyWrites_Output, error) { + var out RepoApplyWrites_Output + if err := c.Do(ctx, xrpc.Procedure, "application/json", "com.atproto.repo.applyWrites", nil, input, &out); err != nil { + return nil, err + } + + return &out, nil +} diff --git a/cmd/goat/repocreateRecord.go b/api/agnostic/repocreateRecord.go similarity index 99% rename from cmd/goat/repocreateRecord.go rename to api/agnostic/repocreateRecord.go index c1fa67a39..9dcddadb3 100644 --- a/cmd/goat/repocreateRecord.go +++ b/api/agnostic/repocreateRecord.go @@ -1,6 +1,6 @@ // Copied from indigo:api/atproto/repocreateRecords.go -package main +package agnostic // schema: com.atproto.repo.createRecord diff --git a/cmd/astrolabe/repogetRecord.go b/api/agnostic/repogetRecord.go similarity index 98% rename from cmd/astrolabe/repogetRecord.go rename to api/agnostic/repogetRecord.go index e3b857cd5..6025c9593 100644 --- a/cmd/astrolabe/repogetRecord.go +++ b/api/agnostic/repogetRecord.go @@ -1,6 +1,6 @@ // Copied from indigo:api/atproto/repolistRecords.go -package main +package agnostic // schema: com.atproto.repo.getRecord diff --git a/cmd/astrolabe/repolistRecords.go b/api/agnostic/repolistRecords.go similarity index 99% rename from cmd/astrolabe/repolistRecords.go rename to api/agnostic/repolistRecords.go index 3cfdd9b24..5d32e8650 100644 --- a/cmd/astrolabe/repolistRecords.go +++ b/api/agnostic/repolistRecords.go @@ -1,6 +1,6 @@ // Copied from indigo:api/atproto/repolistRecords.go -package main +package agnostic // schema: com.atproto.repo.listRecords diff --git a/cmd/goat/repoputRecord.go b/api/agnostic/repoputRecord.go similarity index 99% rename from cmd/goat/repoputRecord.go rename to api/agnostic/repoputRecord.go index 34011797b..15e1452b4 100644 --- a/cmd/goat/repoputRecord.go +++ b/api/agnostic/repoputRecord.go @@ -1,6 +1,6 @@ // Copied from indigo:api/atproto/repoputRecords.go -package main +package agnostic // schema: com.atproto.repo.putRecord diff --git a/atproto/data/data.go b/atproto/data/data.go index 1a4a8c5b9..560566c2d 100644 --- a/atproto/data/data.go +++ b/atproto/data/data.go @@ -66,17 +66,11 @@ func extractBlobsAtom(atom any) []Blob { out = append(out, v) case []any: for _, el := range v { - down := extractBlobsAtom(el) - for _, d := range down { - out = append(out, d) - } + out = append(out, extractBlobsAtom(el)...) } case map[string]any: for _, val := range v { - down := extractBlobsAtom(val) - for _, d := range down { - out = append(out, d) - } + out = append(out, extractBlobsAtom(val)...) 
} default: } diff --git a/atproto/identity/did.go b/atproto/identity/did.go index ab2f3439a..ca13b323c 100644 --- a/atproto/identity/did.go +++ b/atproto/identity/did.go @@ -13,26 +13,6 @@ import ( "github.com/bluesky-social/indigo/atproto/syntax" ) -type DIDDocument struct { - DID syntax.DID `json:"id"` - AlsoKnownAs []string `json:"alsoKnownAs,omitempty"` - VerificationMethod []DocVerificationMethod `json:"verificationMethod,omitempty"` - Service []DocService `json:"service,omitempty"` -} - -type DocVerificationMethod struct { - ID string `json:"id"` - Type string `json:"type"` - Controller string `json:"controller"` - PublicKeyMultibase string `json:"publicKeyMultibase"` -} - -type DocService struct { - ID string `json:"id"` - Type string `json:"type"` - ServiceEndpoint string `json:"serviceEndpoint"` -} - // WARNING: this does *not* bi-directionally verify account metadata; it only implements direct DID-to-DID-document lookup for the supported DID methods, and parses the resulting DID Doc into an Identity struct func (d *BaseDirectory) ResolveDID(ctx context.Context, did syntax.DID) (*DIDDocument, error) { start := time.Now() diff --git a/atproto/identity/diddoc.go b/atproto/identity/diddoc.go new file mode 100644 index 000000000..4c7070c20 --- /dev/null +++ b/atproto/identity/diddoc.go @@ -0,0 +1,25 @@ +package identity + +import ( + "github.com/bluesky-social/indigo/atproto/syntax" +) + +type DIDDocument struct { + DID syntax.DID `json:"id"` + AlsoKnownAs []string `json:"alsoKnownAs,omitempty"` + VerificationMethod []DocVerificationMethod `json:"verificationMethod,omitempty"` + Service []DocService `json:"service,omitempty"` +} + +type DocVerificationMethod struct { + ID string `json:"id"` + Type string `json:"type"` + Controller string `json:"controller"` + PublicKeyMultibase string `json:"publicKeyMultibase"` +} + +type DocService struct { + ID string `json:"id"` + Type string `json:"type"` + ServiceEndpoint string `json:"serviceEndpoint"` +} diff --git a/atproto/identity/did_test.go b/atproto/identity/diddoc_text.go similarity index 100% rename from atproto/identity/did_test.go rename to atproto/identity/diddoc_text.go diff --git a/atproto/identity/directory.go b/atproto/identity/directory.go new file mode 100644 index 000000000..b0d2b2f10 --- /dev/null +++ b/atproto/identity/directory.go @@ -0,0 +1,81 @@ +package identity + +import ( + "context" + "errors" + "net" + "net/http" + "time" + + "github.com/bluesky-social/indigo/atproto/syntax" +) + +// API for doing account lookups by DID or handle, with bi-directional verification handled automatically. Almost all atproto services and clients should use an implementation of this interface instead of resolving handles or DIDs separately +// +// Handles which fail to resolve, or don't match DID alsoKnownAs, are an error. DIDs which resolve but the handle does not resolve back to the DID return an Identity where the Handle is the special `handle.invalid` value. +// +// Some example implementations of this interface could be: +// - basic direct resolution on every call +// - local in-memory caching layer to reduce network hits +// - API client, which just makes requests to PDS (or other remote service) +// - client for shared network cache (eg, Redis) +type Directory interface { + LookupHandle(ctx context.Context, h syntax.Handle) (*Identity, error) + LookupDID(ctx context.Context, d syntax.DID) (*Identity, error) + Lookup(ctx context.Context, i syntax.AtIdentifier) (*Identity, error) + + // Flushes any cache of the indicated identifier. 
If directory is not using caching, can ignore this. + Purge(ctx context.Context, i syntax.AtIdentifier) error +} + +// Indicates that handle resolution failed. A wrapped error may provide more context. This is only returned when looking up a handle, not when looking up a DID. +var ErrHandleResolutionFailed = errors.New("handle resolution failed") + +// Indicates that resolution process completed successfully, but handle does not exist. This is only returned when looking up a handle, not when looking up a DID. +var ErrHandleNotFound = errors.New("handle not found") + +// Indicates that resolution process completed successfully, handle mapped to a different DID. This is only returned when looking up a handle, not when looking up a DID. +var ErrHandleMismatch = errors.New("handle/DID mismatch") + +// Indicates that DID document did not include any handle ("alsoKnownAs"). This is only returned when looking up a handle, not when looking up a DID. +var ErrHandleNotDeclared = errors.New("DID document did not declare a handle") + +// Handle top-level domain (TLD) is one of the special "Reserved" suffixes, and not allowed for atproto use +var ErrHandleReservedTLD = errors.New("handle top-level domain is disallowed") + +// Indicates that resolution process completed successfully, but the DID does not exist. +var ErrDIDNotFound = errors.New("DID not found") + +// Indicates that DID resolution process failed. A wrapped error may provide more context. +var ErrDIDResolutionFailed = errors.New("DID resolution failed") + +// Indicates that DID document did not include a public key with the specified ID +var ErrKeyNotDeclared = errors.New("DID document did not declare a relevant public key") + +var DefaultPLCURL = "https://plc.directory" + +// Returns a reasonable Directory implementation for applications +func DefaultDirectory() Directory { + base := BaseDirectory{ + PLCURL: DefaultPLCURL, + HTTPClient: http.Client{ + Timeout: time.Second * 10, + Transport: &http.Transport{ + // would want this around 100ms for services doing lots of handle resolution. Impacts PLC connections as well, but not too bad. + IdleConnTimeout: time.Millisecond * 1000, + MaxIdleConns: 100, + }, + }, + Resolver: net.Resolver{ + Dial: func(ctx context.Context, network, address string) (net.Conn, error) { + d := net.Dialer{Timeout: time.Second * 3} + return d.DialContext(ctx, network, address) + }, + }, + TryAuthoritativeDNS: true, + // primary Bluesky PDS instance only supports HTTP resolution method + SkipDNSDomainSuffixes: []string{".bsky.social"}, + } + cached := NewCacheDirectory(&base, 250_000, time.Hour*24, time.Minute*2, time.Minute*5) + return &cached +} diff --git a/atproto/identity/identity.go b/atproto/identity/identity.go index 02e66f22c..9779ca0d9 100644 --- a/atproto/identity/identity.go +++ b/atproto/identity/identity.go @@ -1,14 +1,9 @@ package identity import ( - "context" - "errors" "fmt" - "net" - "net/http" "net/url" "strings" - "time" "github.com/bluesky-social/indigo/atproto/crypto" "github.com/bluesky-social/indigo/atproto/syntax" @@ -16,71 +11,6 @@ import ( "github.com/mr-tron/base58" ) -// API for doing account lookups by DID or handle, with bi-directional verification handled automatically. Almost all atproto services and clients should use an implementation of this interface instead of resolving handles or DIDs separately -// -// Handles which fail to resolve, or don't match DID alsoKnownAs, are an error. 
DIDs which resolve but the handle does not resolve back to the DID return an Identity where the Handle is the special `handle.invalid` value. -// -// Some example implementations of this interface could be: -// - basic direct resolution on every call -// - local in-memory caching layer to reduce network hits -// - API client, which just makes requests to PDS (or other remote service) -// - client for shared network cache (eg, Redis) -type Directory interface { - LookupHandle(ctx context.Context, h syntax.Handle) (*Identity, error) - LookupDID(ctx context.Context, d syntax.DID) (*Identity, error) - Lookup(ctx context.Context, i syntax.AtIdentifier) (*Identity, error) - - // Flushes any cache of the indicated identifier. If directory is not using caching, can ignore this. - Purge(ctx context.Context, i syntax.AtIdentifier) error -} - -// Indicates that handle resolution failed. A wrapped error may provide more context. This is only returned when looking up a handle, not when looking up a DID. -var ErrHandleResolutionFailed = errors.New("handle resolution failed") - -// Indicates that resolution process completed successfully, but handle does not exist. This is only returned when looking up a handle, not when looking up a DID. -var ErrHandleNotFound = errors.New("handle not found") - -// Indicates that resolution process completed successfully, handle mapped to a different DID. This is only returned when looking up a handle, not when looking up a DID. -var ErrHandleMismatch = errors.New("handle/DID mismatch") - -// Indicates that DID document did not include any handle ("alsoKnownAs"). This is only returned when looking up a handle, not when looking up a DID. -var ErrHandleNotDeclared = errors.New("DID document did not declare a handle") - -// Handle top-level domain (TLD) is one of the special "Reserved" suffixes, and not allowed for atproto use -var ErrHandleReservedTLD = errors.New("handle top-level domain is disallowed") - -// Indicates that resolution process completed successfully, but the DID does not exist. -var ErrDIDNotFound = errors.New("DID not found") - -// Indicates that DID resolution process failed. A wrapped error may provide more context. -var ErrDIDResolutionFailed = errors.New("DID resolution failed") - -// Indicates that DID document did not include a public key with the specified ID -var ErrKeyNotDeclared = errors.New("DID document did not declare a relevant public key") - -var DefaultPLCURL = "https://plc.directory" - -// Returns a reasonable Directory implementation for applications -func DefaultDirectory() Directory { - base := BaseDirectory{ - PLCURL: DefaultPLCURL, - HTTPClient: http.Client{ - Timeout: time.Second * 15, - }, - Resolver: net.Resolver{ - Dial: func(ctx context.Context, network, address string) (net.Conn, error) { - d := net.Dialer{Timeout: time.Second * 5} - return d.DialContext(ctx, network, address) - }, - }, - TryAuthoritativeDNS: true, - // primary Bluesky PDS instance only supports HTTP resolution method - SkipDNSDomainSuffixes: []string{".bsky.social"}, - } - cached := NewCacheDirectory(&base, 250_000, time.Hour*24, time.Minute*2, time.Minute*5) - return &cached -} - // Represents an atproto identity. 
Could be a regular user account, or a service account (eg, feed generator)
 type Identity struct {
 	DID syntax.DID
diff --git a/atproto/lexicon/language.go b/atproto/lexicon/language.go
index 734d0031e..1eb5bdf0e 100644
--- a/atproto/lexicon/language.go
+++ b/atproto/lexicon/language.go
@@ -296,7 +296,7 @@ type SchemaRecord struct {
 
 func (s *SchemaRecord) CheckSchema() error {
 	switch s.Key {
-	case "tid", "any":
+	case "tid", "nsid", "any":
 		// pass
 	default:
 		if !strings.HasPrefix(s.Key, "literal:") {
diff --git a/atproto/syntax/cmd/atp-syntax/main.go b/atproto/syntax/cmd/atp-syntax/main.go
index d208b367a..e1410e910 100644
--- a/atproto/syntax/cmd/atp-syntax/main.go
+++ b/atproto/syntax/cmd/atp-syntax/main.go
@@ -22,6 +22,12 @@ func main() {
 			ArgsUsage: "",
 			Action:    runParseTID,
 		},
+		&cli.Command{
+			Name:      "parse-did",
+			Usage:     "parse a DID",
+			ArgsUsage: "",
+			Action:    runParseDID,
+		},
 	}
 	h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})
 	slog.SetDefault(slog.New(h))
@@ -43,3 +49,18 @@ func runParseTID(cctx *cli.Context) error {
 
 	return nil
 }
+
+func runParseDID(cctx *cli.Context) error {
+	s := cctx.Args().First()
+	if s == "" {
+		return fmt.Errorf("need to provide identifier as an argument")
+	}
+
+	did, err := syntax.ParseDID(s)
+	if err != nil {
+		return err
+	}
+	fmt.Printf("%s\n", did)
+
+	return nil
+}
diff --git a/atproto/syntax/did.go b/atproto/syntax/did.go
index ef9c0e04f..ffa4fc226 100644
--- a/atproto/syntax/did.go
+++ b/atproto/syntax/did.go
@@ -14,8 +14,30 @@ import (
 type DID string
 
 var didRegex = regexp.MustCompile(`^did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]$`)
+var plcChars = ""
+
+func isASCIIAlphaNum(c rune) bool {
+	if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') {
+		return true
+	}
+	return false
+}
 
 func ParseDID(raw string) (DID, error) {
+	// fast-path for did:plc, avoiding regex
+	if len(raw) == 32 && strings.HasPrefix(raw, "did:plc:") {
+		// NOTE: this doesn't really check base32, just broader alphanumeric. might pass invalid PLC DIDs, but they still have overall valid DID syntax
+		isPlc := true
+		for _, c := range raw[8:32] {
+			if !isASCIIAlphaNum(c) {
+				isPlc = false
+				break
+			}
+		}
+		if isPlc {
+			return DID(raw), nil
+		}
+	}
 	if raw == "" {
 		return "", errors.New("expected DID, got empty string")
 	}
diff --git a/atproto/syntax/path.go b/atproto/syntax/path.go
new file mode 100644
index 000000000..310bee17b
--- /dev/null
+++ b/atproto/syntax/path.go
@@ -0,0 +1,26 @@
+package syntax
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+)
+
+// Parses an atproto repo path string into "collection" (NSID) and record key parts.
+// +// Does not return partial success: either both collection and record key are complete (and error is nil), or both are empty string (and error is not nil) +func ParseRepoPath(raw string) (NSID, RecordKey, error) { + parts := strings.SplitN(raw, "/", 3) + if len(parts) != 2 { + return "", "", errors.New("expected path to have two parts, separated by single slash") + } + nsid, err := ParseNSID(parts[0]) + if err != nil { + return "", "", fmt.Errorf("collection part of path not a valid NSID: %w", err) + } + rkey, err := ParseRecordKey(parts[1]) + if err != nil { + return "", "", fmt.Errorf("record key part of path not valid: %w", err) + } + return nsid, rkey, nil +} diff --git a/atproto/syntax/path_test.go b/atproto/syntax/path_test.go new file mode 100644 index 000000000..e55f94fa7 --- /dev/null +++ b/atproto/syntax/path_test.go @@ -0,0 +1,41 @@ +package syntax + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestRepoPath(t *testing.T) { + assert := assert.New(t) + + testValid := [][]string{ + {"app.bsky.feed.post/asdf", "app.bsky.feed.post", "asdf"}, + } + + testErr := []string{ + "", + "/", + "/app.bsky.feed.post/asdf", + "/asdf", + "./app.bsky.feed.post", + "blob/asdf", + "app.bsky.feed.post/", + "app.bsky.feed.post/.", + "app.bsky.feed.post/!", + } + + for _, parts := range testValid { + nsid, rkey, err := ParseRepoPath(parts[0]) + assert.NoError(err) + assert.Equal(parts[1], nsid.String()) + assert.Equal(parts[2], rkey.String()) + } + + for _, raw := range testErr { + nsid, rkey, err := ParseRepoPath(raw) + assert.Error(err) + assert.Equal("", nsid.String()) + assert.Equal("", rkey.String()) + } +} diff --git a/automod/consumer/firehose.go b/automod/consumer/firehose.go index f210b3055..91470cc43 100644 --- a/automod/consumer/firehose.go +++ b/automod/consumer/firehose.go @@ -116,7 +116,7 @@ func (fc *FirehoseConsumer) Run(ctx context.Context) error { fc.Logger.Info("hepa scheduler configured", "scheduler", "autoscaling", "initial", scaleSettings.Concurrency, "max", scaleSettings.MaxConcurrency) } - return events.HandleRepoStream(ctx, con, scheduler) + return events.HandleRepoStream(ctx, con, scheduler, fc.Logger) } // NOTE: for now, this function basically never errors, just logs and returns nil. Should think through error processing better. 
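The path.go and path_test.go additions above introduce syntax.ParseRepoPath for splitting a "collection/rkey" repo path. A minimal usage sketch (not part of this patch; the input path is made up for illustration):

package main

import (
	"fmt"

	"github.com/bluesky-social/indigo/atproto/syntax"
)

func main() {
	// a repo path is "<collection NSID>/<record key>", eg the path component of an at:// URI
	nsid, rkey, err := syntax.ParseRepoPath("app.bsky.feed.post/3kabc123xyz")
	if err != nil {
		fmt.Println("invalid repo path:", err)
		return
	}
	fmt.Println("collection:", nsid.String(), "record key:", rkey.String())
}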
diff --git a/automod/engine/circuit_breaker_test.go b/automod/engine/circuit_breaker_test.go index 43c38393a..dee3dcc52 100644 --- a/automod/engine/circuit_breaker_test.go +++ b/automod/engine/circuit_breaker_test.go @@ -44,7 +44,7 @@ func TestTakedownCircuitBreaker(t *testing.T) { p1cbor := p1buf.Bytes() // generate double the quote of events; expect to only count the quote worth of actions - for i := 0; i < 2*QuotaModTakedownDay; i++ { + for i := 0; i < 2*eng.Config.QuotaModTakedownDay; i++ { ident := identity.Identity{ DID: syntax.DID(fmt.Sprintf("did:plc:abc%d", i)), Handle: syntax.Handle("handle.example.com"), @@ -63,7 +63,7 @@ func TestTakedownCircuitBreaker(t *testing.T) { takedowns, err := eng.Counters.GetCount(ctx, "automod-quota", "takedown", countstore.PeriodDay) assert.NoError(err) - assert.Equal(QuotaModTakedownDay, takedowns) + assert.Equal(eng.Config.QuotaModTakedownDay, takedowns) reports, err := eng.Counters.GetCount(ctx, "automod-quota", "report", countstore.PeriodDay) assert.NoError(err) @@ -89,7 +89,7 @@ func TestReportCircuitBreaker(t *testing.T) { p1cbor := p1buf.Bytes() // generate double the quota of events; expect to only count the quota worth of actions - for i := 0; i < 2*QuotaModReportDay; i++ { + for i := 0; i < 2*eng.Config.QuotaModReportDay; i++ { ident := identity.Identity{ DID: syntax.DID(fmt.Sprintf("did:plc:abc%d", i)), Handle: syntax.Handle("handle.example.com"), @@ -112,5 +112,5 @@ func TestReportCircuitBreaker(t *testing.T) { reports, err := eng.Counters.GetCount(ctx, "automod-quota", "report", countstore.PeriodDay) assert.NoError(err) - assert.Equal(QuotaModReportDay, reports) + assert.Equal(eng.Config.QuotaModReportDay, reports) } diff --git a/automod/engine/effects.go b/automod/engine/effects.go index a318615cc..10bbc1fb0 100644 --- a/automod/engine/effects.go +++ b/automod/engine/effects.go @@ -2,18 +2,6 @@ package engine import ( "sync" - "time" -) - -var ( - // time period within which automod will not re-report an account for the same reasonType - ReportDupePeriod = 1 * 24 * time.Hour - // number of reports automod can file per day, for all subjects and types combined (circuit breaker) - QuotaModReportDay = 2000 - // number of takedowns automod can action per day, for all subjects combined (circuit breaker) - QuotaModTakedownDay = 200 - // number of misc actions automod can do per day, for all subjects combined (circuit breaker) - QuotaModActionDay = 1000 ) type CounterRef struct { diff --git a/automod/engine/engine.go b/automod/engine/engine.go index b313a4492..933987072 100644 --- a/automod/engine/engine.go +++ b/automod/engine/engine.go @@ -52,6 +52,14 @@ type Engine struct { type EngineConfig struct { // if enabled, account metadata is not hydrated for every event by default SkipAccountMeta bool + // time period within which automod will not re-report an account for the same reasonType + ReportDupePeriod time.Duration + // number of reports automod can file per day, for all subjects and types combined (circuit breaker) + QuotaModReportDay int + // number of takedowns automod can action per day, for all subjects combined (circuit breaker) + QuotaModTakedownDay int + // number of misc actions automod can do per day, for all subjects combined (circuit breaker) + QuotaModActionDay int } // Entrypoint for external code pushing #identity events in to the engine. 
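With the circuit-breaker quotas moved from package-level vars in effects.go to EngineConfig fields, each automod deployment can tune them. A sketch of a config that reuses the old package-level defaults (illustrative only, not part of the patch; zero values fall back to the hard-coded defaults in the persisthelpers.go hunks below):

package main

import (
	"fmt"
	"time"

	"github.com/bluesky-social/indigo/automod/engine"
)

func main() {
	// same values as the removed package-level defaults in effects.go
	cfg := engine.EngineConfig{
		ReportDupePeriod:    1 * 24 * time.Hour,
		QuotaModReportDay:   2000,
		QuotaModTakedownDay: 200,
		QuotaModActionDay:   1000,
	}
	fmt.Printf("%+v\n", cfg)
}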
diff --git a/automod/engine/fetch_account_meta.go b/automod/engine/fetch_account_meta.go index 5c55b417f..39b4df1eb 100644 --- a/automod/engine/fetch_account_meta.go +++ b/automod/engine/fetch_account_meta.go @@ -139,14 +139,16 @@ func (e *Engine) GetAccountMeta(ctx context.Context, ident *identity.Identity) ( ap.AccountTags = dedupeStrings(rd.Moderation.SubjectStatus.Tags) if rd.Moderation.SubjectStatus.ReviewState != nil { switch *rd.Moderation.SubjectStatus.ReviewState { - case "#reviewOpen": + case "tools.ozone.moderation.defs#reviewOpen": ap.ReviewState = ReviewStateOpen - case "#reviewEscalated": + case "tools.ozone.moderation.defs#reviewEscalated": ap.ReviewState = ReviewStateEscalated - case "#reviewClosed": + case "tools.ozone.moderation.defs#reviewClosed": ap.ReviewState = ReviewStateClosed - case "#reviewNonde": + case "tools.ozone.moderation.defs#reviewNone": ap.ReviewState = ReviewStateNone + default: + logger.Warn("unexpected ozone moderation review state", "state", rd.Moderation.SubjectStatus.ReviewState, "did", ident.DID) } } } diff --git a/automod/engine/persisthelpers.go b/automod/engine/persisthelpers.go index 42ba8934e..906422ab9 100644 --- a/automod/engine/persisthelpers.go +++ b/automod/engine/persisthelpers.go @@ -98,7 +98,12 @@ func (eng *Engine) circuitBreakReports(ctx context.Context, reports []ModReport) if err != nil { return nil, fmt.Errorf("checking report action quota: %w", err) } - if c >= QuotaModReportDay { + + quotaModReportDay := eng.Config.QuotaModReportDay + if quotaModReportDay == 0 { + quotaModReportDay = 10000 + } + if c >= quotaModReportDay { eng.Logger.Warn("CIRCUIT BREAKER: automod reports") return []ModReport{}, nil } @@ -117,7 +122,11 @@ func (eng *Engine) circuitBreakTakedown(ctx context.Context, takedown bool) (boo if err != nil { return false, fmt.Errorf("checking takedown action quota: %w", err) } - if c >= QuotaModTakedownDay { + quotaModTakedownDay := eng.Config.QuotaModTakedownDay + if quotaModTakedownDay == 0 { + quotaModTakedownDay = 200 + } + if c >= quotaModTakedownDay { eng.Logger.Warn("CIRCUIT BREAKER: automod takedowns") return false, nil } @@ -137,7 +146,11 @@ func (eng *Engine) circuitBreakModAction(ctx context.Context, action bool) (bool if err != nil { return false, fmt.Errorf("checking mod action quota: %w", err) } - if c >= QuotaModActionDay { + quotaModActionDay := eng.Config.QuotaModActionDay + if quotaModActionDay == 0 { + quotaModActionDay = 2000 + } + if c >= quotaModActionDay { eng.Logger.Warn("CIRCUIT BREAKER: automod action") return false, nil } @@ -191,7 +204,11 @@ func (eng *Engine) createReportIfFresh(ctx context.Context, xrpcc *xrpc.Client, if err != nil { return false, err } - if time.Since(created.Time()) > ReportDupePeriod { + reportDupePeriod := eng.Config.ReportDupePeriod + if reportDupePeriod == 0 { + reportDupePeriod = 1 * 24 * time.Hour + } + if time.Since(created.Time()) > reportDupePeriod { continue } @@ -267,7 +284,11 @@ func (eng *Engine) createRecordReportIfFresh(ctx context.Context, xrpcc *xrpc.Cl if err != nil { return false, err } - if time.Since(created.Time()) > ReportDupePeriod { + reportDupePeriod := eng.Config.ReportDupePeriod + if reportDupePeriod == 0 { + reportDupePeriod = 1 * 24 * time.Hour + } + if time.Since(created.Time()) > reportDupePeriod { continue } diff --git a/automod/engine/slack.go b/automod/engine/slack.go index ae7c00d20..fe21abb72 100644 --- a/automod/engine/slack.go +++ b/automod/engine/slack.go @@ -86,7 +86,7 @@ func slackBody(header string, acct AccountMeta, newLabels, 
newFlags []string, ne msg += fmt.Sprintf("Report `%s`: %s\n", rep.ReasonType, rep.Comment) } if newTakedown { - msg += fmt.Sprintf("Takedown!\n") + msg += "Takedown!\n" } return msg } diff --git a/automod/flagstore/flagstore_mem.go b/automod/flagstore/flagstore_mem.go index 4f128075a..bc068475c 100644 --- a/automod/flagstore/flagstore_mem.go +++ b/automod/flagstore/flagstore_mem.go @@ -27,9 +27,7 @@ func (s MemFlagStore) Add(ctx context.Context, key string, flags []string) error if !ok { v = []string{} } - for _, f := range flags { - v = append(v, f) - } + v = append(v, flags...) v = dedupeStrings(v) s.Data[key] = v return nil diff --git a/automod/helpers/bsky.go b/automod/helpers/bsky.go index c7416f2dd..a38ecda92 100644 --- a/automod/helpers/bsky.go +++ b/automod/helpers/bsky.go @@ -11,9 +11,7 @@ import ( func ExtractHashtagsPost(post *appbsky.FeedPost) []string { var tags []string - for _, tag := range post.Tags { - tags = append(tags, tag) - } + tags = append(tags, post.Tags...) for _, facet := range post.Facets { for _, feat := range facet.Features { if feat.RichtextFacet_Tag != nil { diff --git a/automod/rules/harassment.go b/automod/rules/harassment.go index 2cf7ce194..e38e7c44a 100644 --- a/automod/rules/harassment.go +++ b/automod/rules/harassment.go @@ -130,7 +130,7 @@ func HarassmentTrivialPostRule(c *automod.RecordContext, post *appbsky.FeedPost) if count > 5 { //c.AddRecordFlag("trivial-harassing-post") - c.ReportAccount(automod.ReportReasonOther, fmt.Sprintf("possible targetted harassment (also labeled; remove label if this isn't harassment!)")) + c.ReportAccount(automod.ReportReasonOther, "possible targetted harassment (also labeled; remove label if this isn't harassment!)") c.AddAccountLabel("!hide") c.Notify("slack") } diff --git a/automod/rules/nostr.go b/automod/rules/nostr.go index 5f91e7ee6..6ad623ae3 100644 --- a/automod/rules/nostr.go +++ b/automod/rules/nostr.go @@ -1,7 +1,6 @@ package rules import ( - "fmt" "strings" "time" @@ -37,7 +36,7 @@ func NostrSpamPostRule(c *automod.RecordContext, post *appbsky.FeedPost) error { return nil } - c.ReportAccount(automod.ReportReasonOther, fmt.Sprintf("likely nostr spam account (also labeled; remove label if this isn't spam!)")) + c.ReportAccount(automod.ReportReasonOther, "likely nostr spam account (also labeled; remove label if this isn't spam!)") c.AddAccountLabel("!hide") c.Notify("slack") return nil diff --git a/automod/rules/promo.go b/automod/rules/promo.go index f6fe23a24..001dbc715 100644 --- a/automod/rules/promo.go +++ b/automod/rules/promo.go @@ -1,7 +1,6 @@ package rules import ( - "fmt" "net/url" "strings" "time" @@ -54,7 +53,7 @@ func AggressivePromotionRule(c *automod.RecordContext, post *appbsky.FeedPost) e uniqueReplies := c.GetCountDistinct("reply-to", did, countstore.PeriodDay) if uniqueReplies >= 10 { c.AddAccountFlag("promo-multi-reply") - c.ReportAccount(automod.ReportReasonSpam, fmt.Sprintf("possible aggressive self-promotion")) + c.ReportAccount(automod.ReportReasonSpam, "possible aggressive self-promotion") c.Notify("slack") } diff --git a/automod/rules/quick.go b/automod/rules/quick.go index ea6a69e36..127914c7a 100644 --- a/automod/rules/quick.go +++ b/automod/rules/quick.go @@ -29,7 +29,7 @@ func BotLinkProfileRule(c *automod.RecordContext, profile *appbsky.ActorProfile) } if strings.Contains(*profile.Description, "🏈🍕🌀") { c.AddAccountFlag("profile-bot-string") - c.ReportAccount(automod.ReportReasonSpam, fmt.Sprintf("possible bot based on string in profile")) + 
c.ReportAccount(automod.ReportReasonSpam, "possible bot based on string in profile") c.Notify("slack") return nil } @@ -89,7 +89,7 @@ func TrivialSpamPostRule(c *automod.RecordContext, post *appbsky.FeedPost) error return nil } - c.ReportAccount(automod.ReportReasonOther, fmt.Sprintf("trivial spam account (also labeled; remove label if this isn't spam!)")) + c.ReportAccount(automod.ReportReasonOther, "trivial spam account (also labeled; remove label if this isn't spam!)") c.AddAccountLabel("!hide") c.Notify("slack") return nil diff --git a/backfill/backfill.go b/backfill/backfill.go index 6f199677c..8d1a146b2 100644 --- a/backfill/backfill.go +++ b/backfill/backfill.go @@ -12,8 +12,11 @@ import ( "time" "github.com/bluesky-social/indigo/api/atproto" + "github.com/bluesky-social/indigo/atproto/identity" + "github.com/bluesky-social/indigo/atproto/syntax" "github.com/bluesky-social/indigo/repo" "github.com/bluesky-social/indigo/repomgr" + "github.com/ipfs/go-cid" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel" @@ -71,8 +74,8 @@ type Backfiller struct { ParallelRecordCreates int // Prefix match for records to backfill i.e. app.bsky.feed.app/ // If empty, all records will be backfilled - NSIDFilter string - CheckoutPath string + NSIDFilter string + RelayHost string syncLimiter *rate.Limiter @@ -80,6 +83,8 @@ type Backfiller struct { magicHeaderVal string stop chan chan struct{} + + Directory identity.Directory } var ( @@ -110,7 +115,7 @@ type BackfillOptions struct { ParallelRecordCreates int NSIDFilter string SyncRequestsPerSecond int - CheckoutPath string + RelayHost string } func DefaultBackfillOptions() *BackfillOptions { @@ -119,7 +124,7 @@ func DefaultBackfillOptions() *BackfillOptions { ParallelRecordCreates: 100, NSIDFilter: "", SyncRequestsPerSecond: 2, - CheckoutPath: "https://bsky.network/xrpc/com.atproto.sync.getRepo", + RelayHost: "https://bsky.network", } } @@ -135,6 +140,14 @@ func NewBackfiller( if opts == nil { opts = DefaultBackfillOptions() } + + // Convert wss:// or ws:// to https:// or http:// + if strings.HasPrefix(opts.RelayHost, "wss://") { + opts.RelayHost = "https://" + opts.RelayHost[6:] + } else if strings.HasPrefix(opts.RelayHost, "ws://") { + opts.RelayHost = "http://" + opts.RelayHost[5:] + } + return &Backfiller{ Name: name, Store: store, @@ -145,8 +158,9 @@ func NewBackfiller( ParallelRecordCreates: opts.ParallelRecordCreates, NSIDFilter: opts.NSIDFilter, syncLimiter: rate.NewLimiter(rate.Limit(opts.SyncRequestsPerSecond), 1), - CheckoutPath: opts.CheckoutPath, + RelayHost: opts.RelayHost, stop: make(chan chan struct{}, 1), + Directory: identity.DefaultDirectory(), } } @@ -292,25 +306,12 @@ type recordResult struct { err error } -// BackfillRepo backfills a repo -func (b *Backfiller) BackfillRepo(ctx context.Context, job Job) (string, error) { - ctx, span := tracer.Start(ctx, "BackfillRepo") - defer span.End() +// Fetches a repo CAR file over HTTP from the indicated host. 
If successful, parses the CAR and returns repo.Repo +func (b *Backfiller) fetchRepo(ctx context.Context, did, since, host string) (*repo.Repo, error) { + url := fmt.Sprintf("%s/xrpc/com.atproto.sync.getRepo?did=%s", host, did) - start := time.Now() - - repoDid := job.Repo() - - log := slog.With("source", "backfiller_backfill_repo", "repo", repoDid) - if job.RetryCount() > 0 { - log = log.With("retry_count", job.RetryCount()) - } - log.Info(fmt.Sprintf("processing backfill for %s", repoDid)) - - url := fmt.Sprintf("%s?did=%s", b.CheckoutPath, repoDid) - - if job.Rev() != "" { - url = url + fmt.Sprintf("&since=%s", job.Rev()) + if since != "" { + url = url + fmt.Sprintf("&since=%s", since) } // GET and CAR decode the body @@ -320,8 +321,7 @@ func (b *Backfiller) BackfillRepo(ctx context.Context, job Job) (string, error) } req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { - state := fmt.Sprintf("failed (create request: %s)", err.Error()) - return state, fmt.Errorf("failed to create request: %w", err) + return nil, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Accept", "application/vnd.ipld.car") @@ -334,8 +334,7 @@ func (b *Backfiller) BackfillRepo(ctx context.Context, job Job) (string, error) resp, err := client.Do(req) if err != nil { - state := fmt.Sprintf("failed (do request: %s)", err.Error()) - return state, fmt.Errorf("failed to send request: %w", err) + return nil, fmt.Errorf("failed to send request: %w", err) } if resp.StatusCode != http.StatusOK { @@ -345,8 +344,7 @@ func (b *Backfiller) BackfillRepo(ctx context.Context, job Job) (string, error) } else { reason = resp.Status } - state := fmt.Sprintf("failed (%s)", reason) - return state, fmt.Errorf("failed to get repo: %s", reason) + return nil, fmt.Errorf("failed to get repo: %s", reason) } instrumentedReader := instrumentedReader{ @@ -356,10 +354,47 @@ func (b *Backfiller) BackfillRepo(ctx context.Context, job Job) (string, error) defer instrumentedReader.Close() - r, err := repo.ReadRepoFromCar(ctx, instrumentedReader) + repo, err := repo.ReadRepoFromCar(ctx, instrumentedReader) + if err != nil { + return nil, fmt.Errorf("failed to parse repo from CAR file: %w", err) + } + return repo, nil +} + +// BackfillRepo backfills a repo +func (b *Backfiller) BackfillRepo(ctx context.Context, job Job) (string, error) { + ctx, span := tracer.Start(ctx, "BackfillRepo") + defer span.End() + + start := time.Now() + + repoDID := job.Repo() + + log := slog.With("source", "backfiller_backfill_repo", "repo", repoDID) + if job.RetryCount() > 0 { + log = log.With("retry_count", job.RetryCount()) + } + log.Info(fmt.Sprintf("processing backfill for %s", repoDID)) + + // first try with Relay endpoint + r, err := b.fetchRepo(ctx, repoDID, job.Rev(), b.RelayHost) if err != nil { - state := "failed (couldn't read repo CAR from response body)" - return state, fmt.Errorf("failed to read repo from car: %w", err) + slog.Warn("repo CAR fetch from relay failed", "did", repoDID, "since", job.Rev(), "relayHost", b.RelayHost, "err", err) + // fallback to direct PDS fetch + ident, err := b.Directory.LookupDID(ctx, syntax.DID(repoDID)) + if err != nil { + return "failed resolving DID to PDS repo", fmt.Errorf("resolving DID for PDS repo fetch: %w", err) + } + pdsHost := ident.PDSEndpoint() + if pdsHost == "" { + return "DID document missing PDS endpoint", fmt.Errorf("no PDS endpoint for DID: %s", repoDID) + } + r, err = b.fetchRepo(ctx, repoDID, job.Rev(), pdsHost) + if err != nil { + slog.Warn("repo CAR fetch from PDS 
failed", "did", repoDID, "since", job.Rev(), "pdsHost", pdsHost, "err", err) + return "repo CAR fetch from PDS failed", err + } + slog.Info("repo CAR fetch from PDS successful", "did", repoDID, "since", job.Rev(), "pdsHost", pdsHost, "err", err) } numRecords := 0 @@ -396,7 +431,7 @@ func (b *Backfiller) BackfillRepo(ctx context.Context, job Job) (string, error) raw := blk.RawData() - err = b.HandleCreateRecord(ctx, repoDid, rev, item.recordPath, &raw, &item.nodeCid) + err = b.HandleCreateRecord(ctx, repoDID, rev, item.recordPath, &raw, &item.nodeCid) if err != nil { recordResults <- recordResult{recordPath: item.recordPath, err: fmt.Errorf("failed to handle create record: %w", err)} continue diff --git a/bgs/admin.go b/bgs/admin.go index b2f385cd5..44b07cd8f 100644 --- a/bgs/admin.go +++ b/bgs/admin.go @@ -484,7 +484,7 @@ func (bgs *BGS) handleAdminPostResyncPDS(e echo.Context) error { ctx := context.Background() err := bgs.ResyncPDS(ctx, pds) if err != nil { - log.Errorw("failed to resync PDS", "err", err, "pds", pds.Host) + log.Error("failed to resync PDS", "err", err, "pds", pds.Host) } }() diff --git a/bgs/bgs.go b/bgs/bgs.go index 35dfab9d9..119e63f62 100644 --- a/bgs/bgs.go +++ b/bgs/bgs.go @@ -6,10 +6,12 @@ import ( "encoding/json" "errors" "fmt" + "log/slog" "net" "net/http" _ "net/http/pprof" "net/url" + "reflect" "strconv" "strings" "sync" @@ -27,13 +29,13 @@ import ( "github.com/bluesky-social/indigo/models" "github.com/bluesky-social/indigo/repomgr" "github.com/bluesky-social/indigo/xrpc" + lru "github.com/hashicorp/golang-lru/v2" "golang.org/x/sync/semaphore" "golang.org/x/time/rate" "github.com/gorilla/websocket" "github.com/ipfs/go-cid" ipld "github.com/ipfs/go-ipld-format" - logging "github.com/ipfs/go-log" "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" promclient "github.com/prometheus/client_golang/prometheus" @@ -44,7 +46,6 @@ import ( "gorm.io/gorm" ) -var log = logging.Logger("bgs") var tracer = otel.Tracer("bgs") // serverListenerBootTimeout is how long to wait for the requested server socket @@ -87,6 +88,15 @@ type BGS struct { // Management of Compaction compactor *Compactor + + // User cache + userCache *lru.Cache[string, *User] + + // nextCrawlers gets forwarded POST /xrpc/com.atproto.sync.requestCrawl + nextCrawlers []*url.URL + httpClient http.Client + + log *slog.Logger } type PDSResync struct { @@ -113,6 +123,9 @@ type BGSConfig struct { ConcurrencyPerPDS int64 MaxQueuePerPDS int64 NumCompactionWorkers int + + // NextCrawlers gets forwarded POST /xrpc/com.atproto.sync.requestCrawl + NextCrawlers []*url.URL } func DefaultBGSConfig() *BGSConfig { @@ -136,6 +149,8 @@ func NewBGS(db *gorm.DB, ix *indexer.Indexer, repoman *repomgr.RepoManager, evtm db.AutoMigrate(models.PDS{}) db.AutoMigrate(models.DomainBan{}) + uc, _ := lru.New[string, *User](1_000_000) + bgs := &BGS{ Index: ix, db: db, @@ -151,6 +166,10 @@ func NewBGS(db *gorm.DB, ix *indexer.Indexer, repoman *repomgr.RepoManager, evtm consumers: make(map[uint64]*SocketConsumer), pdsResyncs: make(map[uint]*PDSResync), + + userCache: uc, + + log: slog.Default().With("system", "bgs"), } ix.CreateExternalUser = bgs.createExternalUser @@ -177,6 +196,9 @@ func NewBGS(db *gorm.DB, ix *indexer.Indexer, repoman *repomgr.RepoManager, evtm compactor.Start(bgs) bgs.compactor = compactor + bgs.nextCrawlers = config.NextCrawlers + bgs.httpClient.Timeout = time.Second * 5 + return bgs, nil } @@ -226,13 +248,13 @@ func (bgs *BGS) StartDebug(listen string) error { act, err := 
bgs.Index.GetUserOrMissing(ctx, did) if err != nil { w.WriteHeader(500) - log.Errorf("failed to get user: %s", err) + bgs.log.Error("failed to get user", "err", err) return } if err := bgs.Index.Crawler.Crawl(ctx, act); err != nil { w.WriteHeader(500) - log.Errorf("failed to add user to crawler: %s", err) + bgs.log.Error("failed to add user to crawler", "err", err) return } }) @@ -317,7 +339,7 @@ func (bgs *BGS) StartWithListener(listen net.Listener) error { if err2 := ctx.JSON(err.Code, map[string]any{ "error": err.Message, }); err2 != nil { - log.Errorf("Failed to write http error: %s", err2) + bgs.log.Error("Failed to write http error", "err", err2) } default: sendHeader := true @@ -325,7 +347,7 @@ func (bgs *BGS) StartWithListener(listen net.Listener) error { sendHeader = false } - log.Warnf("HANDLER ERROR: (%s) %s", ctx.Path(), err) + bgs.log.Warn("HANDLER ERROR: (%s) %s", ctx.Path(), err) if strings.HasPrefix(ctx.Path(), "/admin/") { ctx.JSON(500, map[string]any{ @@ -418,7 +440,7 @@ type HealthStatus struct { func (bgs *BGS) HandleHealthCheck(c echo.Context) error { if err := bgs.db.Exec("SELECT 1").Error; err != nil { - log.Errorf("healthcheck can't connect to database: %v", err) + bgs.log.Error("healthcheck can't connect to database", "err", err) return c.JSON(500, HealthStatus{Status: "error", Message: "can't connect to database"}) } else { return c.JSON(200, HealthStatus{Status: "ok"}) @@ -521,6 +543,44 @@ type User struct { // UpstreamStatus is the state of the user as reported by the upstream PDS UpstreamStatus string `gorm:"index"` + + lk sync.Mutex +} + +func (u *User) SetTakenDown(v bool) { + u.lk.Lock() + defer u.lk.Unlock() + u.TakenDown = v +} + +func (u *User) GetTakenDown() bool { + u.lk.Lock() + defer u.lk.Unlock() + return u.TakenDown +} + +func (u *User) SetTombstoned(v bool) { + u.lk.Lock() + defer u.lk.Unlock() + u.Tombstoned = v +} + +func (u *User) GetTombstoned() bool { + u.lk.Lock() + defer u.lk.Unlock() + return u.Tombstoned +} + +func (u *User) SetUpstreamStatus(v string) { + u.lk.Lock() + defer u.lk.Unlock() + u.UpstreamStatus = v +} + +func (u *User) GetUpstreamStatus() string { + u.lk.Lock() + defer u.lk.Unlock() + return u.UpstreamStatus } type addTargetBody struct { @@ -547,10 +607,10 @@ func (bgs *BGS) cleanupConsumer(id uint64) { var m = &dto.Metric{} if err := c.EventsSent.Write(m); err != nil { - log.Errorf("failed to get sent counter: %s", err) + bgs.log.Error("failed to get sent counter", "err", err) } - log.Infow("consumer disconnected", + bgs.log.Info("consumer disconnected", "consumer_id", id, "remote_addr", c.RemoteAddr, "user_agent", c.UserAgent, @@ -602,7 +662,7 @@ func (bgs *BGS) EventsHandler(c echo.Context) error { } if err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(5*time.Second)); err != nil { - log.Warnf("failed to ping client: %s", err) + bgs.log.Warn("failed to ping client: %s", err) cancel() return } @@ -627,7 +687,7 @@ func (bgs *BGS) EventsHandler(c echo.Context) error { for { _, _, err := conn.ReadMessage() if err != nil { - log.Warnf("failed to read message from client: %s", err) + bgs.log.Warn("failed to read message from client: %s", err) cancel() return } @@ -654,13 +714,13 @@ func (bgs *BGS) EventsHandler(c echo.Context) error { consumerID := bgs.registerConsumer(&consumer) defer bgs.cleanupConsumer(consumerID) - logger := log.With( + logger := bgs.log.With( "consumer_id", consumerID, "remote_addr", consumer.RemoteAddr, "user_agent", consumer.UserAgent, ) - logger.Infow("new consumer", "cursor", 
since) + logger.Info("new consumer", "cursor", since) for { select { @@ -672,7 +732,7 @@ func (bgs *BGS) EventsHandler(c echo.Context) error { wc, err := conn.NextWriter(websocket.BinaryMessage) if err != nil { - logger.Errorf("failed to get next writer: %s", err) + logger.Error("failed to get next writer", "err", err) return err } @@ -686,7 +746,7 @@ func (bgs *BGS) EventsHandler(c echo.Context) error { } if err := wc.Close(); err != nil { - logger.Warnf("failed to flush-close our event write: %s", err) + logger.Warn("failed to flush-close our event write", "err", err) return nil } @@ -707,14 +767,14 @@ func prometheusHandler() http.Handler { // defensive in case things change under the hood. registry, ok := promclient.DefaultRegisterer.(*promclient.Registry) if !ok { - log.Warnf("failed to export default prometheus registry; some metrics will be unavailable; unexpected type: %T", promclient.DefaultRegisterer) + slog.Warn("failed to export default prometheus registry; some metrics will be unavailable; unexpected type", "type", reflect.TypeOf(promclient.DefaultRegisterer)) } exporter, err := prometheus.NewExporter(prometheus.Options{ Registry: registry, Namespace: "bigsky", }) if err != nil { - log.Errorf("could not create the prometheus stats exporter: %v", err) + slog.Error("could not create the prometheus stats exporter", "err", err, "system", "bgs") } return exporter @@ -771,6 +831,11 @@ func (bgs *BGS) lookupUserByDid(ctx context.Context, did string) (*User, error) ctx, span := tracer.Start(ctx, "lookupUserByDid") defer span.End() + cu, ok := bgs.userCache.Get(did) + if ok { + return cu, nil + } + var u User if err := bgs.db.Find(&u, "did = ?", did).Error; err != nil { return nil, err @@ -780,6 +845,8 @@ func (bgs *BGS) lookupUserByDid(ctx context.Context, did string) (*User, error) return nil, gorm.ErrRecordNotFound } + bgs.userCache.Add(did, &u) + return &u, nil } @@ -822,16 +889,23 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event case env.RepoCommit != nil: repoCommitsReceivedCounter.WithLabelValues(host.Host).Add(1) evt := env.RepoCommit - log.Debugw("bgs got repo append event", "seq", evt.Seq, "pdsHost", host.Host, "repo", evt.Repo) + bgs.log.Debug("bgs got repo append event", "seq", evt.Seq, "pdsHost", host.Host, "repo", evt.Repo) + + s := time.Now() u, err := bgs.lookupUserByDid(ctx, evt.Repo) + userLookupDuration.Observe(time.Since(s).Seconds()) if err != nil { if !errors.Is(err, gorm.ErrRecordNotFound) { + repoCommitsResultCounter.WithLabelValues(host.Host, "nou").Inc() return fmt.Errorf("looking up event user: %w", err) } newUsersDiscovered.Inc() + start := time.Now() subj, err := bgs.createExternalUser(ctx, evt.Repo) + newUserDiscoveryDuration.Observe(time.Since(start).Seconds()) if err != nil { + repoCommitsResultCounter.WithLabelValues(host.Host, "uerr").Inc() return fmt.Errorf("fed event create external user: %w", err) } @@ -840,56 +914,67 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event u.Did = evt.Repo } - span.SetAttributes(attribute.String("upstream_status", u.UpstreamStatus)) + ustatus := u.GetUpstreamStatus() + span.SetAttributes(attribute.String("upstream_status", ustatus)) - if u.TakenDown || u.UpstreamStatus == events.AccountStatusTakendown { - span.SetAttributes(attribute.Bool("taken_down_by_relay_admin", u.TakenDown)) - log.Debugw("dropping commit event from taken down user", "did", evt.Repo, "seq", evt.Seq, "pdsHost", host.Host) + if u.GetTakenDown() || ustatus == 
events.AccountStatusTakendown { + span.SetAttributes(attribute.Bool("taken_down_by_relay_admin", u.GetTakenDown())) + bgs.log.Debug("dropping commit event from taken down user", "did", evt.Repo, "seq", evt.Seq, "pdsHost", host.Host) + repoCommitsResultCounter.WithLabelValues(host.Host, "tdu").Inc() return nil } - if u.UpstreamStatus == events.AccountStatusSuspended { - log.Debugw("dropping commit event from suspended user", "did", evt.Repo, "seq", evt.Seq, "pdsHost", host.Host) + if ustatus == events.AccountStatusSuspended { + bgs.log.Debug("dropping commit event from suspended user", "did", evt.Repo, "seq", evt.Seq, "pdsHost", host.Host) + repoCommitsResultCounter.WithLabelValues(host.Host, "susu").Inc() return nil } - if u.UpstreamStatus == events.AccountStatusDeactivated { - log.Debugw("dropping commit event from deactivated user", "did", evt.Repo, "seq", evt.Seq, "pdsHost", host.Host) + if ustatus == events.AccountStatusDeactivated { + bgs.log.Debug("dropping commit event from deactivated user", "did", evt.Repo, "seq", evt.Seq, "pdsHost", host.Host) + repoCommitsResultCounter.WithLabelValues(host.Host, "du").Inc() return nil } if evt.Rebase { + repoCommitsResultCounter.WithLabelValues(host.Host, "rebase").Inc() return fmt.Errorf("rebase was true in event seq:%d,host:%s", evt.Seq, host.Host) } if host.ID != u.PDS && u.PDS != 0 { - log.Warnw("received event for repo from different pds than expected", "repo", evt.Repo, "expPds", u.PDS, "gotPds", host.Host) + bgs.log.Warn("received event for repo from different pds than expected", "repo", evt.Repo, "expPds", u.PDS, "gotPds", host.Host) // Flush any cached DID documents for this user bgs.didr.FlushCacheFor(env.RepoCommit.Repo) subj, err := bgs.createExternalUser(ctx, evt.Repo) if err != nil { + repoCommitsResultCounter.WithLabelValues(host.Host, "uerr2").Inc() return err } if subj.PDS != host.ID { + repoCommitsResultCounter.WithLabelValues(host.Host, "noauth").Inc() return fmt.Errorf("event from non-authoritative pds") } } - if u.Tombstoned { + if u.GetTombstoned() { span.SetAttributes(attribute.Bool("tombstoned", true)) // we've checked the authority of the users PDS, so reinstate the account if err := bgs.db.Model(&User{}).Where("id = ?", u.ID).UpdateColumn("tombstoned", false).Error; err != nil { + repoCommitsResultCounter.WithLabelValues(host.Host, "tomb").Inc() return fmt.Errorf("failed to un-tombstone a user: %w", err) } + u.SetTombstoned(false) ai, err := bgs.Index.LookupUser(ctx, u.ID) if err != nil { + repoCommitsResultCounter.WithLabelValues(host.Host, "nou2").Inc() return fmt.Errorf("failed to look up user (tombstone recover): %w", err) } // Now a simple re-crawl should suffice to bring the user back online + repoCommitsResultCounter.WithLabelValues(host.Host, "catchupt").Inc() return bgs.Index.Crawler.AddToCatchupQueue(ctx, host, ai, evt) } @@ -898,6 +983,7 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event rebasesCounter.WithLabelValues(host.Host).Add(1) ai, err := bgs.Index.LookupUser(ctx, u.ID) if err != nil { + repoCommitsResultCounter.WithLabelValues(host.Host, "nou3").Inc() return fmt.Errorf("failed to look up user (slow path): %w", err) } @@ -909,29 +995,36 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event // processor coming off of the pds stream, we should investigate // whether or not we even need this 'slow path' logic, as it makes // accounting for which events have been processed much harder + repoCommitsResultCounter.WithLabelValues(host.Host, 
"catchup").Inc() return bgs.Index.Crawler.AddToCatchupQueue(ctx, host, ai, evt) } if err := bgs.repoman.HandleExternalUserEvent(ctx, host.ID, u.ID, u.Did, evt.Since, evt.Rev, evt.Blocks, evt.Ops); err != nil { - log.Warnw("failed handling event", "err", err, "pdsHost", host.Host, "seq", evt.Seq, "repo", u.Did, "prev", stringLink(evt.Prev), "commit", evt.Commit.String()) if errors.Is(err, carstore.ErrRepoBaseMismatch) || ipld.IsNotFound(err) { ai, lerr := bgs.Index.LookupUser(ctx, u.ID) if lerr != nil { + log.Warn("failed handling event, no user", "err", err, "pdsHost", host.Host, "seq", evt.Seq, "repo", u.Did, "prev", stringLink(evt.Prev), "commit", evt.Commit.String()) + repoCommitsResultCounter.WithLabelValues(host.Host, "nou4").Inc() return fmt.Errorf("failed to look up user %s (%d) (err case: %s): %w", u.Did, u.ID, err, lerr) } span.SetAttributes(attribute.Bool("catchup_queue", true)) + log.Info("failed handling event, catchup", "err", err, "pdsHost", host.Host, "seq", evt.Seq, "repo", u.Did, "prev", stringLink(evt.Prev), "commit", evt.Commit.String()) + repoCommitsResultCounter.WithLabelValues(host.Host, "catchup2").Inc() return bgs.Index.Crawler.AddToCatchupQueue(ctx, host, ai, evt) } + log.Warn("failed handling event", "err", err, "pdsHost", host.Host, "seq", evt.Seq, "repo", u.Did, "prev", stringLink(evt.Prev), "commit", evt.Commit.String()) + repoCommitsResultCounter.WithLabelValues(host.Host, "err").Inc() return fmt.Errorf("handle user event failed: %w", err) } + repoCommitsResultCounter.WithLabelValues(host.Host, "ok").Inc() return nil case env.RepoHandle != nil: - log.Infow("bgs got repo handle event", "did", env.RepoHandle.Did, "handle", env.RepoHandle.Handle) + bgs.log.Info("bgs got repo handle event", "did", env.RepoHandle.Did, "handle", env.RepoHandle.Handle) // Flush any cached DID documents for this user bgs.didr.FlushCacheFor(env.RepoHandle.Did) @@ -942,7 +1035,7 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event } if act.Handle.String != env.RepoHandle.Handle { - log.Warnw("handle update did not update handle to asserted value", "did", env.RepoHandle.Did, "expected", env.RepoHandle.Handle, "actual", act.Handle) + bgs.log.Warn("handle update did not update handle to asserted value", "did", env.RepoHandle.Did, "expected", env.RepoHandle.Handle, "actual", act.Handle) } // TODO: Update the ReposHandle event type to include "verified" or something @@ -956,13 +1049,13 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event }, }) if err != nil { - log.Errorw("failed to broadcast RepoHandle event", "error", err, "did", env.RepoHandle.Did, "handle", env.RepoHandle.Handle) + bgs.log.Error("failed to broadcast RepoHandle event", "error", err, "did", env.RepoHandle.Did, "handle", env.RepoHandle.Handle) return fmt.Errorf("failed to broadcast RepoHandle event: %w", err) } return nil case env.RepoIdentity != nil: - log.Infow("bgs got identity event", "did", env.RepoIdentity.Did) + bgs.log.Info("bgs got identity event", "did", env.RepoIdentity.Did) // Flush any cached DID documents for this user bgs.didr.FlushCacheFor(env.RepoIdentity.Did) @@ -982,7 +1075,7 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event }, }) if err != nil { - log.Errorw("failed to broadcast Identity event", "error", err, "did", env.RepoIdentity.Did) + bgs.log.Error("failed to broadcast Identity event", "error", err, "did", env.RepoIdentity.Did) return fmt.Errorf("failed to broadcast Identity event: %w", err) } @@ -998,7 
+1091,7 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event span.SetAttributes(attribute.String("repo_status", *env.RepoAccount.Status)) } - log.Infow("bgs got account event", "did", env.RepoAccount.Did) + bgs.log.Info("bgs got account event", "did", env.RepoAccount.Did) // Flush any cached DID documents for this user bgs.didr.FlushCacheFor(env.RepoAccount.Did) @@ -1012,7 +1105,7 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event // Check if the PDS is still authoritative // if not we don't want to be propagating this account event if ai.PDS != host.ID { - log.Errorw("account event from non-authoritative pds", + bgs.log.Error("account event from non-authoritative pds", "seq", env.RepoAccount.Seq, "did", env.RepoAccount.Did, "event_from", host.Host, @@ -1041,7 +1134,7 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event return fmt.Errorf("failed to look up user by did: %w", err) } - if u.TakenDown { + if u.GetTakenDown() { shouldBeActive = false status = &events.AccountStatusTakendown } @@ -1057,7 +1150,7 @@ func (bgs *BGS) handleFedEvent(ctx context.Context, host *models.PDS, env *event }, }) if err != nil { - log.Errorw("failed to broadcast Account event", "error", err, "did", env.RepoAccount.Did) + bgs.log.Error("failed to broadcast Account event", "error", err, "did", env.RepoAccount.Did) return fmt.Errorf("failed to broadcast Account event: %w", err) } @@ -1095,6 +1188,7 @@ func (bgs *BGS) handleRepoTombstone(ctx context.Context, pds *models.PDS, evt *a }).Error; err != nil { return err } + u.SetTombstoned(true) if err := bgs.db.Model(&models.ActorInfo{}).Where("uid = ?", u.ID).UpdateColumns(map[string]any{ "handle": nil, @@ -1105,7 +1199,7 @@ func (bgs *BGS) handleRepoTombstone(ctx context.Context, pds *models.PDS, evt *a // delete data from carstore if err := bgs.repoman.TakeDownRepo(ctx, u.ID); err != nil { // don't let a failure here prevent us from propagating this event - log.Errorf("failed to delete user data from carstore: %s", err) + bgs.log.Error("failed to delete user data from carstore", "err", err) } return bgs.events.AddEvent(ctx, &events.XRPCStreamEvent{ @@ -1120,7 +1214,7 @@ func (s *BGS) createExternalUser(ctx context.Context, did string) (*models.Actor externalUserCreationAttempts.Inc() - log.Debugf("create external user: %s", did) + s.log.Debug("create external user", "did", did) doc, err := s.didr.GetDocument(ctx, did) if err != nil { return nil, fmt.Errorf("could not locate DID document for followed user (%s): %w", did, err) @@ -1143,7 +1237,7 @@ func (s *BGS) createExternalUser(ctx context.Context, did string) (*models.Actor // TODO: the PDS's DID should also be in the service, we could use that to look up? 
var peering models.PDS if err := s.db.Find(&peering, "host = ?", durl.Host).Error; err != nil { - log.Error("failed to find pds", durl.Host) + s.log.Error("failed to find pds", "host", durl.Host) return nil, err } @@ -1216,7 +1310,7 @@ func (s *BGS) createExternalUser(ctx context.Context, did string) (*models.Actor defer func() { if !successfullyCreated { if err := s.db.Model(&models.PDS{}).Where("id = ?", peering.ID).Update("repo_count", gorm.Expr("repo_count - 1")).Error; err != nil { - log.Errorf("failed to decrement repo count for pds: %s", err) + s.log.Error("failed to decrement repo count for pds", "err", err) } } }() @@ -1230,7 +1324,7 @@ func (s *BGS) createExternalUser(ctx context.Context, did string) (*models.Actor return nil, err } - log.Debugw("creating external user", "did", did, "handle", hurl.Host, "pds", peering.ID) + s.log.Debug("creating external user", "did", did, "handle", hurl.Host, "pds", peering.ID) handle := hurl.Host @@ -1238,12 +1332,12 @@ func (s *BGS) createExternalUser(ctx context.Context, did string) (*models.Actor resdid, err := s.hr.ResolveHandleToDid(ctx, handle) if err != nil { - log.Errorf("failed to resolve users claimed handle (%q) on pds: %s", handle, err) + s.log.Error("failed to resolve users claimed handle on pds", "handle", handle, "err", err) validHandle = false } if resdid != did { - log.Errorf("claimed handle did not match servers response (%s != %s)", resdid, did) + s.log.Error("claimed handle did not match servers response", "resdid", resdid, "did", did) validHandle = false } @@ -1252,7 +1346,7 @@ func (s *BGS) createExternalUser(ctx context.Context, did string) (*models.Actor exu, err := s.Index.LookupUserByDid(ctx, did) if err == nil { - log.Debugw("lost the race to create a new user", "did", did, "handle", handle, "existing_hand", exu.Handle) + s.log.Debug("lost the race to create a new user", "did", did, "handle", handle, "existing_hand", exu.Handle) if exu.PDS != peering.ID { // User is now on a different PDS, update if err := s.db.Model(User{}).Where("id = ?", exu.Uid).Update("pds", peering.ID).Error; err != nil { @@ -1323,6 +1417,8 @@ func (s *BGS) createExternalUser(ctx context.Context, did string) (*models.Actor if err := s.db.Create(&u).Error; err != nil { return nil, fmt.Errorf("failed to create user after handle conflict: %w", err) } + + s.userCache.Remove(did) } else { return nil, fmt.Errorf("failed to create other pds user: %w", err) } @@ -1370,18 +1466,22 @@ func (bgs *BGS) UpdateAccountStatus(ctx context.Context, did string, status stri if err := bgs.db.Model(User{}).Where("id = ?", u.ID).Update("upstream_status", events.AccountStatusActive).Error; err != nil { return fmt.Errorf("failed to set user active status: %w", err) } + u.SetUpstreamStatus(events.AccountStatusActive) case events.AccountStatusDeactivated: if err := bgs.db.Model(User{}).Where("id = ?", u.ID).Update("upstream_status", events.AccountStatusDeactivated).Error; err != nil { return fmt.Errorf("failed to set user deactivation status: %w", err) } + u.SetUpstreamStatus(events.AccountStatusDeactivated) case events.AccountStatusSuspended: if err := bgs.db.Model(User{}).Where("id = ?", u.ID).Update("upstream_status", events.AccountStatusSuspended).Error; err != nil { return fmt.Errorf("failed to set user suspension status: %w", err) } + u.SetUpstreamStatus(events.AccountStatusSuspended) case events.AccountStatusTakendown: if err := bgs.db.Model(User{}).Where("id = ?", u.ID).Update("upstream_status", events.AccountStatusTakendown).Error; err != nil { return 
fmt.Errorf("failed to set user taken down status: %w", err) } + u.SetUpstreamStatus(events.AccountStatusTakendown) if err := bgs.db.Model(&models.ActorInfo{}).Where("uid = ?", u.ID).UpdateColumns(map[string]any{ "handle": nil, @@ -1396,6 +1496,7 @@ func (bgs *BGS) UpdateAccountStatus(ctx context.Context, did string, status stri }).Error; err != nil { return err } + u.SetUpstreamStatus(events.AccountStatusDeleted) if err := bgs.db.Model(&models.ActorInfo{}).Where("uid = ?", u.ID).UpdateColumns(map[string]any{ "handle": nil, @@ -1406,7 +1507,7 @@ func (bgs *BGS) UpdateAccountStatus(ctx context.Context, did string, status stri // delete data from carstore if err := bgs.repoman.TakeDownRepo(ctx, u.ID); err != nil { // don't let a failure here prevent us from propagating this event - log.Errorf("failed to delete user data from carstore: %s", err) + bgs.log.Error("failed to delete user data from carstore", "err", err) } } @@ -1422,6 +1523,7 @@ func (bgs *BGS) TakeDownRepo(ctx context.Context, did string) error { if err := bgs.db.Model(User{}).Where("id = ?", u.ID).Update("taken_down", true).Error; err != nil { return err } + u.SetTakenDown(true) if err := bgs.repoman.TakeDownRepo(ctx, u.ID); err != nil { return err @@ -1443,6 +1545,7 @@ func (bgs *BGS) ReverseTakedown(ctx context.Context, did string) error { if err := bgs.db.Model(User{}).Where("id = ?", u.ID).Update("taken_down", false).Error; err != nil { return err } + u.SetTakenDown(false) return nil } @@ -1511,7 +1614,7 @@ func (bgs *BGS) CompleteResync(resync PDSResync) { func (bgs *BGS) ResyncPDS(ctx context.Context, pds models.PDS) error { ctx, span := tracer.Start(ctx, "ResyncPDS") defer span.End() - log := log.With("pds", pds.Host, "source", "resync_pds") + log := bgs.log.With("pds", pds.Host, "source", "resync_pds") resync, found := bgs.LoadOrStoreResync(pds) if found { return fmt.Errorf("resync already in progress") @@ -1543,18 +1646,18 @@ func (bgs *BGS) ResyncPDS(ctx context.Context, pds models.PDS) error { for { pages++ if pages%10 == 0 { - log.Warnw("fetching PDS page during resync", "pages", pages, "total_repos", len(repos)) + log.Warn("fetching PDS page during resync", "pages", pages, "total_repos", len(repos)) resync.NumRepoPages = pages resync.NumRepos = len(repos) bgs.UpdateResync(resync) } if err := limiter.Wait(ctx); err != nil { - log.Errorw("failed to wait for rate limiter", "error", err) + log.Error("failed to wait for rate limiter", "error", err) return fmt.Errorf("failed to wait for rate limiter: %w", err) } repoList, err := comatproto.SyncListRepos(ctx, &xrpcc, cursor, limit) if err != nil { - log.Errorw("failed to list repos", "error", err) + log.Error("failed to list repos", "error", err) return fmt.Errorf("failed to list repos: %w", err) } @@ -1576,77 +1679,60 @@ func (bgs *BGS) ResyncPDS(ctx context.Context, pds models.PDS) error { repolistDone := time.Now() - log.Warnw("listed all repos, checking roots", "num_repos", len(repos), "took", repolistDone.Sub(start)) + log.Warn("listed all repos, checking roots", "num_repos", len(repos), "took", repolistDone.Sub(start)) resync = bgs.SetResyncStatus(pds.ID, "checking revs") - // Create a buffered channel for collecting results - results := make(chan revCheckResult, len(repos)) + // run loop over repos with some concurrency sem := semaphore.NewWeighted(40) // Check repo revs against our local copy and enqueue crawls for any that are out of date - for _, r := range repos { + for i, r := range repos { if err := sem.Acquire(ctx, 1); err != nil { - log.Errorw("failed to 
acquire semaphore", "error", err) - results <- revCheckResult{err: err} + log.Error("failed to acquire semaphore", "error", err) continue } go func(r comatproto.SyncListRepos_Repo) { defer sem.Release(1) - log := log.With("did", r.Did, "remote_rev", r.Rev) + log := bgs.log.With("did", r.Did, "remote_rev", r.Rev) // Fetches the user if we have it, otherwise automatically enqueues it for crawling ai, err := bgs.Index.GetUserOrMissing(ctx, r.Did) if err != nil { - log.Errorw("failed to get user while resyncing PDS, we can't recrawl it", "error", err) - results <- revCheckResult{err: err} + log.Error("failed to get user while resyncing PDS, we can't recrawl it", "error", err) return } rev, err := bgs.repoman.GetRepoRev(ctx, ai.Uid) if err != nil { - log.Warnw("recrawling because we failed to get the local repo root", "err", err, "uid", ai.Uid) - results <- revCheckResult{ai: ai} + log.Warn("recrawling because we failed to get the local repo root", "err", err, "uid", ai.Uid) + err := bgs.Index.Crawler.Crawl(ctx, ai) + if err != nil { + log.Error("failed to enqueue crawl for repo during resync", "error", err, "uid", ai.Uid, "did", ai.Did) + } return } if rev == "" || rev < r.Rev { - log.Warnw("recrawling because the repo rev from the PDS is newer than our local repo rev", "local_rev", rev) - results <- revCheckResult{ai: ai} + log.Warn("recrawling because the repo rev from the PDS is newer than our local repo rev", "local_rev", rev) + err := bgs.Index.Crawler.Crawl(ctx, ai) + if err != nil { + log.Error("failed to enqueue crawl for repo during resync", "error", err, "uid", ai.Uid, "did", ai.Did) + } return } - - results <- revCheckResult{} }(r) - } - - var numReposToResync int - for i := 0; i < len(repos); i++ { - res := <-results - if res.err != nil { - log.Errorw("failed to process repo during resync", "error", res.err) - - } - if res.ai != nil { - numReposToResync++ - err := bgs.Index.Crawler.Crawl(ctx, res.ai) - if err != nil { - log.Errorw("failed to enqueue crawl for repo during resync", "error", err, "uid", res.ai.Uid, "did", res.ai.Did) - } - } if i%100 == 0 { if i%10_000 == 0 { - log.Warnw("checked revs during resync", "num_repos_checked", i, "num_repos_to_crawl", numReposToResync, "took", time.Now().Sub(resync.StatusChangedAt)) + log.Warn("checked revs during resync", "num_repos_checked", i, "num_repos_to_crawl", -1, "took", time.Now().Sub(resync.StatusChangedAt)) } resync.NumReposChecked = i - resync.NumReposToResync = numReposToResync bgs.UpdateResync(resync) } } resync.NumReposChecked = len(repos) - resync.NumReposToResync = numReposToResync bgs.UpdateResync(resync) - log.Warnw("enqueued all crawls, exiting resync", "took", time.Now().Sub(start), "num_repos_to_crawl", numReposToResync) + bgs.log.Warn("enqueued all crawls, exiting resync", "took", time.Now().Sub(start), "num_repos_to_crawl", -1) return nil } diff --git a/bgs/compactor.go b/bgs/compactor.go index bca4d8e02..6339d76b0 100644 --- a/bgs/compactor.go +++ b/bgs/compactor.go @@ -210,7 +210,7 @@ func (c *Compactor) Start(bgs *BGS) { } if c.requeueInterval > 0 { go func() { - log.Infow("starting compactor requeue routine", + log.Info("starting compactor requeue routine", "interval", c.requeueInterval, "limit", c.requeueLimit, "shardCount", c.requeueShardCount, @@ -226,7 +226,7 @@ func (c *Compactor) Start(bgs *BGS) { ctx := context.Background() ctx, span := otel.Tracer("compactor").Start(ctx, "RequeueRoutine") if err := c.EnqueueAllRepos(ctx, bgs, c.requeueLimit, c.requeueShardCount, c.requeueFast); err != nil { - 
log.Errorw("failed to enqueue all repos", "err", err) + log.Error("failed to enqueue all repos", "err", err) } span.End() } @@ -262,7 +262,7 @@ func (c *Compactor) doWork(bgs *BGS, strategy NextStrategy) { time.Sleep(time.Second * 5) continue } - log.Errorw("failed to compact repo", + log.Error("failed to compact repo", "err", err, "uid", state.latestUID, "repo", state.latestDID, @@ -273,7 +273,7 @@ func (c *Compactor) doWork(bgs *BGS, strategy NextStrategy) { // Pause for a bit to avoid spamming failed compactions time.Sleep(time.Millisecond * 100) } else { - log.Infow("compacted repo", + log.Info("compacted repo", "uid", state.latestUID, "repo", state.latestDID, "status", state.status, @@ -349,10 +349,10 @@ func (c *Compactor) compactNext(ctx context.Context, bgs *BGS, strategy NextStra return state, nil } -func (c *Compactor) EnqueueRepo(ctx context.Context, user User, fast bool) { +func (c *Compactor) EnqueueRepo(ctx context.Context, user *User, fast bool) { ctx, span := otel.Tracer("compactor").Start(ctx, "EnqueueRepo") defer span.End() - log.Infow("enqueueing compaction for repo", "repo", user.Did, "uid", user.ID, "fast", fast) + log.Info("enqueueing compaction for repo", "repo", user.Did, "uid", user.ID, "fast", fast) c.q.Append(user.ID, fast) } @@ -396,7 +396,7 @@ func (c *Compactor) EnqueueAllRepos(ctx context.Context, bgs *BGS, lim int, shar c.q.Append(r.Usr, fast) } - log.Infow("done enqueueing all repos", "repos_enqueued", len(repos)) + log.Info("done enqueueing all repos", "repos_enqueued", len(repos)) return nil } diff --git a/bgs/fedmgr.go b/bgs/fedmgr.go index 42ce7407c..c68759d91 100644 --- a/bgs/fedmgr.go +++ b/bgs/fedmgr.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "log/slog" "math/rand" "strings" "sync" @@ -22,6 +23,8 @@ import ( "gorm.io/gorm" ) +var log = slog.Default().With("system", "bgs") + type IndexCallback func(context.Context, *models.PDS, *events.XRPCStreamEvent) error // TODO: rename me @@ -129,7 +132,7 @@ func NewSlurper(db *gorm.DB, cb IndexCallback, opts *SlurperOptions) (*Slurper, var errs []error if errs = s.flushCursors(ctx); len(errs) > 0 { for _, err := range errs { - log.Errorf("failed to flush cursors on shutdown: %s", err) + log.Error("failed to flush cursors on shutdown", "err", err) } } log.Info("done flushing PDS cursors on shutdown") @@ -142,7 +145,7 @@ func NewSlurper(db *gorm.DB, cb IndexCallback, opts *SlurperOptions) (*Slurper, defer span.End() if errs := s.flushCursors(ctx); len(errs) > 0 { for _, err := range errs { - log.Errorf("failed to flush cursors: %s", err) + log.Error("failed to flush cursors", "err", err) } } log.Debug("done flushing PDS cursors") @@ -210,7 +213,7 @@ func (s *Slurper) Shutdown() []error { errs := <-s.shutdownResult if len(errs) > 0 { for _, err := range errs { - log.Errorf("shutdown error: %s", err) + log.Error("shutdown error", "err", err) } } log.Info("slurper shutdown complete") @@ -468,8 +471,17 @@ func (s *Slurper) subscribeWithRedialer(ctx context.Context, host *models.PDS, s protocol = "wss" } + // Special case `.host.bsky.network` PDSs to rewind cursor by 200 events to smooth over unclean shutdowns + if strings.HasSuffix(host.Host, ".host.bsky.network") && host.Cursor > 200 { + host.Cursor -= 200 + } + cursor := host.Cursor + connectedInbound.Inc() + defer connectedInbound.Dec() + // TODO:? maybe keep a gauge of 'in retry backoff' sources? 
+ var backoff int for { select { @@ -481,14 +493,14 @@ func (s *Slurper) subscribeWithRedialer(ctx context.Context, host *models.PDS, s url := fmt.Sprintf("%s://%s/xrpc/com.atproto.sync.subscribeRepos?cursor=%d", protocol, host.Host, cursor) con, res, err := d.DialContext(ctx, url, nil) if err != nil { - log.Warnw("dialing failed", "pdsHost", host.Host, "err", err, "backoff", backoff) + log.Warn("dialing failed", "pdsHost", host.Host, "err", err, "backoff", backoff) time.Sleep(sleepForBackoff(backoff)) backoff++ if backoff > 15 { - log.Warnw("pds does not appear to be online, disabling for now", "pdsHost", host.Host) + log.Warn("pds does not appear to be online, disabling for now", "pdsHost", host.Host) if err := s.db.Model(&models.PDS{}).Where("id = ?", host.ID).Update("registered", false).Error; err != nil { - log.Errorf("failed to unregister failing pds: %w", err) + log.Error("failed to unregister failing pds", "err", err) } return @@ -497,15 +509,15 @@ func (s *Slurper) subscribeWithRedialer(ctx context.Context, host *models.PDS, s continue } - log.Info("event subscription response code: ", res.StatusCode) + log.Info("event subscription response", "code", res.StatusCode) curCursor := cursor if err := s.handleConnection(ctx, host, con, &cursor, sub); err != nil { if errors.Is(err, ErrTimeoutShutdown) { - log.Infof("shutting down pds subscription to %s, no activity after %s", host.Host, EventsTimeout) + log.Info("shutting down pds subscription after timeout", "host", host.Host, "time", EventsTimeout) return } - log.Warnf("connection to %q failed: %s", host.Host, err) + log.Warn("connection failed", "host", host.Host, "err", err) } if cursor > curCursor { @@ -536,11 +548,11 @@ func (s *Slurper) handleConnection(ctx context.Context, host *models.PDS, con *w rsc := &events.RepoStreamCallbacks{ RepoCommit: func(evt *comatproto.SyncSubscribeRepos_Commit) error { - log.Debugw("got remote repo event", "pdsHost", host.Host, "repo", evt.Repo, "seq", evt.Seq) + log.Debug("got remote repo event", "pdsHost", host.Host, "repo", evt.Repo, "seq", evt.Seq) if err := s.cb(context.TODO(), host, &events.XRPCStreamEvent{ RepoCommit: evt, }); err != nil { - log.Errorf("failed handling event from %q (%d): %s", host.Host, evt.Seq, err) + log.Error("failed handling event", "host", host.Host, "seq", evt.Seq, "err", err) } *lastCursor = evt.Seq @@ -551,11 +563,11 @@ func (s *Slurper) handleConnection(ctx context.Context, host *models.PDS, con *w return nil }, RepoHandle: func(evt *comatproto.SyncSubscribeRepos_Handle) error { - log.Infow("got remote handle update event", "pdsHost", host.Host, "did", evt.Did, "handle", evt.Handle) + log.Info("got remote handle update event", "pdsHost", host.Host, "did", evt.Did, "handle", evt.Handle) if err := s.cb(context.TODO(), host, &events.XRPCStreamEvent{ RepoHandle: evt, }); err != nil { - log.Errorf("failed handling event from %q (%d): %s", host.Host, evt.Seq, err) + log.Error("failed handling event", "host", host.Host, "seq", evt.Seq, "err", err) } *lastCursor = evt.Seq @@ -566,11 +578,11 @@ func (s *Slurper) handleConnection(ctx context.Context, host *models.PDS, con *w return nil }, RepoMigrate: func(evt *comatproto.SyncSubscribeRepos_Migrate) error { - log.Infow("got remote repo migrate event", "pdsHost", host.Host, "did", evt.Did, "migrateTo", evt.MigrateTo) + log.Info("got remote repo migrate event", "pdsHost", host.Host, "did", evt.Did, "migrateTo", evt.MigrateTo) if err := s.cb(context.TODO(), host, &events.XRPCStreamEvent{ RepoMigrate: evt, }); err != nil { -
log.Errorf("failed handling event from %q (%d): %s", host.Host, evt.Seq, err) + log.Error("failed handling event", "host", host.Host, "seq", evt.Seq, "err", err) } *lastCursor = evt.Seq @@ -581,11 +593,11 @@ func (s *Slurper) handleConnection(ctx context.Context, host *models.PDS, con *w return nil }, RepoTombstone: func(evt *comatproto.SyncSubscribeRepos_Tombstone) error { - log.Infow("got remote repo tombstone event", "pdsHost", host.Host, "did", evt.Did) + log.Info("got remote repo tombstone event", "pdsHost", host.Host, "did", evt.Did) if err := s.cb(context.TODO(), host, &events.XRPCStreamEvent{ RepoTombstone: evt, }); err != nil { - log.Errorf("failed handling event from %q (%d): %s", host.Host, evt.Seq, err) + log.Error("failed handling event", "host", host.Host, "seq", evt.Seq, "err", err) } *lastCursor = evt.Seq @@ -596,15 +608,15 @@ func (s *Slurper) handleConnection(ctx context.Context, host *models.PDS, con *w return nil }, RepoInfo: func(info *comatproto.SyncSubscribeRepos_Info) error { - log.Infow("info event", "name", info.Name, "message", info.Message, "pdsHost", host.Host) + log.Info("info event", "name", info.Name, "message", info.Message, "pdsHost", host.Host) return nil }, RepoIdentity: func(ident *comatproto.SyncSubscribeRepos_Identity) error { - log.Infow("identity event", "did", ident.Did) + log.Info("identity event", "did", ident.Did) if err := s.cb(context.TODO(), host, &events.XRPCStreamEvent{ RepoIdentity: ident, }); err != nil { - log.Errorf("failed handling event from %q (%d): %s", host.Host, ident.Seq, err) + log.Error("failed handling event", "host", host.Host, "seq", ident.Seq, "err", err) } *lastCursor = ident.Seq @@ -615,11 +627,11 @@ func (s *Slurper) handleConnection(ctx context.Context, host *models.PDS, con *w return nil }, RepoAccount: func(acct *comatproto.SyncSubscribeRepos_Account) error { - log.Infow("account event", "did", acct.Did, "status", acct.Status) + log.Info("account event", "did", acct.Did, "status", acct.Status) if err := s.cb(context.TODO(), host, &events.XRPCStreamEvent{ RepoAccount: acct, }); err != nil { - log.Errorf("failed handling event from %q (%d): %s", host.Host, acct.Seq, err) + log.Error("failed handling event", "host", host.Host, "seq", acct.Seq, "err", err) } *lastCursor = acct.Seq @@ -662,7 +674,7 @@ func (s *Slurper) handleConnection(ctx context.Context, host *models.PDS, con *w con.RemoteAddr().String(), instrumentedRSC.EventHandler, ) - return events.HandleRepoStream(ctx, con, pool) + return events.HandleRepoStream(ctx, con, pool, nil) } func (s *Slurper) updateCursor(sub *activeSub, curs int64) error { diff --git a/bgs/handlers.go b/bgs/handlers.go index da87c9521..0e46f0043 100644 --- a/bgs/handlers.go +++ b/bgs/handlers.go @@ -3,6 +3,7 @@ package bgs import ( "bytes" "context" + "encoding/json" "errors" "fmt" "io" @@ -30,27 +31,28 @@ func (s *BGS) handleComAtprotoSyncGetRecord(ctx context.Context, collection stri if errors.Is(err, gorm.ErrRecordNotFound) { return nil, echo.NewHTTPError(http.StatusNotFound, "user not found") } - log.Errorw("failed to lookup user", "err", err, "did", did) + log.Error("failed to lookup user", "err", err, "did", did) return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to lookup user") } - if u.Tombstoned { + if u.GetTombstoned() { return nil, fmt.Errorf("account was deleted") } - if u.TakenDown { + if u.GetTakenDown() { return nil, fmt.Errorf("account was taken down by the Relay") } - if u.UpstreamStatus == events.AccountStatusTakendown { + ustatus := 
u.GetUpstreamStatus() + if ustatus == events.AccountStatusTakendown { return nil, fmt.Errorf("account was taken down by its PDS") } - if u.UpstreamStatus == events.AccountStatusDeactivated { + if ustatus == events.AccountStatusDeactivated { return nil, fmt.Errorf("account is temporarily deactivated") } - if u.UpstreamStatus == events.AccountStatusSuspended { + if ustatus == events.AccountStatusSuspended { return nil, fmt.Errorf("account is suspended by its PDS") } @@ -59,7 +61,7 @@ func (s *BGS) handleComAtprotoSyncGetRecord(ctx context.Context, collection stri if errors.Is(err, mst.ErrNotFound) { return nil, echo.NewHTTPError(http.StatusNotFound, "record not found in repo") } - log.Errorw("failed to get record from repo", "err", err, "did", did, "collection", collection, "rkey", rkey) + log.Error("failed to get record from repo", "err", err, "did", did, "collection", collection, "rkey", rkey) return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to get record from repo") } @@ -87,34 +89,35 @@ func (s *BGS) handleComAtprotoSyncGetRepo(ctx context.Context, did string, since if errors.Is(err, gorm.ErrRecordNotFound) { return nil, echo.NewHTTPError(http.StatusNotFound, "user not found") } - log.Errorw("failed to lookup user", "err", err, "did", did) + log.Error("failed to lookup user", "err", err, "did", did) return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to lookup user") } - if u.Tombstoned { + if u.GetTombstoned() { return nil, fmt.Errorf("account was deleted") } - if u.TakenDown { + if u.GetTakenDown() { return nil, fmt.Errorf("account was taken down by the Relay") } - if u.UpstreamStatus == events.AccountStatusTakendown { + ustatus := u.GetUpstreamStatus() + if ustatus == events.AccountStatusTakendown { return nil, fmt.Errorf("account was taken down by its PDS") } - if u.UpstreamStatus == events.AccountStatusDeactivated { + if ustatus == events.AccountStatusDeactivated { return nil, fmt.Errorf("account is temporarily deactivated") } - if u.UpstreamStatus == events.AccountStatusSuspended { + if ustatus == events.AccountStatusSuspended { return nil, fmt.Errorf("account is suspended by its PDS") } // TODO: stream the response buf := new(bytes.Buffer) if err := s.repoman.ReadRepo(ctx, u.ID, since, buf); err != nil { - log.Errorw("failed to read repo into buffer", "err", err, "did", did) + log.Error("failed to read repo into buffer", "err", err, "did", did) return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to read repo into buffer") } @@ -167,7 +170,7 @@ func (s *BGS) handleComAtprotoSyncRequestCrawl(ctx context.Context, body *comatp return echo.NewHTTPError(http.StatusUnauthorized, "domain is banned") } - log.Warnf("TODO: better host validation for crawl requests") + log.Warn("TODO: better host validation for crawl requests") clientHost := fmt.Sprintf("%s://%s", u.Scheme, host) @@ -185,6 +188,30 @@ func (s *BGS) handleComAtprotoSyncRequestCrawl(ctx context.Context, body *comatp // Maybe we could do something with this response later _ = desc + if len(s.nextCrawlers) != 0 { + blob, err := json.Marshal(body) + if err != nil { + log.Warn("could not forward requestCrawl, json err", "err", err) + } else { + go func(bodyBlob []byte) { + for _, rpu := range s.nextCrawlers { + pu := rpu.JoinPath("/xrpc/com.atproto.sync.requestCrawl") + response, err := s.httpClient.Post(pu.String(), "application/json", bytes.NewReader(bodyBlob)) + if response != nil && response.Body != nil { + response.Body.Close() + } + if err != nil || response == nil { + 
log.Warn("requestCrawl forward failed", "host", rpu, "err", err) + } else if response.StatusCode != http.StatusOK { + log.Warn("requestCrawl forward failed", "host", rpu, "status", response.Status) + } else { + log.Info("requestCrawl forward successful", "host", rpu) + } + } + }(blob) + } + } + return s.slurper.SubscribeToPds(ctx, host, true, false) } @@ -204,7 +231,7 @@ func (s *BGS) handleComAtprotoSyncListRepos(ctx context.Context, cursor int64, l if err == gorm.ErrRecordNotFound { return &comatprototypes.SyncListRepos_Output{}, nil } - log.Errorw("failed to query users", "err", err) + log.Error("failed to query users", "err", err) return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to query users") } @@ -225,7 +252,7 @@ func (s *BGS) handleComAtprotoSyncListRepos(ctx context.Context, cursor int64, l root, err := s.repoman.GetRepoRoot(ctx, user.ID) if err != nil { - log.Errorw("failed to get repo root", "err", err, "did", user.Did) + log.Error("failed to get repo root", "err", err, "did", user.Did) return nil, echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("failed to get repo root for (%s): %v", user.Did, err.Error())) } @@ -253,35 +280,36 @@ func (s *BGS) handleComAtprotoSyncGetLatestCommit(ctx context.Context, did strin return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to lookup user") } - if u.Tombstoned { + if u.GetTombstoned() { return nil, fmt.Errorf("account was deleted") } - if u.TakenDown { + if u.GetTakenDown() { return nil, fmt.Errorf("account was taken down by the Relay") } - if u.UpstreamStatus == events.AccountStatusTakendown { + ustatus := u.GetUpstreamStatus() + if ustatus == events.AccountStatusTakendown { return nil, fmt.Errorf("account was taken down by its PDS") } - if u.UpstreamStatus == events.AccountStatusDeactivated { + if ustatus == events.AccountStatusDeactivated { return nil, fmt.Errorf("account is temporarily deactivated") } - if u.UpstreamStatus == events.AccountStatusSuspended { + if ustatus == events.AccountStatusSuspended { return nil, fmt.Errorf("account is suspended by its PDS") } root, err := s.repoman.GetRepoRoot(ctx, u.ID) if err != nil { - log.Errorw("failed to get repo root", "err", err, "did", u.Did) + log.Error("failed to get repo root", "err", err, "did", u.Did) return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to get repo root") } rev, err := s.repoman.GetRepoRev(ctx, u.ID) if err != nil { - log.Errorw("failed to get repo rev", "err", err, "did", u.Did) + log.Error("failed to get repo rev", "err", err, "did", u.Did) return nil, echo.NewHTTPError(http.StatusInternalServerError, "failed to get repo rev") } diff --git a/bgs/metrics.go b/bgs/metrics.go index b33677e6e..5ff362a16 100644 --- a/bgs/metrics.go +++ b/bgs/metrics.go @@ -27,6 +27,11 @@ var repoCommitsReceivedCounter = promauto.NewCounterVec(prometheus.CounterOpts{ Help: "The total number of events received", }, []string{"pds"}) +var repoCommitsResultCounter = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "repo_commits_result_counter", + Help: "The results of commit events received", +}, []string{"pds", "status"}) + var rebasesCounter = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "event_rebases", Help: "The total number of rebase events received", @@ -42,6 +47,11 @@ var externalUserCreationAttempts = promauto.NewCounter(prometheus.CounterOpts{ Help: "The total number of external users created", }) +var connectedInbound = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "bgs_connected_inbound", + Help: 
"Number of inbound firehoses we are consuming", +}) + var compactionDuration = promauto.NewHistogram(prometheus.HistogramOpts{ Name: "compaction_duration", Help: "A histogram of compaction latencies", @@ -81,6 +91,18 @@ var resSz = promauto.NewHistogramVec(prometheus.HistogramOpts{ Buckets: prometheus.ExponentialBuckets(100, 10, 8), }, []string{"code", "method", "path"}) +var userLookupDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "relay_user_lookup_duration", + Help: "A histogram of user lookup latencies", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), +}) + +var newUserDiscoveryDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "relay_new_user_discovery_duration", + Help: "A histogram of new user discovery latencies", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), +}) + // MetricsMiddleware defines handler function for metrics middleware func MetricsMiddleware(next echo.HandlerFunc) echo.HandlerFunc { return func(c echo.Context) error { diff --git a/carstore/bs.go b/carstore/bs.go index e7af35d12..dbce4f098 100644 --- a/carstore/bs.go +++ b/carstore/bs.go @@ -6,6 +6,7 @@ import ( "context" "fmt" "io" + "log/slog" "os" "path/filepath" "sort" @@ -24,7 +25,6 @@ import ( cbor "github.com/ipfs/go-ipld-cbor" ipld "github.com/ipfs/go-ipld-format" "github.com/ipfs/go-libipfs/blocks" - logging "github.com/ipfs/go-log" car "github.com/ipld/go-car" carutil "github.com/ipld/go-car/util" cbg "github.com/whyrusleeping/cbor-gen" @@ -42,8 +42,6 @@ var blockGetTotalCounterUsrskip = blockGetTotalCounter.WithLabelValues("true", " var blockGetTotalCounterCached = blockGetTotalCounter.WithLabelValues("false", "hit") var blockGetTotalCounterNormal = blockGetTotalCounter.WithLabelValues("false", "miss") -var log = logging.Logger("carstore") - const MaxSliceLength = 2 << 20 const BigShardThreshold = 2 << 20 @@ -62,21 +60,25 @@ type CarStore interface { } type FileCarStore struct { - meta *CarStoreGormMeta - rootDir string + meta *CarStoreGormMeta + rootDirs []string lscLk sync.Mutex lastShardCache map[models.Uid]*CarShard + + log *slog.Logger } -func NewCarStore(meta *gorm.DB, root string) (CarStore, error) { - if _, err := os.Stat(root); err != nil { - if !os.IsNotExist(err) { - return nil, err - } +func NewCarStore(meta *gorm.DB, roots []string) (CarStore, error) { + for _, root := range roots { + if _, err := os.Stat(root); err != nil { + if !os.IsNotExist(err) { + return nil, err + } - if err := os.Mkdir(root, 0775); err != nil { - return nil, err + if err := os.Mkdir(root, 0775); err != nil { + return nil, err + } } } if err := meta.AutoMigrate(&CarShard{}, &blockRef{}); err != nil { @@ -88,13 +90,14 @@ func NewCarStore(meta *gorm.DB, root string) (CarStore, error) { return &FileCarStore{ meta: &CarStoreGormMeta{meta: meta}, - rootDir: root, + rootDirs: roots, lastShardCache: make(map[models.Uid]*CarShard), + log: slog.Default().With("system", "carstore"), }, nil } type userView struct { - cs *FileCarStore + cs CarStore user models.Uid cache map[cid.Cid]blockformat.Block @@ -108,13 +111,24 @@ func (uv *userView) HashOnRead(hor bool) { } func (uv *userView) Has(ctx context.Context, k cid.Cid) (bool, error) { - return uv.cs.meta.HasUidCid(ctx, uv.user, k) + _, have := uv.cache[k] + if have { + return have, nil + } + + fcd, ok := uv.cs.(*FileCarStore) + if !ok { + return false, nil + } + + return fcd.meta.HasUidCid(ctx, uv.user, k) } var CacheHits int64 var CacheMiss int64 func (uv *userView) Get(ctx context.Context, k cid.Cid) (blockformat.Block, error) 
{ + if !k.Defined() { return nil, fmt.Errorf("attempted to 'get' undefined cid") } @@ -129,7 +143,12 @@ func (uv *userView) Get(ctx context.Context, k cid.Cid) (blockformat.Block, erro } atomic.AddInt64(&CacheMiss, 1) - path, offset, user, err := uv.cs.meta.LookupBlockRef(ctx, k) + fcd, ok := uv.cs.(*FileCarStore) + if !ok { + return nil, ipld.ErrNotFound{Cid: k} + } + + path, offset, user, err := fcd.meta.LookupBlockRef(ctx, k) if err != nil { return nil, err } @@ -269,7 +288,7 @@ type DeltaSession struct { baseCid cid.Cid seq int readonly bool - cs *FileCarStore + cs CarStore lastRev string } @@ -541,9 +560,14 @@ func (ds *DeltaSession) GetSize(ctx context.Context, c cid.Cid) (int, error) { func fnameForShard(user models.Uid, seq int) string { return fmt.Sprintf("sh-%d-%d", user, seq) } + +func (cs *FileCarStore) dirForUser(user models.Uid) string { + return cs.rootDirs[int(user)%len(cs.rootDirs)] +} + func (cs *FileCarStore) openNewShardFile(ctx context.Context, user models.Uid, seq int) (*os.File, string, error) { // TODO: some overwrite protections - fname := filepath.Join(cs.rootDir, fnameForShard(user, seq)) + fname := filepath.Join(cs.dirForUser(user), fnameForShard(user, seq)) fi, err := os.Create(fname) if err != nil { return nil, "", err @@ -557,7 +581,7 @@ func (cs *FileCarStore) writeNewShardFile(ctx context.Context, user models.Uid, defer span.End() // TODO: some overwrite protections - fname := filepath.Join(cs.rootDir, fnameForShard(user, seq)) + fname := filepath.Join(cs.dirForUser(user), fnameForShard(user, seq)) if err := os.WriteFile(fname, data, 0664); err != nil { return "", err } @@ -579,7 +603,18 @@ func (ds *DeltaSession) CloseWithRoot(ctx context.Context, root cid.Cid, rev str return nil, fmt.Errorf("cannot write to readonly deltaSession") } - return ds.cs.writeNewShard(ctx, root, rev, ds.user, ds.seq, ds.blks, ds.rmcids) + switch ocs := ds.cs.(type) { + case *FileCarStore: + return ocs.writeNewShard(ctx, root, rev, ds.user, ds.seq, ds.blks, ds.rmcids) + case *NonArchivalCarstore: + slice, err := blocksToCar(ctx, root, rev, ds.blks) + if err != nil { + return nil, err + } + return slice, ocs.updateLastCommit(ctx, ds.user, rev, root) + default: + return nil, fmt.Errorf("unsupported carstore type") + } } func WriteCarHeader(w io.Writer, root cid.Cid) (int64, error) { @@ -600,6 +635,23 @@ func WriteCarHeader(w io.Writer, root cid.Cid) (int64, error) { return hnw, nil } +func blocksToCar(ctx context.Context, root cid.Cid, rev string, blks map[cid.Cid]blockformat.Block) ([]byte, error) { + buf := new(bytes.Buffer) + _, err := WriteCarHeader(buf, root) + if err != nil { + return nil, fmt.Errorf("failed to write car header: %w", err) + } + + for k, blk := range blks { + _, err := LdWrite(buf, k.Bytes(), blk.RawData()) + if err != nil { + return nil, fmt.Errorf("failed to write block: %w", err) + } + } + + return buf.Bytes(), nil +} + func (cs *FileCarStore) writeNewShard(ctx context.Context, root cid.Cid, rev string, user models.Uid, seq int, blks map[cid.Cid]blockformat.Block, rmcids map[cid.Cid]bool) ([]byte, error) { buf := new(bytes.Buffer) @@ -638,10 +690,12 @@ func (cs *FileCarStore) writeNewShard(ctx context.Context, root cid.Cid, rev str offset += nw } + start := time.Now() path, err := cs.writeNewShardFile(ctx, user, seq, buf.Bytes()) if err != nil { return nil, fmt.Errorf("failed to write shard file: %w", err) } + writeShardFileDuration.Observe(time.Since(start).Seconds()) shard := CarShard{ Root: models.DbCID{CID: root}, @@ -652,9 +706,11 @@ func (cs 
*FileCarStore) writeNewShard(ctx context.Context, root cid.Cid, rev str Rev: rev, } + start = time.Now() if err := cs.putShard(ctx, &shard, brefs, rmcids, false); err != nil { return nil, err } + writeShardMetadataDuration.Observe(time.Since(start).Seconds()) return buf.Bytes(), nil } @@ -872,7 +928,7 @@ func (cs *FileCarStore) deleteShards(ctx context.Context, shs []CarShard) error if !os.IsNotExist(err) { return err } - log.Warnw("shard file we tried to delete did not exist", "shard", sh.ID, "path", sh.Path) + cs.log.Warn("shard file we tried to delete did not exist", "shard", sh.ID, "path", sh.Path) } } @@ -982,7 +1038,7 @@ func (cs *FileCarStore) openNewCompactedShardFile(ctx context.Context, user mode // TODO: some overwrite protections // NOTE CreateTemp is used for creating a non-colliding file, but we keep it and don't delete it so don't think of it as "temporary". // This creates "sh-%d-%d%s" with some random stuff in the last position - fi, err := os.CreateTemp(cs.rootDir, fnameForShard(user, seq)) + fi, err := os.CreateTemp(cs.dirForUser(user), fnameForShard(user, seq)) if err != nil { return nil, "", err } @@ -1023,7 +1079,7 @@ func shardSize(sh *CarShard) (int64, error) { st, err := os.Stat(sh.Path) if err != nil { if os.IsNotExist(err) { - log.Warnw("missing shard, return size of zero", "path", sh.Path, "shard", sh.ID) + slog.Warn("missing shard, return size of zero", "path", sh.Path, "shard", sh.ID, "system", "carstore") return 0, nil } return 0, fmt.Errorf("stat %q: %w", sh.Path, err) @@ -1144,7 +1200,7 @@ func (cs *FileCarStore) CompactUserShards(ctx context.Context, user models.Uid, // still around but we're doing that anyways since compaction isn't a // perfect process - log.Debugw("repo has dirty dupes", "count", len(dupes), "uid", user, "staleRefs", len(staleRefs), "blockRefs", len(brefs)) + cs.log.Debug("repo has dirty dupes", "count", len(dupes), "uid", user, "staleRefs", len(staleRefs), "blockRefs", len(brefs)) //return nil, fmt.Errorf("WIP: not currently handling this case") } @@ -1339,7 +1395,7 @@ func (cs *FileCarStore) compactBucket(ctx context.Context, user models.Uid, b *c }); err != nil { // If we ever fail to iterate a shard file because its // corrupted, just log an error and skip the shard - log.Errorw("iterating blocks in shard", "shard", s.ID, "err", err, "uid", user) + cs.log.Error("iterating blocks in shard", "shard", s.ID, "err", err, "uid", user) } } @@ -1357,7 +1413,7 @@ func (cs *FileCarStore) compactBucket(ctx context.Context, user models.Uid, b *c _ = fi.Close() if err2 := os.Remove(fi.Name()); err2 != nil { - log.Errorf("failed to remove shard file (%s) after failed db transaction: %w", fi.Name(), err2) + cs.log.Error("failed to remove shard file after failed db transaction", "path", fi.Name(), "err", err2) } return err diff --git a/carstore/metrics.go b/carstore/metrics.go new file mode 100644 index 000000000..0d2a0794a --- /dev/null +++ b/carstore/metrics.go @@ -0,0 +1,18 @@ +package carstore + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var writeShardFileDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "carstore_write_shard_file_duration", + Help: "Duration of writing shard file to disk", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), +}) + +var writeShardMetadataDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "carstore_write_shard_metadata_duration", + Help: "Duration of writing shard metadata to DB", + Buckets: 
prometheus.ExponentialBuckets(0.001, 2, 15), +}) diff --git a/carstore/nonarchive.go b/carstore/nonarchive.go new file mode 100644 index 000000000..5197b1c6e --- /dev/null +++ b/carstore/nonarchive.go @@ -0,0 +1,254 @@ +package carstore + +import ( + "bytes" + "context" + "fmt" + "io" + "log/slog" + "sync" + + "github.com/bluesky-social/indigo/models" + blockformat "github.com/ipfs/go-block-format" + "github.com/ipfs/go-cid" + "github.com/ipfs/go-datastore" + blockstore "github.com/ipfs/go-ipfs-blockstore" + car "github.com/ipld/go-car" + "go.opentelemetry.io/otel" + "gorm.io/gorm" + "gorm.io/gorm/clause" +) + +type NonArchivalCarstore struct { + db *gorm.DB + + lk sync.Mutex + lastCommitCache map[models.Uid]*commitRefInfo + + log *slog.Logger +} + +func NewNonArchivalCarstore(db *gorm.DB) (*NonArchivalCarstore, error) { + if err := db.AutoMigrate(&commitRefInfo{}); err != nil { + return nil, err + } + + return &NonArchivalCarstore{ + db: db, + lastCommitCache: make(map[models.Uid]*commitRefInfo), + log: slog.Default().With("system", "carstorena"), + }, nil +} + +type commitRefInfo struct { + ID uint `gorm:"primarykey"` + Uid models.Uid `gorm:"uniqueIndex"` + Rev string + Root models.DbCID +} + +func (cs *NonArchivalCarstore) checkLastShardCache(user models.Uid) *commitRefInfo { + cs.lk.Lock() + defer cs.lk.Unlock() + + ls, ok := cs.lastCommitCache[user] + if ok { + return ls + } + + return nil } + +func (cs *NonArchivalCarstore) removeLastShardCache(user models.Uid) { + cs.lk.Lock() + defer cs.lk.Unlock() + + delete(cs.lastCommitCache, user) +} + +func (cs *NonArchivalCarstore) putLastShardCache(ls *commitRefInfo) { + cs.lk.Lock() + defer cs.lk.Unlock() + + cs.lastCommitCache[ls.Uid] = ls +} + +func (cs *NonArchivalCarstore) loadCommitRefInfo(ctx context.Context, user models.Uid) (*commitRefInfo, error) { + var out commitRefInfo + if err := cs.db.Find(&out, "uid = ?", user).Error; err != nil { + return nil, err + } + + return &out, nil +} + +func (cs *NonArchivalCarstore) getCommitRefInfo(ctx context.Context, user models.Uid) (*commitRefInfo, error) { + ctx, span := otel.Tracer("carstore").Start(ctx, "getCommitRefInfo") + defer span.End() + + maybeLs := cs.checkLastShardCache(user) + if maybeLs != nil { + return maybeLs, nil + } + + lastShard, err := cs.loadCommitRefInfo(ctx, user) + if err != nil { + return nil, err + } + + cs.putLastShardCache(lastShard) + return lastShard, nil +} + +func (cs *NonArchivalCarstore) updateLastCommit(ctx context.Context, uid models.Uid, rev string, cid cid.Cid) error { + cri := &commitRefInfo{ + Uid: uid, + Rev: rev, + Root: models.DbCID{CID: cid}, + } + + if err := cs.db.Clauses(clause.OnConflict{ + Columns: []clause.Column{{Name: "uid"}}, + UpdateAll: true, + }).Create(cri).Error; err != nil { + return fmt.Errorf("update or set last commit info: %w", err) + } + + cs.putLastShardCache(cri) + + return nil +} + +func (cs *NonArchivalCarstore) NewDeltaSession(ctx context.Context, user models.Uid, since *string) (*DeltaSession, error) { + ctx, span := otel.Tracer("carstore").Start(ctx, "NewSession") + defer span.End() + + // TODO: ensure that we don't write updates on top of the wrong head + // this needs to be a compare and swap type operation + lastShard, err := cs.getCommitRefInfo(ctx, user) + if err != nil { + return nil, err + } + + if since != nil && *since != lastShard.Rev { + cs.log.Warn("revision mismatch", "since", *since, "lastRev", lastShard.Rev, "err", ErrRepoBaseMismatch) + } + + return &DeltaSession{ + fresh: 
blockstore.NewBlockstore(datastore.NewMapDatastore()), + blks: make(map[cid.Cid]blockformat.Block), + base: &userView{ + user: user, + cs: cs, + prefetch: true, + cache: make(map[cid.Cid]blockformat.Block), + }, + user: user, + baseCid: lastShard.Root.CID, + cs: cs, + seq: 0, + lastRev: lastShard.Rev, + }, nil +} + +func (cs *NonArchivalCarstore) ReadOnlySession(user models.Uid) (*DeltaSession, error) { + return &DeltaSession{ + base: &userView{ + user: user, + cs: cs, + prefetch: false, + cache: make(map[cid.Cid]blockformat.Block), + }, + readonly: true, + user: user, + cs: cs, + }, nil +} + +// TODO: incremental is only ever called true, remove the param +func (cs *NonArchivalCarstore) ReadUserCar(ctx context.Context, user models.Uid, sinceRev string, incremental bool, w io.Writer) error { + return fmt.Errorf("not supported in non-archival mode") +} + +func (cs *NonArchivalCarstore) ImportSlice(ctx context.Context, uid models.Uid, since *string, carslice []byte) (cid.Cid, *DeltaSession, error) { + ctx, span := otel.Tracer("carstore").Start(ctx, "ImportSlice") + defer span.End() + + carr, err := car.NewCarReader(bytes.NewReader(carslice)) + if err != nil { + return cid.Undef, nil, err + } + + if len(carr.Header.Roots) != 1 { + return cid.Undef, nil, fmt.Errorf("invalid car file, header must have a single root (has %d)", len(carr.Header.Roots)) + } + + ds, err := cs.NewDeltaSession(ctx, uid, since) + if err != nil { + return cid.Undef, nil, fmt.Errorf("new delta session failed: %w", err) + } + + var cids []cid.Cid + for { + blk, err := carr.Next() + if err != nil { + if err == io.EOF { + break + } + return cid.Undef, nil, err + } + + cids = append(cids, blk.Cid()) + + if err := ds.Put(ctx, blk); err != nil { + return cid.Undef, nil, err + } + } + + return carr.Header.Roots[0], ds, nil +} + +func (cs *NonArchivalCarstore) GetUserRepoHead(ctx context.Context, user models.Uid) (cid.Cid, error) { + lastShard, err := cs.getCommitRefInfo(ctx, user) + if err != nil { + return cid.Undef, err + } + if lastShard.ID == 0 { + return cid.Undef, nil + } + + return lastShard.Root.CID, nil +} + +func (cs *NonArchivalCarstore) GetUserRepoRev(ctx context.Context, user models.Uid) (string, error) { + lastShard, err := cs.getCommitRefInfo(ctx, user) + if err != nil { + return "", err + } + if lastShard.ID == 0 { + return "", nil + } + + return lastShard.Rev, nil +} + +func (cs *NonArchivalCarstore) Stat(ctx context.Context, usr models.Uid) ([]UserStat, error) { + return nil, nil +} + +func (cs *NonArchivalCarstore) WipeUserData(ctx context.Context, user models.Uid) error { + if err := cs.db.Raw("DELETE from commit_ref_infos WHERE uid = ?", user).Error; err != nil { + return err + } + + cs.removeLastShardCache(user) + return nil +} + +func (cs *NonArchivalCarstore) GetCompactionTargets(ctx context.Context, shardCount int) ([]CompactionTarget, error) { + return nil, fmt.Errorf("compaction not supported on non-archival") +} + +func (cs *NonArchivalCarstore) CompactUserShards(ctx context.Context, user models.Uid, skipBigShards bool) (*CompactionStats, error) { + return nil, fmt.Errorf("compaction not supported in non-archival") +} diff --git a/carstore/repo_test.go b/carstore/repo_test.go index a4d2c8cb8..8366cab95 100644 --- a/carstore/repo_test.go +++ b/carstore/repo_test.go @@ -30,8 +30,13 @@ func testCarStore() (CarStore, func(), error) { return nil, nil, err } - sharddir := filepath.Join(tempdir, "shards") - if err := os.MkdirAll(sharddir, 0775); err != nil { + sharddir1 := filepath.Join(tempdir, "shards1") 
+ if err := os.MkdirAll(sharddir1, 0775); err != nil { + return nil, nil, err + } + + sharddir2 := filepath.Join(tempdir, "shards2") + if err := os.MkdirAll(sharddir2, 0775); err != nil { return nil, nil, err } @@ -45,7 +50,7 @@ func testCarStore() (CarStore, func(), error) { return nil, nil, err } - cs, err := NewCarStore(db, sharddir) + cs, err := NewCarStore(db, []string{sharddir1, sharddir2}) if err != nil { return nil, nil, err } diff --git a/cmd/astrolabe/handlers.go b/cmd/astrolabe/handlers.go index 4a735e8ed..f974fcf4c 100644 --- a/cmd/astrolabe/handlers.go +++ b/cmd/astrolabe/handlers.go @@ -6,6 +6,7 @@ import ( "net/http" "strings" + "github.com/bluesky-social/indigo/api/agnostic" comatproto "github.com/bluesky-social/indigo/api/atproto" _ "github.com/bluesky-social/indigo/api/bsky" "github.com/bluesky-social/indigo/atproto/data" @@ -64,7 +65,7 @@ func (srv *Server) WebAccount(c echo.Context) error { atid, err := syntax.ParseAtIdentifier(c.Param("atid")) if err != nil { - return echo.NewHTTPError(404, fmt.Sprintf("failed to parse handle or DID")) + return echo.NewHTTPError(404, "failed to parse handle or DID") } ident, err := srv.dir.Lookup(ctx, *atid) @@ -96,7 +97,7 @@ func (srv *Server) WebRepo(c echo.Context) error { atid, err := syntax.ParseAtIdentifier(c.Param("atid")) if err != nil { - return echo.NewHTTPError(400, fmt.Sprintf("failed to parse handle or DID")) + return echo.NewHTTPError(400, "failed to parse handle or DID") } ident, err := srv.dir.Lookup(ctx, *atid) @@ -133,12 +134,12 @@ func (srv *Server) WebRepoCollection(c echo.Context) error { atid, err := syntax.ParseAtIdentifier(c.Param("atid")) if err != nil { - return echo.NewHTTPError(400, fmt.Sprintf("failed to parse handle or DID")) + return echo.NewHTTPError(400, "failed to parse handle or DID") } collection, err := syntax.ParseNSID(c.Param("collection")) if err != nil { - return echo.NewHTTPError(400, fmt.Sprintf("failed to parse collection NSID")) + return echo.NewHTTPError(400, "failed to parse collection NSID") } ident, err := srv.dir.Lookup(ctx, *atid) @@ -161,7 +162,7 @@ func (srv *Server) WebRepoCollection(c echo.Context) error { cursor := c.QueryParam("cursor") // collection string, cursor string, limit int64, repo string, reverse bool, rkeyEnd string, rkeyStart string - resp, err := RepoListRecords(ctx, &xrpcc, collection.String(), cursor, 100, ident.DID.String(), false, "", "") + resp, err := agnostic.RepoListRecords(ctx, &xrpcc, collection.String(), cursor, 100, ident.DID.String(), false, "", "") if err != nil { return err } @@ -191,17 +192,17 @@ func (srv *Server) WebRepoRecord(c echo.Context) error { atid, err := syntax.ParseAtIdentifier(c.Param("atid")) if err != nil { - return echo.NewHTTPError(400, fmt.Sprintf("failed to parse handle or DID")) + return echo.NewHTTPError(400, "failed to parse handle or DID") } collection, err := syntax.ParseNSID(c.Param("collection")) if err != nil { - return echo.NewHTTPError(400, fmt.Sprintf("failed to parse collection NSID")) + return echo.NewHTTPError(400, "failed to parse collection NSID") } rkey, err := syntax.ParseRecordKey(c.Param("rkey")) if err != nil { - return echo.NewHTTPError(400, fmt.Sprintf("failed to parse record key")) + return echo.NewHTTPError(400, "failed to parse record key") } ident, err := srv.dir.Lookup(ctx, *atid) @@ -218,7 +219,7 @@ func (srv *Server) WebRepoRecord(c echo.Context) error { xrpcc := xrpc.Client{ Host: ident.PDSEndpoint(), } - resp, err := RepoGetRecord(ctx, &xrpcc, "", collection.String(), ident.DID.String(), 
rkey.String()) + resp, err := agnostic.RepoGetRecord(ctx, &xrpcc, "", collection.String(), ident.DID.String(), rkey.String()) if err != nil { return echo.NewHTTPError(400, fmt.Sprintf("failed to load record: %s", err)) } diff --git a/cmd/beemo/firehose_consumer.go b/cmd/beemo/firehose_consumer.go index 55f2edc87..4f96026a6 100644 --- a/cmd/beemo/firehose_consumer.go +++ b/cmd/beemo/firehose_consumer.go @@ -57,7 +57,7 @@ func RunFirehoseConsumer(ctx context.Context, logger *slog.Logger, relayHost str ) logger.Info("beemo firehose scheduler configured", "scheduler", "parallel", "workers", parallelism) - return events.HandleRepoStream(ctx, con, scheduler) + return events.HandleRepoStream(ctx, con, scheduler, logger) } // TODO: move this to a "ParsePath" helper in syntax package? diff --git a/cmd/bigsky/copy_pdses.py b/cmd/bigsky/copy_pdses.py new file mode 100644 index 000000000..093f8f1e3 --- /dev/null +++ b/cmd/bigsky/copy_pdses.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +# +# pip install requests +# +# python3 copy_pdses.py --admin-key hunter2 --source-url http://srcrelay:2470 --dest-url http://destrelay:2470 + +import json +import logging +import sys +import urllib.parse + +import requests + +logger = logging.getLogger(__name__) + +class relay: + def __init__(self, rooturl, headers=None, session=None): + "rooturl string, headers dict or None, session requests.Session() or None" + self.rooturl = rooturl + self.headers = headers or dict() + self.session = session or requests.Session() + + def crawl(self, host): + pheaders = dict(self.headers) + pheaders['Content-Type'] = 'application/json' + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/requestCrawl') + response = self.session.post(url, headers=pheaders, data=json.dumps({"hostname": host})) + if response.status_code != 200: + return False + return True + + def crawlAndSetLimits(self, host, limits): + "host string, limits dict" + if not self.crawl(host): + logger.error("requestCrawl %s failed", host) + return + if limits is None: + logger.debug("requestCrawl %s OK", host) + elif self.setLimits(host, limits): + logger.debug("requestCrawl + changeLimits %s OK", host) + def setLimits(self, host, limits): + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/changeLimits') + plimits = dict(limits) + plimits["host"] = host + pheaders = dict(self.headers) + pheaders['Content-Type'] = 'application/json' + response = self.session.post(url, headers=pheaders, data=json.dumps(plimits)) + if response.status_code != 200: + logger.error("%s %s : %d %r", url, host, response.status_code, response.text) + return False + return True + + def crawlAndBlock(self, host): + "make relay aware of PDS, and block it" + if not self.crawl(host): + logger.error("requestCrawl %s failed", host) + return + if self.block(host): + logger.debug("requestCrawl + block %s OK", host) + + def block(self, host): + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/block') + response = self.session.post(url, headers=self.headers, data='', params={"host":host}) + if response.status_code != 200: + logger.error("%s %s : %d %r", url, host, response.status_code, response.text) + return False + return True + + def unblock(self, host): + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/unblock') + response = self.session.post(url, headers=self.headers, data='', params={"host":host}) + if response.status_code != 200: + logger.error("%s %s : %d %r", url, host, response.status_code, response.text) + return False + 
return True + + def pdsList(self): + "GET /admin/pds/list" + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/list') + response = self.session.get(url, headers=self.headers) + if response.status_code != 200: + logger.error("%s : %d %r", url, response.status_code, response.text) + return None + return response.json() + +def makeByHost(they): + out = dict() + for rec in they: + out[rec['Host']] = rec + return out + +def makeLimits(rec): + "for submitting to changeLimits" + return { + "host": rec['Host'], + "per_second":rec['RateLimit'], + "per_hour":rec['HourlyEventLimit'], + "per_day":rec['DailyEventLimit'], + "crawl_rate":rec['CrawlRateLimit'], + "repo_limit":rec['RepoLimit'], + } + +def makeRequestCrawl(rec): + "for submitting to requestCrawl" + return {"hostname":rec["Host"]} + +def de(a,b): + # dict equal + for ka, va in a.items(): + vb = b[ka] + if (va is None) and (vb is None): + continue + if va == vb: + continue + return False + for kb in b.keys(): + if kb not in a: + return False + return True + +def main(): + import argparse + ap = argparse.ArgumentParser() + ap.add_argument('--admin-key', default=None, help='relay auth bearer token', required=True) + ap.add_argument('--source-url', default=None, help='base url to GET /admin/pds/list') + ap.add_argument('--source-json', default=None, help='load /admin/pds/list json from file') + ap.add_argument('--dest-url', default=None, help='dest URL to POST requestCrawl etc to') + ap.add_argument('--dry-run', default=False, action='store_true') + ap.add_argument('--verbose', default=False, action='store_true') + args = ap.parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + headers = {'Authorization': 'Bearer ' + args.admin_key} + + if args.source_json: + with open(args.source_json, 'rt') as fin: + sourceList = json.load(fin) + elif args.source_url: + relaySession = relay(args.source_url, headers) + sourceList = relaySession.pdsList() + else: + sys.stdout.write("need --source-url or --source-json\n") + sys.exit(1) + + r2 = relay(args.dest_url, headers) + destList = r2.pdsList() + + source = makeByHost(sourceList) + dests = makeByHost(destList) + + snotd = [] + dnots = [] + diflim = [] + difblock = [] + recrawl = [] + + for k1, v1 in source.items(): + v2 = dests.get(k1) + if v2 is None: + snotd.append(v1) + continue + lim1 = makeLimits(v1) + lim2 = makeLimits(v2) + if v1["Blocked"] != v2["Blocked"]: + difblock.append((k1,v1["Blocked"])) + if v1["Blocked"]: + continue + if not de(lim1, lim2): + diflim.append(lim1) + if v1["HasActiveConnection"] and not v2["HasActiveConnection"]: + recrawl.append(k1) + for k2 in dests.keys(): + if k2 not in source: + dnots.append(k2) + + logger.debug("%d source not dest", len(snotd)) + for rec in snotd: + if rec["Blocked"]: + if args.dry_run: + sys.stdout.write("crawl and block: {!r}\n".format(rec["Host"])) + else: + r2.crawlAndBlock(rec["Host"]) + else: + limits = makeLimits(rec) + if args.dry_run: + sys.stdout.write("crawl and limit: {}\n".format(json.dumps(limits))) + else: + r2.crawlAndSetLimits(rec["Host"], limits) + logger.debug("adjust limits: %d", len(diflim)) + for limits in diflim: + if args.dry_run: + sys.stdout.write("set limits: {}\n".format(json.dumps(limits))) + else: + r2.setLimits(limits["host"], limits) + logger.debug("adjust block status: %d", len(difblock)) + for host, blocked in difblock: + if args.dry_run: + sys.stdout.write("{} block={}\n".format(host, blocked)) + else: + if blocked: + r2.block(host) + else: + 
r2.unblock(host) + logger.debug("restart requestCrawl: %d", len(recrawl)) + for host in recrawl: + if args.dry_run: + logger.info("requestCrawl %s", host) + else: + if r2.crawl(host): + logger.debug("requestCrawl %s OK", host) + logger.info("%d in dest but not source", len(dnots)) + for k2 in dnots: + logger.debug("%s", k2) + + + + + +if __name__ == '__main__': + main() diff --git a/cmd/bigsky/main.go b/cmd/bigsky/main.go index 540796f51..33c798508 100644 --- a/cmd/bigsky/main.go +++ b/cmd/bigsky/main.go @@ -3,8 +3,10 @@ package main import ( "context" "fmt" + "log/slog" "net/http" _ "net/http/pprof" + "net/url" "os" "os/signal" "path/filepath" @@ -29,7 +31,6 @@ import ( _ "go.uber.org/automaxprocs" "github.com/carlmjohnson/versioninfo" - logging "github.com/ipfs/go-log" "github.com/urfave/cli/v2" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -41,7 +42,7 @@ import ( "gorm.io/plugin/opentelemetry/tracing" ) -var log = logging.Logger("bigsky") +var log = slog.Default().With("system", "bigsky") func init() { // control log level using, eg, GOLOG_LOG_LEVEL=debug @@ -50,7 +51,8 @@ func init() { func main() { if err := run(os.Args); err != nil { - log.Fatal(err) + slog.Error(err.Error()) + os.Exit(1) } } @@ -189,6 +191,10 @@ func run(args []string) error { EnvVars: []string{"RELAY_DID_CACHE_SIZE"}, Value: 5_000_000, }, + &cli.StringSliceFlag{ + Name: "did-memcached", + EnvVars: []string{"RELAY_DID_MEMCACHED"}, + }, &cli.DurationFlag{ Name: "event-playback-ttl", Usage: "time to live for event playback buffering (only applies to disk persister)", @@ -200,6 +206,21 @@ func run(args []string) error { EnvVars: []string{"RELAY_NUM_COMPACTION_WORKERS"}, Value: 2, }, + &cli.StringSliceFlag{ + Name: "carstore-shard-dirs", + Usage: "specify list of shard directories for carstore storage, overrides default storage within datadir", + EnvVars: []string{"RELAY_CARSTORE_SHARD_DIRS"}, + }, + &cli.StringSliceFlag{ + Name: "next-crawler", + Usage: "forward POST requestCrawl to this url, should be machine root url and not xrpc/requestCrawl, comma separated list", + EnvVars: []string{"RELAY_NEXT_CRAWLER"}, + }, + &cli.BoolFlag{ + Name: "non-archival", + EnvVars: []string{"RELAY_NON_ARCHIVAL"}, + Value: false, + }, } app.Action = runBigsky @@ -213,8 +234,8 @@ func setupOTEL(cctx *cli.Context) error { env = "dev" } if cctx.Bool("jaeger") { - url := "http://localhost:14268/api/traces" - exp, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(url))) + jaegerUrl := "http://localhost:14268/api/traces" + exp, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(jaegerUrl))) if err != nil { return err } @@ -240,19 +261,20 @@ func setupOTEL(cctx *cli.Context) error { // At a minimum, you need to set // OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 if ep := cctx.String("otel-exporter-otlp-endpoint"); ep != "" { - log.Infow("setting up trace exporter", "endpoint", ep) + slog.Info("setting up trace exporter", "endpoint", ep) ctx, cancel := context.WithCancel(context.Background()) defer cancel() exp, err := otlptracehttp.New(ctx) if err != nil { - log.Fatalw("failed to create trace exporter", "error", err) + slog.Error("failed to create trace exporter", "error", err) + os.Exit(1) } defer func() { ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() if err := exp.Shutdown(ctx); err != nil { - log.Errorw("failed to shutdown trace exporter", "error", err) + slog.Error("failed to shutdown trace exporter", "error", err) } }() @@ -277,6 +299,11 @@ 
func runBigsky(cctx *cli.Context) error { signals := make(chan os.Signal, 1) signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) + _, err := cliutil.SetupSlog(cliutil.LogOptions{}) + if err != nil { + return err + } + // start observability/tracing (OTEL and jaeger) if err := setupOTEL(cctx); err != nil { return err @@ -289,14 +316,14 @@ func runBigsky(cctx *cli.Context) error { return err } - log.Infow("setting up main database") + slog.Info("setting up main database") dburl := cctx.String("db-url") db, err := cliutil.SetupDatabase(dburl, cctx.Int("max-metadb-connections")) if err != nil { return err } - log.Infow("setting up carstore database") + slog.Info("setting up carstore database") csdburl := cctx.String("carstore-db-url") csdb, err := cliutil.SetupDatabase(csdburl, cctx.Int("max-carstore-connections")) if err != nil { @@ -312,24 +339,62 @@ func runBigsky(cctx *cli.Context) error { } } - os.MkdirAll(filepath.Dir(csdir), os.ModePerm) - cstore, err := carstore.NewCarStore(csdb, csdir) - if err != nil { - return err + csdirs := []string{csdir} + if paramDirs := cctx.StringSlice("carstore-shard-dirs"); len(paramDirs) > 0 { + csdirs = paramDirs + } + + for _, csd := range csdirs { + if err := os.MkdirAll(filepath.Dir(csd), os.ModePerm); err != nil { + return err + } } - mr := did.NewMultiResolver() + var cstore carstore.CarStore - didr := &api.PLCServer{Host: cctx.String("plc-host")} - mr.AddHandler("plc", didr) + if cctx.Bool("non-archival") { + cs, err := carstore.NewNonArchivalCarstore(csdb) + if err != nil { + return err + } + + cstore = cs + } else { + cs, err := carstore.NewCarStore(csdb, csdirs) + if err != nil { + return err + } - webr := did.WebResolver{} - if cctx.Bool("crawl-insecure-ws") { - webr.Insecure = true + cstore = cs } - mr.AddHandler("web", &webr) - cachedidr := plc.NewCachingDidResolver(mr, time.Hour*24, cctx.Int("did-cache-size")) + // DID RESOLUTION + // 1. the outside world, PLCSerever or Web + // 2. (maybe memcached) + // 3. 
in-process cache + var cachedidr did.Resolver + { + mr := did.NewMultiResolver() + + didr := &api.PLCServer{Host: cctx.String("plc-host")} + mr.AddHandler("plc", didr) + + webr := did.WebResolver{} + if cctx.Bool("crawl-insecure-ws") { + webr.Insecure = true + } + mr.AddHandler("web", &webr) + + var prevResolver did.Resolver + memcachedServers := cctx.StringSlice("did-memcached") + if len(memcachedServers) > 0 { + prevResolver = plc.NewMemcachedDidResolver(mr, time.Hour*24, memcachedServers) + } else { + prevResolver = mr + } + + cachedidr = plc.NewCachingDidResolver(prevResolver, time.Hour*24, cctx.Int("did-cache-size")) + } kmgr := indexer.NewKeyManager(cachedidr, nil) @@ -338,7 +403,7 @@ func runBigsky(cctx *cli.Context) error { var persister events.EventPersistence if dpd := cctx.String("disk-persister-dir"); dpd != "" { - log.Infow("setting up disk persister") + slog.Info("setting up disk persister") pOpts := events.DefaultDiskPersistOptions() pOpts.Retention = cctx.Duration("event-playback-ttl") @@ -361,10 +426,11 @@ func runBigsky(cctx *cli.Context) error { rf := indexer.NewRepoFetcher(db, repoman, cctx.Int("max-fetch-concurrency")) - ix, err := indexer.NewIndexer(db, notifman, evtman, cachedidr, rf, true, cctx.Bool("spidering"), false) + ix, err := indexer.NewIndexer(db, notifman, evtman, cachedidr, rf, true, false, cctx.Bool("spidering")) if err != nil { return err } + defer ix.Shutdown() rlskip := cctx.String("bsky-social-rate-limit-skip") ix.ApplyPDSClientSettings = func(c *xrpc.Client) { @@ -387,7 +453,7 @@ func runBigsky(cctx *cli.Context) error { repoman.SetEventHandler(func(ctx context.Context, evt *repomgr.RepoEvent) { if err := ix.HandleRepoEvent(ctx, evt); err != nil { - log.Errorw("failed to handle repo event", "err", err) + slog.Error("failed to handle repo event", "err", err) } }, false) @@ -411,7 +477,7 @@ func runBigsky(cctx *cli.Context) error { } } - log.Infow("constructing bgs") + slog.Info("constructing bgs") bgsConfig := libbgs.DefaultBGSConfig() bgsConfig.SSL = !cctx.Bool("crawl-insecure-ws") bgsConfig.CompactInterval = cctx.Duration("compact-interval") @@ -419,6 +485,19 @@ func runBigsky(cctx *cli.Context) error { bgsConfig.MaxQueuePerPDS = cctx.Int64("max-queue-per-pds") bgsConfig.DefaultRepoLimit = cctx.Int64("default-repo-limit") bgsConfig.NumCompactionWorkers = cctx.Int("num-compaction-workers") + nextCrawlers := cctx.StringSlice("next-crawler") + if len(nextCrawlers) != 0 { + nextCrawlerUrls := make([]*url.URL, len(nextCrawlers)) + for i, tu := range nextCrawlers { + var err error + nextCrawlerUrls[i], err = url.Parse(tu) + if err != nil { + return fmt.Errorf("failed to parse next-crawler url: %w", err) + } + slog.Info("configuring relay for requestCrawl", "host", nextCrawlerUrls[i]) + } + bgsConfig.NextCrawlers = nextCrawlerUrls + } bgs, err := libbgs.NewBGS(db, ix, repoman, evtman, cachedidr, rf, hr, bgsConfig) if err != nil { return err @@ -433,7 +512,8 @@ func runBigsky(cctx *cli.Context) error { // set up metrics endpoint go func() { if err := bgs.StartMetrics(cctx.String("metrics-listen")); err != nil { - log.Fatalf("failed to start metrics endpoint: %s", err) + log.Error("failed to start metrics endpoint", "err", err) + os.Exit(1) } }() @@ -444,22 +524,22 @@ func runBigsky(cctx *cli.Context) error { bgsErr <- err }() - log.Infow("startup complete") + slog.Info("startup complete") select { case <-signals: log.Info("received shutdown signal") errs := bgs.Shutdown() for err := range errs { - log.Errorw("error during BGS shutdown", "err", err) + 
slog.Error("error during BGS shutdown", "err", err) } case err := <-bgsErr: if err != nil { - log.Errorw("error during BGS startup", "err", err) + slog.Error("error during BGS startup", "err", err) } log.Info("shutting down") errs := bgs.Shutdown() for err := range errs { - log.Errorw("error during BGS shutdown", "err", err) + slog.Error("error during BGS shutdown", "err", err) } } diff --git a/cmd/bigsky/resync_pdses.py b/cmd/bigsky/resync_pdses.py new file mode 100644 index 000000000..710aeb7d9 --- /dev/null +++ b/cmd/bigsky/resync_pdses.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +# +# pip install requests +# +# python3 resync_pdses.py --admin-key hunter2 --url http://myrelay:2470 host_per_line.txt + +import json +import sys +import urllib.parse + +import requests + + +# pds limits for POST /admin/pds/changeLimits +# {"host":"", "per_second": int, "per_hour": int, "per_day": int, "crawl_rate": int, "repo_limit": int} + +limitsKeys = ('per_second', 'per_hour', 'per_day', 'crawl_rate', 'repo_limit') + +def checkLimits(limits): + for k in limits.keys(): + if k not in limitsKeys: + raise Exception(f"unknown pds rate limits key {k!r}") + return True + +class relay: + def __init__(self, rooturl, headers=None, session=None): + "rooturl string, headers dict or None, session requests.Session() or None" + self.rooturl = rooturl + self.headers = headers or dict() + self.session = session or requests.Session() + + def resync(self, host): + "host string" + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/resync') + response = self.session.post(url, params={"host": host}, headers=self.headers, data='') + if response.status_code != 200: + sys.stderr.write(f"{url}?host={host} : ({response.status_code}) ({response.text!r})\n") + else: + sys.stderr.write(f"{url}?host={host} : OK\n") + + def crawlAndSetLimits(self, host, limits): + "host string, limits dict" + pheaders = dict(self.headers) + pheaders['Content-Type'] = 'application/json' + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/requestCrawl') + response = self.session.post(url, headers=pheaders, data=json.dumps({"hostname": host})) + if response.status_code != 200: + sys.stderr.write(f"{url} {host} : {response.status_code} {response.text!r}\n") + return + if limits is None: + sys.stderr.write(f"requestCrawl {host} OK\n") + url = urllib.parse.urljoin(self.rooturl, '/admin/pds/changeLimits') + plimits = dict(limits) + plimits["host"] = host + response = self.session.post(url, headers=pheaders, data=json.dumps(plimits)) + if response.status_code != 200: + sys.stderr.write(f"{url} {host} : {response.status_code} {response.text!r}\n") + return + sys.stderr.write(f"requestCrawl + changeLimits {host} OK\n") + +def main(): + import argparse + ap = argparse.ArgumentParser() + ap.add_argument('input', default='-', help='host per line text file to read, - for stdin') + ap.add_argument('--admin-key', default=None, help='relay auth bearer token', required=True) + ap.add_argument('--url', default=None, help='base url to POST /admin/pds/resync', required=True) + ap.add_argument('--resync', default=False, action='store_true', help='resync selected PDSes') + ap.add_argument('--limits', default=None, help='json pds rate limits') + ap.add_argument('--crawl', default=False, action='store_true', help='crawl & set limits') + args = ap.parse_args() + + headers = {'Authorization': 'Bearer ' + args.admin_key} + + relaySession = relay(args.url, headers) + + #url = urllib.parse.urljoin(args.url, '/admin/pds/resync') + + #sess = requests.Session() + if args.crawl 
and args.resync: + sys.stderr.write("should only specify one of --resync --crawl") + sys.exit(1) + if (not args.crawl) and (not args.resync): + sys.stderr.write("should specify one of --resync --crawl") + sys.exit(1) + + limits = None + if args.limits: + limits = json.loads(args.limits) + checkLimits(limits) + + if args.input == '-': + fin = sys.stdin + else: + fin = open(args.input, 'rt') + for line in fin: + if not line: + continue + line = line.strip() + if not line: + continue + if line[0] == '#': + continue + host = line + if args.crawl: + relaySession.crawlAndSetLimits(host, limits) + elif args.resync: + relaySession.resync(host) + # response = sess.post(url, params={"host": line}, headers=headers) + # if response.status_code != 200: + # sys.stderr.write(f"{url}?host={line} : ({response.status_code}) ({response.text!r})\n") + # else: + # sys.stderr.write(f"{url}?host={line} : OK\n") + +if __name__ == '__main__': + main() diff --git a/cmd/goat/account_migrate.go b/cmd/goat/account_migrate.go index fa3d60535..08925f48b 100644 --- a/cmd/goat/account_migrate.go +++ b/cmd/goat/account_migrate.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/bluesky-social/indigo/api/agnostic" comatproto "github.com/bluesky-social/indigo/api/atproto" "github.com/bluesky-social/indigo/atproto/syntax" "github.com/bluesky-social/indigo/xrpc" @@ -166,11 +167,11 @@ func runAccountMigrate(cctx *cli.Context) error { slog.Info("migrating preferences") // TODO: service proxy header for AppView? - prefResp, err := ActorGetPreferences(ctx, oldClient) + prefResp, err := agnostic.ActorGetPreferences(ctx, oldClient) if err != nil { return fmt.Errorf("failed fetching old preferences: %w", err) } - err = ActorPutPreferences(ctx, &newClient, &ActorPutPreferences_Input{ + err = agnostic.ActorPutPreferences(ctx, &newClient, &agnostic.ActorPutPreferences_Input{ Preferences: prefResp.Preferences, }) if err != nil { @@ -214,7 +215,7 @@ func runAccountMigrate(cctx *cli.Context) error { // NOTE: to work with did:web or non-PDS-managed did:plc, need to do manual migraiton process slog.Info("updating identity to new host") - credsResp, err := IdentityGetRecommendedDidCredentials(ctx, &newClient) + credsResp, err := agnostic.IdentityGetRecommendedDidCredentials(ctx, &newClient) if err != nil { return fmt.Errorf("failed fetching new credentials: %w", err) } @@ -223,7 +224,7 @@ func runAccountMigrate(cctx *cli.Context) error { return nil } - var unsignedOp IdentitySignPlcOperation_Input + var unsignedOp agnostic.IdentitySignPlcOperation_Input if err = json.Unmarshal(credsBytes, &unsignedOp); err != nil { return fmt.Errorf("failed parsing PLC op: %w", err) } @@ -231,12 +232,12 @@ func runAccountMigrate(cctx *cli.Context) error { // NOTE: could add additional sanity checks here that any extra rotation keys were retained, and that old alsoKnownAs and service entries are retained? The stakes aren't super high for the later, as PLC has the full history. PLC and the new PDS already implement some basic sanity checks. 
- signedPlcOpResp, err := IdentitySignPlcOperation(ctx, oldClient, &unsignedOp) + signedPlcOpResp, err := agnostic.IdentitySignPlcOperation(ctx, oldClient, &unsignedOp) if err != nil { return fmt.Errorf("failed requesting PLC operation signature: %w", err) } - err = IdentitySubmitPlcOperation(ctx, &newClient, &IdentitySubmitPlcOperation_Input{ + err = agnostic.IdentitySubmitPlcOperation(ctx, &newClient, &agnostic.IdentitySubmitPlcOperation_Input{ Operation: signedPlcOpResp.Operation, }) if err != nil { diff --git a/cmd/goat/account_plc.go b/cmd/goat/account_plc.go index a0dde653b..5ce62903a 100644 --- a/cmd/goat/account_plc.go +++ b/cmd/goat/account_plc.go @@ -6,6 +6,7 @@ import ( "fmt" "os" + "github.com/bluesky-social/indigo/api/agnostic" comatproto "github.com/bluesky-social/indigo/api/atproto" "github.com/urfave/cli/v2" @@ -56,7 +57,7 @@ func runAccountPlcRecommended(cctx *cli.Context) error { return err } - resp, err := IdentityGetRecommendedDidCredentials(ctx, xrpcc) + resp, err := agnostic.IdentityGetRecommendedDidCredentials(ctx, xrpcc) if err != nil { return err } @@ -109,7 +110,7 @@ func runAccountPlcSign(cctx *cli.Context) error { return err } - var body IdentitySignPlcOperation_Input + var body agnostic.IdentitySignPlcOperation_Input if err = json.Unmarshal(fileBytes, &body); err != nil { return fmt.Errorf("failed decoding PLC op JSON: %w", err) } @@ -119,7 +120,7 @@ func runAccountPlcSign(cctx *cli.Context) error { body.Token = &token } - resp, err := IdentitySignPlcOperation(ctx, xrpcc, &body) + resp, err := agnostic.IdentitySignPlcOperation(ctx, xrpcc, &body) if err != nil { return err } @@ -158,7 +159,7 @@ func runAccountPlcSubmit(cctx *cli.Context) error { return fmt.Errorf("failed decoding PLC op JSON: %w", err) } - err = IdentitySubmitPlcOperation(ctx, xrpcc, &IdentitySubmitPlcOperation_Input{ + err = agnostic.IdentitySubmitPlcOperation(ctx, xrpcc, &agnostic.IdentitySubmitPlcOperation_Input{ Operation: &op, }) if err != nil { diff --git a/cmd/goat/bsky_prefs.go b/cmd/goat/bsky_prefs.go index 725072344..789609125 100644 --- a/cmd/goat/bsky_prefs.go +++ b/cmd/goat/bsky_prefs.go @@ -6,6 +6,8 @@ import ( "fmt" "os" + "github.com/bluesky-social/indigo/api/agnostic" + "github.com/urfave/cli/v2" ) @@ -39,7 +41,7 @@ func runBskyPrefsExport(cctx *cli.Context) error { } // TODO: does indigo API code crash with unsupported preference '$type'? Eg "Lexicon decoder" with unsupported type. - resp, err := ActorGetPreferences(ctx, xrpcc) + resp, err := agnostic.ActorGetPreferences(ctx, xrpcc) if err != nil { return fmt.Errorf("failed fetching old preferences: %w", err) } @@ -77,7 +79,7 @@ func runBskyPrefsImport(cctx *cli.Context) error { return err } - err = ActorPutPreferences(ctx, xrpcc, &ActorPutPreferences_Input{ + err = agnostic.ActorPutPreferences(ctx, xrpcc, &agnostic.ActorPutPreferences_Input{ Preferences: prefsArray, }) if err != nil { diff --git a/cmd/goat/firehose.go b/cmd/goat/firehose.go index ab305a16e..d106c62ff 100644 --- a/cmd/goat/firehose.go +++ b/cmd/goat/firehose.go @@ -130,7 +130,7 @@ func runFirehose(cctx *cli.Context) error { rsc.EventHandler, ) slog.Info("starting firehose consumer", "relayHost", relayHost) - return events.HandleRepoStream(ctx, con, scheduler) + return events.HandleRepoStream(ctx, con, scheduler, nil) } // TODO: move this to a "ParsePath" helper in syntax package? 
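These goat changes replace the locally-copied XRPC helpers with the shared `api/agnostic` package, whose getRecord/listRecords variants return record bodies as raw JSON rather than decoded Lexicon types. A minimal sketch of that usage (the PDS host, DID, and record key below are hypothetical placeholders, not values from this diff):

```go
package main

import (
	"context"
	"encoding/json"
	"fmt"

	"github.com/bluesky-social/indigo/api/agnostic"
	"github.com/bluesky-social/indigo/xrpc"
)

func main() {
	ctx := context.Background()

	// Hypothetical PDS endpoint; goat resolves this from the account's identity.
	xrpcc := xrpc.Client{Host: "https://pds.example.com"}

	// cid is left empty to fetch the most recent version of the record.
	resp, err := agnostic.RepoGetRecord(ctx, &xrpcc, "",
		"app.bsky.feed.post",        // collection NSID
		"did:plc:exampleexample123", // hypothetical repo DID
		"3kexamplerkey22")           // hypothetical record key
	if err != nil {
		panic(err)
	}

	// Value is a *json.RawMessage rather than a generated Lexicon type, so the
	// caller can handle records whose schema it has no codegen for.
	var generic map[string]any
	if err := json.Unmarshal(*resp.Value, &generic); err != nil {
		panic(err)
	}
	fmt.Println(generic["$type"], resp.Uri)
}
```

Because `Value` stays raw JSON, commands like `goat record` can round-trip records from collections they have no generated types for, which is the point of splitting these helpers out of `cmd/goat` into `api/agnostic`.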
diff --git a/cmd/goat/net.go b/cmd/goat/net.go index d7bb8e8f1..f0f2d9c4d 100644 --- a/cmd/goat/net.go +++ b/cmd/goat/net.go @@ -5,6 +5,7 @@ import ( "fmt" "log/slog" + "github.com/bluesky-social/indigo/api/agnostic" "github.com/bluesky-social/indigo/atproto/data" "github.com/bluesky-social/indigo/atproto/identity" "github.com/bluesky-social/indigo/atproto/syntax" @@ -17,7 +18,7 @@ func fetchRecord(ctx context.Context, ident identity.Identity, aturi syntax.ATUR xrpcc := xrpc.Client{ Host: ident.PDSEndpoint(), } - resp, err := RepoGetRecord(ctx, &xrpcc, "", aturi.Collection().String(), ident.DID.String(), aturi.RecordKey().String()) + resp, err := agnostic.RepoGetRecord(ctx, &xrpcc, "", aturi.Collection().String(), ident.DID.String(), aturi.RecordKey().String()) if err != nil { return nil, err } diff --git a/cmd/goat/record.go b/cmd/goat/record.go index 013913aa2..0caee8552 100644 --- a/cmd/goat/record.go +++ b/cmd/goat/record.go @@ -6,6 +6,7 @@ import ( "fmt" "os" + "github.com/bluesky-social/indigo/api/agnostic" comatproto "github.com/bluesky-social/indigo/api/atproto" "github.com/bluesky-social/indigo/atproto/data" "github.com/bluesky-social/indigo/atproto/identity" @@ -179,7 +180,7 @@ func runRecordList(cctx *cli.Context) error { cursor := "" for { // collection string, cursor string, limit int64, repo string, reverse bool, rkeyEnd string, rkeyStart string - resp, err := RepoListRecords(ctx, &xrpcc, nsid, cursor, 100, ident.DID.String(), false, "", "") + resp, err := agnostic.RepoListRecords(ctx, &xrpcc, nsid, cursor, 100, ident.DID.String(), false, "", "") if err != nil { return err } @@ -246,7 +247,7 @@ func runRecordCreate(cctx *cli.Context) error { } validate := !cctx.Bool("no-validate") - resp, err := RepoCreateRecord(ctx, xrpcc, &RepoCreateRecord_Input{ + resp, err := agnostic.RepoCreateRecord(ctx, xrpcc, &agnostic.RepoCreateRecord_Input{ Collection: nsid, Repo: xrpcc.Auth.Did, Record: recordVal, @@ -293,7 +294,7 @@ func runRecordUpdate(cctx *cli.Context) error { rkey := cctx.String("rkey") // NOTE: need to fetch existing record CID to perform swap. this is optional in theory, but golang can't deal with "optional" and "nullable", so we always need to set this (?) - existing, err := RepoGetRecord(ctx, xrpcc, "", nsid, xrpcc.Auth.Did, rkey) + existing, err := agnostic.RepoGetRecord(ctx, xrpcc, "", nsid, xrpcc.Auth.Did, rkey) if err != nil { return err } @@ -305,7 +306,7 @@ func runRecordUpdate(cctx *cli.Context) error { validate := !cctx.Bool("no-validate") - resp, err := RepoPutRecord(ctx, xrpcc, &RepoPutRecord_Input{ + resp, err := agnostic.RepoPutRecord(ctx, xrpcc, &agnostic.RepoPutRecord_Input{ Collection: nsid, Repo: xrpcc.Auth.Did, Record: recordVal, diff --git a/cmd/goat/repogetRecord.go b/cmd/goat/repogetRecord.go deleted file mode 100644 index e3b857cd5..000000000 --- a/cmd/goat/repogetRecord.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copied from indigo:api/atproto/repolistRecords.go - -package main - -// schema: com.atproto.repo.getRecord - -import ( - "context" - "encoding/json" - - "github.com/bluesky-social/indigo/xrpc" -) - -// RepoGetRecord_Output is the output of a com.atproto.repo.getRecord call. -type RepoGetRecord_Output struct { - Cid *string `json:"cid,omitempty" cborgen:"cid,omitempty"` - Uri string `json:"uri" cborgen:"uri"` - // NOTE: changed from lex decoder to json.RawMessage - Value *json.RawMessage `json:"value" cborgen:"value"` -} - -// RepoGetRecord calls the XRPC method "com.atproto.repo.getRecord". -// -// cid: The CID of the version of the record. 
If not specified, then return the most recent version. -// collection: The NSID of the record collection. -// repo: The handle or DID of the repo. -// rkey: The Record Key. -func RepoGetRecord(ctx context.Context, c *xrpc.Client, cid string, collection string, repo string, rkey string) (*RepoGetRecord_Output, error) { - var out RepoGetRecord_Output - - params := map[string]interface{}{ - "cid": cid, - "collection": collection, - "repo": repo, - "rkey": rkey, - } - if err := c.Do(ctx, xrpc.Query, "", "com.atproto.repo.getRecord", params, nil, &out); err != nil { - return nil, err - } - - return &out, nil -} diff --git a/cmd/goat/repolistRecords.go b/cmd/goat/repolistRecords.go deleted file mode 100644 index 3cfdd9b24..000000000 --- a/cmd/goat/repolistRecords.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copied from indigo:api/atproto/repolistRecords.go - -package main - -// schema: com.atproto.repo.listRecords - -import ( - "context" - "encoding/json" - - "github.com/bluesky-social/indigo/xrpc" -) - -// RepoListRecords_Output is the output of a com.atproto.repo.listRecords call. -type RepoListRecords_Output struct { - Cursor *string `json:"cursor,omitempty" cborgen:"cursor,omitempty"` - Records []*RepoListRecords_Record `json:"records" cborgen:"records"` -} - -// RepoListRecords_Record is a "record" in the com.atproto.repo.listRecords schema. -type RepoListRecords_Record struct { - Cid string `json:"cid" cborgen:"cid"` - Uri string `json:"uri" cborgen:"uri"` - // NOTE: changed from lex decoder to json.RawMessage - Value *json.RawMessage `json:"value" cborgen:"value"` -} - -// RepoListRecords calls the XRPC method "com.atproto.repo.listRecords". -// -// collection: The NSID of the record type. -// limit: The number of records to return. -// repo: The handle or DID of the repo. -// reverse: Flag to reverse the order of the returned records. 
-// rkeyEnd: DEPRECATED: The highest sort-ordered rkey to stop at (exclusive) -// rkeyStart: DEPRECATED: The lowest sort-ordered rkey to start from (exclusive) -func RepoListRecords(ctx context.Context, c *xrpc.Client, collection string, cursor string, limit int64, repo string, reverse bool, rkeyEnd string, rkeyStart string) (*RepoListRecords_Output, error) { - var out RepoListRecords_Output - - params := map[string]interface{}{ - "collection": collection, - "cursor": cursor, - "limit": limit, - "repo": repo, - "reverse": reverse, - "rkeyEnd": rkeyEnd, - "rkeyStart": rkeyStart, - } - if err := c.Do(ctx, xrpc.Query, "", "com.atproto.repo.listRecords", params, nil, &out); err != nil { - return nil, err - } - - return &out, nil -} diff --git a/cmd/gosky/car.go b/cmd/gosky/car.go index 54eb7f50c..288a4640f 100644 --- a/cmd/gosky/car.go +++ b/cmd/gosky/car.go @@ -64,7 +64,7 @@ var carUnpackCmd = &cli.Command{ if topDir == "" { topDir = did.String() } - log.Infof("writing output to: %s", topDir) + log.Info("writing output", "topDir", topDir) commitPath := topDir + "/_commit" os.MkdirAll(filepath.Dir(commitPath), os.ModePerm) @@ -90,7 +90,7 @@ var carUnpackCmd = &cli.Command{ if err != nil { return err } - log.Debugf("processing record: %s", k) + log.Debug("processing record", "rec", k) // TODO: check if path is safe more carefully recPath := topDir + "/" + k diff --git a/cmd/gosky/debug.go b/cmd/gosky/debug.go index 75d231cfc..2037c328c 100644 --- a/cmd/gosky/debug.go +++ b/cmd/gosky/debug.go @@ -106,7 +106,7 @@ var inspectEventCmd = &cli.Command{ } seqScheduler := sequential.NewScheduler("debug-inspect-event", rsc.EventHandler) - err = events.HandleRepoStream(ctx, con, seqScheduler) + err = events.HandleRepoStream(ctx, con, seqScheduler, nil) if err != errFoundIt { return err } @@ -284,7 +284,7 @@ var debugStreamCmd = &cli.Command{ }, } seqScheduler := sequential.NewScheduler("debug-stream", rsc.EventHandler) - err = events.HandleRepoStream(ctx, con, seqScheduler) + err = events.HandleRepoStream(ctx, con, seqScheduler, nil) if err != nil { return err } @@ -390,7 +390,8 @@ var compareStreamsCmd = &cli.Command{ go func(i int, url string) { con, _, err := d.Dial(url, http.Header{}) if err != nil { - log.Fatalf("Dial failure on url%d: %s", i+1, err) + log.Error("Dial failure", "i", i, "url", url, "err", err) + os.Exit(1) } ctx := context.TODO() @@ -405,8 +406,9 @@ var compareStreamsCmd = &cli.Command{ }, } seqScheduler := sequential.NewScheduler(fmt.Sprintf("debug-stream-%d", i+1), rsc.EventHandler) - if err := events.HandleRepoStream(ctx, con, seqScheduler); err != nil { - log.Fatalf("HandleRepoStream failure on url%d: %s", i+1, err) + if err := events.HandleRepoStream(ctx, con, seqScheduler, nil); err != nil { + log.Error("HandleRepoStream failure", "i", i, "url", url, "err", err) + os.Exit(1) } }(i, url) } @@ -876,13 +878,15 @@ var debugCompareReposCmd = &cli.Command{ logger := log.With("host", cctx.String("host-1")) repo1bytes, err := comatproto.SyncGetRepo(ctx, &xrpc1, did.String(), "") if err != nil { - logger.Fatalf("getting repo: %s", err) + logger.Error("getting repo", "err", err) + os.Exit(1) return } rep1, err = repo.ReadRepoFromCar(ctx, bytes.NewReader(repo1bytes)) if err != nil { - logger.Fatalf("reading repo: %s", err) + logger.Error("reading repo", "err", err) + os.Exit(1) return } }() @@ -893,13 +897,15 @@ var debugCompareReposCmd = &cli.Command{ logger := log.With("host", cctx.String("host-2")) repo2bytes, err := comatproto.SyncGetRepo(ctx, &xrpc2, did.String(), "") if err != nil { 
- logger.Fatalf("getting repo: %s", err) + logger.Error("getting repo", "err", err) + os.Exit(1) return } rep2, err = repo.ReadRepoFromCar(ctx, bytes.NewReader(repo2bytes)) if err != nil { - logger.Fatalf("reading repo: %s", err) + logger.Error("reading repo", "err", err) + os.Exit(1) return } }() diff --git a/cmd/gosky/main.go b/cmd/gosky/main.go index 6a3cfabd4..f7ff87209 100644 --- a/cmd/gosky/main.go +++ b/cmd/gosky/main.go @@ -7,6 +7,7 @@ import ( "encoding/json" "fmt" "io" + "log/slog" "net/http" "os" "os/signal" @@ -18,6 +19,7 @@ import ( "github.com/bluesky-social/indigo/api/atproto" comatproto "github.com/bluesky-social/indigo/api/atproto" "github.com/bluesky-social/indigo/api/bsky" + "github.com/bluesky-social/indigo/atproto/identity" "github.com/bluesky-social/indigo/atproto/syntax" "github.com/bluesky-social/indigo/events" "github.com/bluesky-social/indigo/events/schedulers/sequential" @@ -38,14 +40,13 @@ import ( _ "github.com/joho/godotenv/autoload" "github.com/carlmjohnson/versioninfo" - logging "github.com/ipfs/go-log" "github.com/polydawn/refmt/cbor" rejson "github.com/polydawn/refmt/json" "github.com/polydawn/refmt/shared" cli "github.com/urfave/cli/v2" ) -var log = logging.Logger("gosky") +var log = slog.Default().With("system", "gosky") func main() { run(os.Args) @@ -79,6 +80,14 @@ func run(args []string) { EnvVars: []string{"ATP_PLC_HOST"}, }, } + + _, err := cliutil.SetupSlog(cliutil.LogOptions{}) + if err != nil { + fmt.Fprintf(os.Stderr, "logging setup error: %s\n", err.Error()) + os.Exit(1) + return + } + app.Commands = []*cli.Command{ accountCmd, adminCmd, @@ -338,7 +347,7 @@ var readRepoStreamCmd = &cli.Command{ }, } seqScheduler := sequential.NewScheduler(con.RemoteAddr().String(), rsc.EventHandler) - return events.HandleRepoStream(ctx, con, seqScheduler) + return events.HandleRepoStream(ctx, con, seqScheduler, log) }, } @@ -462,6 +471,18 @@ var getRecordCmd = &cli.Command{ return fmt.Errorf("unrecognized link") } + atid, err := syntax.ParseAtIdentifier(did) + if err != nil { + return err + } + + resp, err := identity.DefaultDirectory().Lookup(ctx, *atid) + if err != nil { + return err + } + + xrpcc.Host = resp.PDSEndpoint() + out, err := comatproto.RepoGetRecord(ctx, xrpcc, "", collection, did, rkey) if err != nil { return err @@ -490,7 +511,7 @@ var getRecordCmd = &cli.Command{ rc, rec, err := rr.GetRecord(ctx, cctx.Args().First()) if err != nil { - return err + return fmt.Errorf("get record failed: %w", err) } if cctx.Bool("raw") { diff --git a/cmd/gosky/streamdiff.go b/cmd/gosky/streamdiff.go index 2f8d8b0c2..230bfc1da 100644 --- a/cmd/gosky/streamdiff.go +++ b/cmd/gosky/streamdiff.go @@ -58,9 +58,9 @@ var streamCompareCmd = &cli.Command{ }, } seqScheduler := sequential.NewScheduler("streamA", rsc.EventHandler) - err = events.HandleRepoStream(ctx, cona, seqScheduler) + err = events.HandleRepoStream(ctx, cona, seqScheduler, log) if err != nil { - log.Errorf("stream A failed: %s", err) + log.Error("stream A failed", "err", err) } }() @@ -82,9 +82,9 @@ var streamCompareCmd = &cli.Command{ } seqScheduler := sequential.NewScheduler("streamB", rsc.EventHandler) - err = events.HandleRepoStream(ctx, conb, seqScheduler) + err = events.HandleRepoStream(ctx, conb, seqScheduler, log) if err != nil { - log.Errorf("stream B failed: %s", err) + log.Error("stream B failed", "err", err) } }() diff --git a/cmd/gosky/sync.go b/cmd/gosky/sync.go index 9e88466b0..357131a10 100644 --- a/cmd/gosky/sync.go +++ b/cmd/gosky/sync.go @@ -66,7 +66,7 @@ var syncGetRepoCmd = &cli.Command{ 
xrpcc.Host = h } - log.Infof("downloading from %s to: %s", xrpcc.Host, carPath) + log.Info("downloading", "from", xrpcc.Host, "to", carPath) repoBytes, err := comatproto.SyncGetRepo(ctx, xrpcc, ident.DID.String(), "") if err != nil { return err diff --git a/cmd/hepa/main.go b/cmd/hepa/main.go index 7883d9c29..9e3029b6c 100644 --- a/cmd/hepa/main.go +++ b/cmd/hepa/main.go @@ -149,6 +149,30 @@ func run(args []string) error { Usage: "secret token for prescreen server", EnvVars: []string{"HEPA_PRESCREEN_TOKEN"}, }, + &cli.DurationFlag{ + Name: "report-dupe-period", + Usage: "time period within which automod will not re-report an account for the same reasonType", + EnvVars: []string{"HEPA_REPORT_DUPE_PERIOD"}, + Value: 1 * 24 * time.Hour, + }, + &cli.IntFlag{ + Name: "quota-mod-report-day", + Usage: "number of reports automod can file per day, for all subjects and types combined (circuit breaker)", + EnvVars: []string{"HEPA_QUOTA_MOD_REPORT_DAY"}, + Value: 10000, + }, + &cli.IntFlag{ + Name: "quota-mod-takedown-day", + Usage: "number of takedowns automod can action per day, for all subjects combined (circuit breaker)", + EnvVars: []string{"HEPA_QUOTA_MOD_TAKEDOWN_DAY"}, + Value: 200, + }, + &cli.IntFlag{ + Name: "quota-mod-action-day", + Usage: "number of misc actions automod can do per day, for all subjects combined (circuit breaker)", + EnvVars: []string{"HEPA_QUOTA_MOD_ACTION_DAY"}, + Value: 2000, + }, } app.Commands = []*cli.Command{ @@ -255,6 +279,10 @@ var runCmd = &cli.Command{ FirehoseParallelism: cctx.Int("firehose-parallelism"), // DEPRECATED PreScreenHost: cctx.String("prescreen-host"), PreScreenToken: cctx.String("prescreen-token"), + ReportDupePeriod: cctx.Duration("report-dupe-period"), + QuotaModReportDay: cctx.Int("quota-mod-report-day"), + QuotaModTakedownDay: cctx.Int("quota-mod-takedown-day"), + QuotaModActionDay: cctx.Int("quota-mod-action-day"), }, ) if err != nil { diff --git a/cmd/hepa/server.go b/cmd/hepa/server.go index 9fe08f98e..a4ca252d1 100644 --- a/cmd/hepa/server.go +++ b/cmd/hepa/server.go @@ -14,6 +14,7 @@ import ( "github.com/bluesky-social/indigo/automod" "github.com/bluesky-social/indigo/automod/cachestore" "github.com/bluesky-social/indigo/automod/countstore" + "github.com/bluesky-social/indigo/automod/engine" "github.com/bluesky-social/indigo/automod/flagstore" "github.com/bluesky-social/indigo/automod/rules" "github.com/bluesky-social/indigo/automod/setstore" @@ -54,6 +55,10 @@ type Config struct { FirehoseParallelism int // DEPRECATED PreScreenHost string PreScreenToken string + ReportDupePeriod time.Duration + QuotaModReportDay int + QuotaModTakedownDay int + QuotaModActionDay int } func NewServer(dir identity.Directory, config Config) (*Server, error) { @@ -219,6 +224,12 @@ func NewServer(dir identity.Directory, config Config) (*Server, error) { OzoneClient: ozoneClient, AdminClient: adminClient, BlobClient: blobClient, + Config: engine.EngineConfig{ + ReportDupePeriod: config.ReportDupePeriod, + QuotaModReportDay: config.QuotaModReportDay, + QuotaModTakedownDay: config.QuotaModTakedownDay, + QuotaModActionDay: config.QuotaModActionDay, + }, } s := &Server{ diff --git a/cmd/laputa/main.go b/cmd/laputa/main.go index 2cedb393a..54da0a429 100644 --- a/cmd/laputa/main.go +++ b/cmd/laputa/main.go @@ -14,7 +14,6 @@ import ( _ "go.uber.org/automaxprocs" "github.com/carlmjohnson/versioninfo" - logging "github.com/ipfs/go-log" "github.com/urfave/cli/v2" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -25,8 +24,6 @@ import ( 
"gorm.io/plugin/opentelemetry/tracing" ) -var log = logging.Logger("laputa") - func main() { run(os.Args) } @@ -158,7 +155,7 @@ func run(args []string) { } } - cstore, err := carstore.NewCarStore(csdb, csdir) + cstore, err := carstore.NewCarStore(csdb, []string{csdir}) if err != nil { return err } diff --git a/cmd/netsync/main.go b/cmd/netsync/main.go index 24cd445ae..98d23a626 100644 --- a/cmd/netsync/main.go +++ b/cmd/netsync/main.go @@ -345,8 +345,8 @@ func Netsync(cctx *cli.Context) error { Handler: mux, } + state.wg.Add(1) go func() { - state.wg.Add(1) defer state.wg.Done() if err := metricsServer.ListenAndServe(); err != http.ErrServerClosed { logger.Error("failed to start metrics server", "err", err) @@ -368,8 +368,8 @@ func Netsync(cctx *cli.Context) error { } // Check for empty queue + state.wg.Add(1) go func() { - state.wg.Add(1) defer state.wg.Done() t := time.NewTicker(30 * time.Second) for { diff --git a/cmd/rainbow/Dockerfile b/cmd/rainbow/Dockerfile index 72bfc3572..b94714338 100644 --- a/cmd/rainbow/Dockerfile +++ b/cmd/rainbow/Dockerfile @@ -39,5 +39,5 @@ ENTRYPOINT ["/usr/bin/dumb-init", "--"] CMD ["/usr/bin/rainbow"] LABEL org.opencontainers.image.source=https://github.com/bluesky-social/indigo -LABEL org.opencontainers.image.description="bsky.app rainbow" +LABEL org.opencontainers.image.description="rainbow atproto firehose fanout service" LABEL org.opencontainers.image.licenses=MIT diff --git a/cmd/rainbow/README.md b/cmd/rainbow/README.md new file mode 100644 index 000000000..7fe15b656 --- /dev/null +++ b/cmd/rainbow/README.md @@ -0,0 +1,32 @@ + +`rainbow`: atproto Firehose Fanout Service +========================================== + +This is an atproto service which consumes from a firehose (eg, from a relay or PDS) and fans out events to many subscribers. + +Features and design points: + +- retains "backfill window" on local disk (using [pebble](https://github.com/cockroachdb/pebble)) +- serves the `com.atproto.sync.subscribeRepos` endpoint (WebSocket) +- retains upstream firehose "sequence numbers" +- does not validate events (signatures, repo tree, hashes, etc), just passes through +- does not archive or mirror individual records or entire repositories (or implement related API endpoints) +- disk I/O intensive: fast NVMe disks are recommended, and RAM is helpful for caching +- single golang binary for easy deployment +- observability: logging, prometheus metrics, OTEL traces + +## Running + +This is a simple, single-binary Go program. You can also build and run it as a docker container (see `./Dockerfile`). 
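For reference, the `go` tool expects flags such as `-o` to appear before the package path, so a build invocation that names the output binary would look like:

```shell
go build -o rainbow-bin ./cmd/rainbow
```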
+ +From the top level of this repo, you can build: + +```shell +go build ./cmd/rainbow -o rainbow-bin +``` + +or just run it, and see configuration options: + +```shell +go run ./cmd/rainbow --help +``` diff --git a/cmd/rainbow/main.go b/cmd/rainbow/main.go index 72f611e3a..d574d6260 100644 --- a/cmd/rainbow/main.go +++ b/cmd/rainbow/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "log/slog" _ "net/http/pprof" "os" "os/signal" @@ -10,8 +11,8 @@ import ( "github.com/bluesky-social/indigo/events" "github.com/bluesky-social/indigo/splitter" + "github.com/carlmjohnson/versioninfo" - logging "github.com/ipfs/go-log" _ "github.com/joho/godotenv/autoload" "github.com/urfave/cli/v2" "go.opentelemetry.io/otel" @@ -23,11 +24,11 @@ import ( _ "go.uber.org/automaxprocs" ) -var log = logging.Logger("splitter") +var log = slog.Default().With("system", "rainbow") func init() { // control log level using, eg, GOLOG_LOG_LEVEL=debug - logging.SetAllLoggers(logging.LevelDebug) + //logging.SetAllLoggers(logging.LevelDebug) } func main() { @@ -36,50 +37,55 @@ func main() { func run(args []string) { app := cli.App{ - Name: "splitter", - Usage: "firehose proxy", + Name: "rainbow", + Usage: "atproto firehose fan-out daemon", Version: versioninfo.Short(), } app.Flags = []cli.Flag{ &cli.BoolFlag{ - Name: "crawl-insecure-ws", - Usage: "when connecting to PDS instances, use ws:// instead of wss://", + Name: "crawl-insecure-ws", + Usage: "when connecting to PDS instances, use ws:// instead of wss://", + EnvVars: []string{"RAINBOW_INSECURE_CRAWL"}, }, &cli.StringFlag{ - Name: "splitter-host", - Value: "bsky.network", + Name: "splitter-host", + Value: "bsky.network", + EnvVars: []string{"ATP_RELAY_HOST", "RAINBOW_RELAY_HOST"}, }, &cli.StringFlag{ - Name: "persist-db", - Value: "", - Usage: "path to persistence db", + Name: "persist-db", + Value: "./rainbow.db", + Usage: "path to persistence db", + EnvVars: []string{"RAINBOW_DB_PATH"}, }, &cli.StringFlag{ - Name: "cursor-file", - Value: "", - Usage: "write upstream cursor number to this file", + Name: "cursor-file", + Value: "./rainbow-cursor", + Usage: "write upstream cursor number to this file", + EnvVars: []string{"RAINBOW_CURSOR_PATH"}, }, &cli.StringFlag{ - Name: "api-listen", - Value: ":2480", + Name: "api-listen", + Value: ":2480", + EnvVars: []string{"RAINBOW_API_LISTEN"}, }, &cli.StringFlag{ Name: "metrics-listen", Value: ":2481", - EnvVars: []string{"SPLITTER_METRICS_LISTEN"}, + EnvVars: []string{"RAINBOW_METRICS_LISTEN", "SPLITTER_METRICS_LISTEN"}, }, &cli.Float64Flag{ Name: "persist-hours", - Value: 24 * 7, - EnvVars: []string{"SPLITTER_PERSIST_HOURS"}, + Value: 24 * 3, + EnvVars: []string{"RAINBOW_PERSIST_HOURS", "SPLITTER_PERSIST_HOURS"}, Usage: "hours to buffer (float, may be fractional)", }, &cli.Int64Flag{ Name: "persist-bytes", Value: 0, Usage: "max bytes target for event cache, 0 to disable size target trimming", - EnvVars: []string{"SPLITTER_PERSIST_BYTES"}, + EnvVars: []string{"RAINBOW_PERSIST_BYTES", "SPLITTER_PERSIST_BYTES"}, }, &cli.StringSliceFlag{ Name: "next-crawler", @@ -88,10 +94,13 @@ func run(args []string) { }, } + // TODO: slog.SetDefault and set module `var log *slog.Logger` based on flags and env + app.Action = Splitter err := app.Run(os.Args) if err != nil { - log.Fatal(err) + log.Error(err.Error()) + os.Exit(1) } } @@ -106,19 +115,20 @@ func Splitter(cctx *cli.Context) error { // At a minimum, you need to set // OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 if ep := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"); ep != "" { - 
log.Infow("setting up trace exporter", "endpoint", ep) + log.Info("setting up trace exporter", "endpoint", ep) ctx, cancel := context.WithCancel(context.Background()) defer cancel() exp, err := otlptracehttp.New(ctx) if err != nil { - log.Fatalw("failed to create trace exporter", "error", err) + log.Error("failed to create trace exporter", "error", err) + os.Exit(1) } defer func() { ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() if err := exp.Shutdown(ctx); err != nil { - log.Errorw("failed to shutdown trace exporter", "error", err) + log.Error("failed to shutdown trace exporter", "error", err) } }() @@ -142,7 +152,7 @@ func Splitter(cctx *cli.Context) error { var spl *splitter.Splitter var err error if persistPath != "" { - log.Infof("building splitter with storage at: %s", persistPath) + log.Info("building splitter with storage at", "path", persistPath) ppopts := events.PebblePersistOptions{ DbPath: persistPath, PersistDuration: time.Duration(float64(time.Hour) * cctx.Float64("persist-hours")), @@ -164,14 +174,16 @@ func Splitter(cctx *cli.Context) error { spl, err = splitter.NewSplitter(conf, nextCrawlers) } if err != nil { - log.Fatalw("failed to create splitter", "path", persistPath, "error", err) + log.Error("failed to create splitter", "path", persistPath, "error", err) + os.Exit(1) return err } // set up metrics endpoint go func() { if err := spl.StartMetrics(cctx.String("metrics-listen")); err != nil { - log.Fatalf("failed to start metrics endpoint: %s", err) + log.Error("failed to start metrics endpoint", "err", err) + os.Exit(1) } }() @@ -182,20 +194,20 @@ func Splitter(cctx *cli.Context) error { runErr <- err }() - log.Infow("startup complete") + log.Info("startup complete") select { case <-signals: log.Info("received shutdown signal") if err := spl.Shutdown(); err != nil { - log.Errorw("error during Splitter shutdown", "err", err) + log.Error("error during Splitter shutdown", "err", err) } case err := <-runErr: if err != nil { - log.Errorw("error during Splitter startup", "err", err) + log.Error("error during Splitter startup", "err", err) } log.Info("shutting down") if err := spl.Shutdown(); err != nil { - log.Errorw("error during Splitter shutdown", "err", err) + log.Error("error during Splitter shutdown", "err", err) } } diff --git a/cmd/sonar/main.go b/cmd/sonar/main.go index a867646fe..c5ece10ec 100644 --- a/cmd/sonar/main.go +++ b/cmd/sonar/main.go @@ -104,8 +104,8 @@ func Sonar(cctx *cli.Context) error { pool := sequential.NewScheduler(u.Host, s.HandleStreamEvent) // Start a goroutine to manage the cursor file, saving the current cursor every 5 seconds. 
+ wg.Add(1) go func() { - wg.Add(1) defer wg.Done() ticker := time.NewTicker(5 * time.Second) logger := logger.With("source", "cursor_file_manager") @@ -130,8 +130,8 @@ func Sonar(cctx *cli.Context) error { }() // Start a goroutine to manage the liveness checker, shutting down if no events are received for 15 seconds + wg.Add(1) go func() { - wg.Add(1) defer wg.Done() ticker := time.NewTicker(15 * time.Second) lastSeq := int64(0) @@ -167,8 +167,8 @@ func Sonar(cctx *cli.Context) error { } // Startup metrics server + wg.Add(1) go func() { - wg.Add(1) defer wg.Done() logger = logger.With("source", "metrics_server") @@ -194,10 +194,10 @@ func Sonar(cctx *cli.Context) error { } defer c.Close() + wg.Add(1) go func() { - wg.Add(1) defer wg.Done() - err = events.HandleRepoStream(ctx, c, pool) + err = events.HandleRepoStream(ctx, c, pool, logger) logger.Info("HandleRepoStream returned unexpectedly", "err", err) cancel() }() diff --git a/cmd/stress/main.go b/cmd/stress/main.go index 0aa103e43..09288a0f8 100644 --- a/cmd/stress/main.go +++ b/cmd/stress/main.go @@ -26,13 +26,10 @@ import ( _ "github.com/joho/godotenv/autoload" "github.com/carlmjohnson/versioninfo" - logging "github.com/ipfs/go-log" "github.com/ipld/go-car" cli "github.com/urfave/cli/v2" ) -var log = logging.Logger("stress") - func main() { run(os.Args) } diff --git a/cmd/supercollider/main.go b/cmd/supercollider/main.go index 49499ef69..1c64e71d3 100644 --- a/cmd/supercollider/main.go +++ b/cmd/supercollider/main.go @@ -565,7 +565,7 @@ func initSpeedyRepoMan(key *godid.PrivKey) (*repomgr.RepoManager, *godid.PrivKey return nil, nil, err } - cs, err := carstore.NewCarStore(cardb, cspath) + cs, err := carstore.NewCarStore(cardb, []string{cspath}) if err != nil { return nil, nil, err } diff --git a/did/metrics.go b/did/metrics.go index e7bea3b56..d3c8f0ecd 100644 --- a/did/metrics.go +++ b/did/metrics.go @@ -9,3 +9,9 @@ var mrResolvedDidsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "multiresolver_resolved_dids_total", Help: "Total number of DIDs resolved", }, []string{"resolver"}) + +var mrResolveDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "indigo_multiresolver_resolve_duration_seconds", + Help: "A histogram of resolve latencies", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), +}, []string{"resolver"}) diff --git a/did/multi.go b/did/multi.go index e871ff454..2c67781fe 100644 --- a/did/multi.go +++ b/did/multi.go @@ -3,6 +3,7 @@ package did import ( "context" "fmt" + "time" "github.com/whyrusleeping/go-did" ) @@ -43,12 +44,17 @@ func (mr *MultiResolver) FlushCacheFor(didstr string) { } func (mr *MultiResolver) GetDocument(ctx context.Context, didstr string) (*did.Document, error) { + s := time.Now() + pdid, err := did.ParseDID(didstr) if err != nil { return nil, err } method := pdid.Protocol() + defer func() { + mrResolveDuration.WithLabelValues(method).Observe(time.Since(s).Seconds()) + }() res, ok := mr.handlers[method] if !ok { diff --git a/events/consumer.go b/events/consumer.go index 5d373f30e..f00cd79c7 100644 --- a/events/consumer.go +++ b/events/consumer.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io" + "log/slog" "net" "time" @@ -108,7 +109,14 @@ func (sr *instrumentedReader) Read(p []byte) (int, error) { return n, err } -func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) error { +// HandleRepoStream +// con is source of events +// sched gets AddWork for each event +// log may be nil for default logger +func HandleRepoStream(ctx context.Context, 
con *websocket.Conn, sched Scheduler, log *slog.Logger) error { + if log == nil { + log = slog.Default().With("system", "events") + } ctx, cancel := context.WithCancel(ctx) defer cancel() defer sched.Shutdown() @@ -124,7 +132,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) select { case <-t.C: if err := con.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(time.Second*10)); err != nil { - log.Warnf("failed to ping: %s", err) + log.Warn("failed to ping", "err", err) } case <-ctx.Done(): con.Close() @@ -145,7 +153,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) con.SetPongHandler(func(_ string) error { if err := con.SetReadDeadline(time.Now().Add(time.Minute)); err != nil { - log.Errorf("failed to set read deadline: %s", err) + log.Error("failed to set read deadline", "err", err) } return nil @@ -194,7 +202,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) } if evt.Seq < lastSeq { - log.Errorf("Got events out of order from stream (seq = %d, prev = %d)", evt.Seq, lastSeq) + log.Error("Got events out of order from stream", "seq", evt.Seq, "prev", lastSeq) } lastSeq = evt.Seq @@ -211,7 +219,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) } if evt.Seq < lastSeq { - log.Errorf("Got events out of order from stream (seq = %d, prev = %d)", evt.Seq, lastSeq) + log.Error("Got events out of order from stream", "seq", evt.Seq, "prev", lastSeq) } lastSeq = evt.Seq @@ -227,7 +235,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) } if evt.Seq < lastSeq { - log.Errorf("Got events out of order from stream (seq = %d, prev = %d)", evt.Seq, lastSeq) + log.Error("Got events out of order from stream", "seq", evt.Seq, "prev", lastSeq) } lastSeq = evt.Seq @@ -243,7 +251,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) } if evt.Seq < lastSeq { - log.Errorf("Got events out of order from stream (seq = %d, prev = %d)", evt.Seq, lastSeq) + log.Error("Got events out of order from stream", "seq", evt.Seq, "prev", lastSeq) } lastSeq = evt.Seq @@ -271,7 +279,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) } if evt.Seq < lastSeq { - log.Errorf("Got events out of order from stream (seq = %d, prev = %d)", evt.Seq, lastSeq) + log.Error("Got events out of order from stream", "seq", evt.Seq, "prev", lastSeq) } lastSeq = evt.Seq @@ -287,7 +295,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) } if evt.Seq < lastSeq { - log.Errorf("Got events out of order from stream (seq = %d, prev = %d)", evt.Seq, lastSeq) + log.Error("Got events out of order from stream", "seq", evt.Seq, "prev", lastSeq) } lastSeq = evt.Seq @@ -303,7 +311,7 @@ func HandleRepoStream(ctx context.Context, con *websocket.Conn, sched Scheduler) } if evt.Seq < lastSeq { - log.Errorf("Got events out of order from stream (seq = %d, prev = %d)", evt.Seq, lastSeq) + log.Error("Got events out of order from stream", "seq", evt.Seq, "prev", lastSeq) } lastSeq = evt.Seq diff --git a/events/dbpersist.go b/events/dbpersist.go index a9d6288f2..ab3b1e466 100644 --- a/events/dbpersist.go +++ b/events/dbpersist.go @@ -131,7 +131,7 @@ func (p *DbPersistence) batchFlusher() { if needsFlush { if err := p.Flush(context.Background()); err != nil { - log.Errorf("failed to flush batch: %s", err) + log.Error("failed to flush batch", "err", err) } } } @@ -323,7 +323,7 @@ func (p 
*DbPersistence) RecordFromTombstone(ctx context.Context, evt *comatproto func (p *DbPersistence) RecordFromRepoCommit(ctx context.Context, evt *comatproto.SyncSubscribeRepos_Commit) (*RepoEventRecord, error) { // TODO: hack hack hack if len(evt.Ops) > 8192 { - log.Errorf("(VERY BAD) truncating ops field in outgoing event (len = %d)", len(evt.Ops)) + log.Error("(VERY BAD) truncating ops field in outgoing event", "len", len(evt.Ops)) evt.Ops = evt.Ops[:8192] } diff --git a/events/dbpersist_test.go b/events/dbpersist_test.go index 7feced26d..c6c2161af 100644 --- a/events/dbpersist_test.go +++ b/events/dbpersist_test.go @@ -16,15 +16,10 @@ import ( pds "github.com/bluesky-social/indigo/pds/data" "github.com/bluesky-social/indigo/repomgr" "github.com/bluesky-social/indigo/util" - logging "github.com/ipfs/go-log/v2" "gorm.io/driver/sqlite" "gorm.io/gorm" ) -func init() { - logging.SetAllLoggers(logging.LevelDebug) -} - func BenchmarkDBPersist(b *testing.B) { ctx := context.Background() @@ -300,7 +295,7 @@ func setupDBs(t testing.TB) (*gorm.DB, *gorm.DB, carstore.CarStore, string, erro return nil, nil, nil, "", err } - cs, err := carstore.NewCarStore(cardb, cspath) + cs, err := carstore.NewCarStore(cardb, []string{cspath}) if err != nil { return nil, nil, nil, "", err } diff --git a/events/diskpersist.go b/events/diskpersist.go index 25eb989af..43778fe45 100644 --- a/events/diskpersist.go +++ b/events/diskpersist.go @@ -312,7 +312,7 @@ func (dp *DiskPersistence) flushRoutine() { dp.lk.Lock() if err := dp.flushLog(ctx); err != nil { // TODO: this happening is quite bad. Need a recovery strategy - log.Errorf("failed to flush disk log: %s", err) + log.Error("failed to flush disk log", "err", err) } dp.lk.Unlock() } @@ -354,7 +354,7 @@ func (dp *DiskPersistence) garbageCollectRoutine() { case <-t.C: if errs := dp.garbageCollect(ctx); len(errs) > 0 { for _, err := range errs { - log.Errorf("garbage collection error: %s", err) + log.Error("garbage collection error", "err", err) } } } @@ -430,7 +430,7 @@ func (dp *DiskPersistence) garbageCollect(ctx context.Context) []error { refsGarbageCollected.WithLabelValues().Add(float64(refsDeleted)) filesGarbageCollected.WithLabelValues().Add(float64(filesDeleted)) - log.Infow("garbage collection complete", + log.Info("garbage collection complete", "filesDeleted", filesDeleted, "refsDeleted", refsDeleted, "oldRefsFound", oldRefsFound, @@ -696,7 +696,7 @@ func (dp *DiskPersistence) readEventsFrom(ctx context.Context, since int64, fn s return nil, err } if since > lastSeq { - log.Errorw("playback cursor is greater than last seq of file checked", + log.Error("playback cursor is greater than last seq of file checked", "since", since, "lastSeq", lastSeq, "filename", fn, @@ -778,7 +778,7 @@ func (dp *DiskPersistence) readEventsFrom(ctx context.Context, since int64, fn s return nil, err } default: - log.Warnw("unrecognized event kind coming from log file", "seq", h.Seq, "kind", h.Kind) + log.Warn("unrecognized event kind coming from log file", "seq", h.Seq, "kind", h.Kind) return nil, fmt.Errorf("halting on unrecognized event kind") } } diff --git a/events/events.go b/events/events.go index 168d6d6dd..5619fc20e 100644 --- a/events/events.go +++ b/events/events.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "io" + "log/slog" "sync" "time" @@ -14,12 +15,11 @@ import ( "github.com/bluesky-social/indigo/models" "github.com/prometheus/client_golang/prometheus" - logging "github.com/ipfs/go-log" cbg "github.com/whyrusleeping/cbor-gen" "go.opentelemetry.io/otel" ) -var log = 
logging.Logger("events") +var log = slog.Default().With("system", "events") type Scheduler interface { AddWork(ctx context.Context, repo string, val *XRPCStreamEvent) error @@ -34,6 +34,8 @@ type EventManager struct { crossoverBufferSize int persister EventPersistence + + log *slog.Logger } func NewEventManager(persister EventPersistence) *EventManager { @@ -41,6 +43,7 @@ func NewEventManager(persister EventPersistence) *EventManager { bufferSize: 16 << 10, crossoverBufferSize: 512, persister: persister, + log: slog.Default().With("system", "events"), } persister.SetEventBroadcaster(em.broadcastEvent) @@ -67,7 +70,7 @@ func (em *EventManager) Shutdown(ctx context.Context) error { func (em *EventManager) broadcastEvent(evt *XRPCStreamEvent) { // the main thing we do is send it out, so MarshalCBOR once if err := evt.Preserialize(); err != nil { - log.Errorf("broadcast serialize failed, %s", err) + em.log.Error("broadcast serialize failed", "err", err) // serialize isn't going to go better later, this event is cursed return } @@ -93,7 +96,7 @@ func (em *EventManager) broadcastEvent(evt *XRPCStreamEvent) { // code s.filter = func(*XRPCStreamEvent) bool { return false } - log.Warnw("dropping slow consumer due to event overflow", "bufferSize", len(s.outgoing), "ident", s.ident) + em.log.Warn("dropping slow consumer due to event overflow", "bufferSize", len(s.outgoing), "ident", s.ident) go func(torem *Subscriber) { torem.lk.Lock() if !torem.cleanedUp { @@ -104,7 +107,7 @@ func (em *EventManager) broadcastEvent(evt *XRPCStreamEvent) { }, }: case <-time.After(time.Second * 5): - log.Warnw("failed to send error frame to backed up consumer", "ident", torem.ident) + em.log.Warn("failed to send error frame to backed up consumer", "ident", torem.ident) } } torem.lk.Unlock() @@ -121,7 +124,7 @@ func (em *EventManager) persistAndSendEvent(ctx context.Context, evt *XRPCStream // accept a uid. The lookup inside the persister is notably expensive (despite // being an lru cache?) if err := em.persister.Persist(ctx, evt); err != nil { - log.Errorf("failed to persist outbound event: %s", err) + em.log.Error("failed to persist outbound event", "err", err) } } @@ -370,9 +373,9 @@ func (em *EventManager) Subscribe(ctx context.Context, ident string, filter func } }); err != nil { if errors.Is(err, ErrPlaybackShutdown) { - log.Warnf("events playback: %s", err) + em.log.Warn("events playback", "err", err) } else { - log.Errorf("events playback: %s", err) + em.log.Error("events playback", "err", err) } // TODO: send an error frame or something? @@ -400,7 +403,7 @@ func (em *EventManager) Subscribe(ctx context.Context, ident string, filter func } }); err != nil { if !errors.Is(err, ErrCaughtUp) { - log.Errorf("events playback: %s", err) + em.log.Error("events playback", "err", err) // TODO: send an error frame or something? 
close(out) diff --git a/events/pebblepersist.go b/events/pebblepersist.go index 2c1c787e5..ff65387e3 100644 --- a/events/pebblepersist.go +++ b/events/pebblepersist.go @@ -193,7 +193,7 @@ func (pp *PebblePersist) GCThread(ctx context.Context) { case <-ticker.C: err := pp.GarbageCollect(ctx) if err != nil { - log.Errorw("GC err", "err", err) + log.Error("GC err", "err", err) } case <-ctx.Done(): return @@ -239,24 +239,24 @@ func (pp *PebblePersist) GarbageCollect(ctx context.Context) error { sizeBefore, _ := pp.db.EstimateDiskUsage(zeroKey[:], ffffKey[:]) if seq == -1 { // nothing to delete - log.Infow("pebble gc nop", "size", sizeBefore) + log.Info("pebble gc nop", "size", sizeBefore) return nil } var key [16]byte setKeySeqMillis(key[:], seq, lastKeyTime) - log.Infow("pebble gc start", "to", hex.EncodeToString(key[:])) + log.Info("pebble gc start", "to", hex.EncodeToString(key[:])) err = pp.db.DeleteRange(zeroKey[:], key[:], pebble.Sync) if err != nil { return err } sizeAfter, _ := pp.db.EstimateDiskUsage(zeroKey[:], ffffKey[:]) - log.Infow("pebble gc", "before", sizeBefore, "after", sizeAfter) + log.Info("pebble gc", "before", sizeBefore, "after", sizeAfter) start := time.Now() err = pp.db.Compact(zeroKey[:], key[:], true) if err != nil { - log.Warnw("pebble gc compact", "err", err) + log.Warn("pebble gc compact", "err", err) } dt := time.Since(start) - log.Infow("pebble gc compact ok", "dt", dt) + log.Info("pebble gc compact ok", "dt", dt) return nil } diff --git a/events/schedulers/autoscaling/autoscaling.go b/events/schedulers/autoscaling/autoscaling.go index 04f973bd3..f842b8ad0 100644 --- a/events/schedulers/autoscaling/autoscaling.go +++ b/events/schedulers/autoscaling/autoscaling.go @@ -2,17 +2,15 @@ package autoscaling import ( "context" + "log/slog" "sync" "time" "github.com/bluesky-social/indigo/events" "github.com/bluesky-social/indigo/events/schedulers" - logging "github.com/ipfs/go-log" "github.com/prometheus/client_golang/prometheus" ) -var log = logging.Logger("autoscaling-scheduler") - // Scheduler is a scheduler that will scale up and down the number of workers based on the throughput of the workers. 
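The hunks above and below replace the package-level `go-log` loggers with `log/slog` loggers carried on each component (`em.log`, `p.log`, and so on), initialized via `slog.Default().With("system", ...)`. A minimal, self-contained sketch of that pattern, with purely illustrative names:

```go
package main

import "log/slog"

// EventSink is a toy component showing the logger-on-the-struct pattern;
// the type name, field, and "system" value are illustrative only.
type EventSink struct {
	log *slog.Logger
}

func NewEventSink(logger *slog.Logger) *EventSink {
	if logger == nil {
		// Fall back to the process-wide default, tagged with the component name.
		logger = slog.Default().With("system", "event-sink")
	}
	return &EventSink{log: logger}
}

func (s *EventSink) handle(seq int64) {
	// Structured key/value pairs replace the old printf-style calls.
	s.log.Info("handled event", "seq", seq)
}

func main() {
	NewEventSink(nil).handle(42)
}
```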
type Scheduler struct { concurrency int @@ -40,6 +38,8 @@ type Scheduler struct { autoscaleFrequency time.Duration autoscalerIn chan struct{} autoscalerOut chan struct{} + + log *slog.Logger } type AutoscaleSettings struct { @@ -99,6 +99,8 @@ func NewScheduler(autoscaleSettings AutoscaleSettings, ident string, do func(con autoscaleFrequency: autoscaleSettings.AutoscaleFrequency, autoscalerIn: make(chan struct{}), autoscalerOut: make(chan struct{}), + + log: slog.Default().With("system", "autoscaling-scheduler"), } for i := 0; i < p.concurrency; i++ { @@ -111,28 +113,28 @@ func NewScheduler(autoscaleSettings AutoscaleSettings, ident string, do func(con } func (p *Scheduler) Shutdown() { - log.Debugf("shutting down autoscaling scheduler for %s", p.ident) + p.log.Debug("shutting down autoscaling scheduler", "ident", p.ident) // stop autoscaling p.autoscalerIn <- struct{}{} close(p.autoscalerIn) <-p.autoscalerOut - log.Debug("stopping autoscaling scheduler workers") + p.log.Debug("stopping autoscaling scheduler workers") // stop workers for i := 0; i < p.concurrency; i++ { p.feeder <- &consumerTask{signal: "stop"} } close(p.feeder) - log.Debug("waiting for autoscaling scheduler workers to stop") + p.log.Debug("waiting for autoscaling scheduler workers to stop") p.workerGroup.Wait() - log.Debug("stopping autoscaling scheduler throughput manager") + p.log.Debug("stopping autoscaling scheduler throughput manager") p.throughputManager.Stop() - log.Debug("autoscaling scheduler shutdown complete") + p.log.Debug("autoscaling scheduler shutdown complete") } // Add autoscaling function @@ -197,7 +199,7 @@ func (p *Scheduler) AddWork(ctx context.Context, repo string, val *events.XRPCSt } func (p *Scheduler) worker() { - log.Debugf("starting autoscaling worker for %s", p.ident) + p.log.Debug("starting autoscaling worker", "ident", p.ident) p.workersActive.Inc() p.workerGroup.Add(1) defer p.workerGroup.Done() @@ -205,21 +207,21 @@ func (p *Scheduler) worker() { for work != nil { // Check if the work item contains a signal to stop the worker. 
if work.signal == "stop" { - log.Debugf("stopping autoscaling worker for %s", p.ident) + p.log.Debug("stopping autoscaling worker", "ident", p.ident) p.workersActive.Dec() return } p.itemsActive.Inc() if err := p.do(context.TODO(), work.val); err != nil { - log.Errorf("event handler failed: %s", err) + p.log.Error("event handler failed", "err", err) } p.itemsProcessed.Inc() p.lk.Lock() rem, ok := p.active[work.repo] if !ok { - log.Errorf("should always have an 'active' entry if a worker is processing a job") + p.log.Error("should always have an 'active' entry if a worker is processing a job") } if len(rem) == 0 { diff --git a/events/schedulers/parallel/parallel.go b/events/schedulers/parallel/parallel.go index 8862c94d6..296f6d8d5 100644 --- a/events/schedulers/parallel/parallel.go +++ b/events/schedulers/parallel/parallel.go @@ -2,17 +2,15 @@ package parallel import ( "context" + "log/slog" "sync" "github.com/bluesky-social/indigo/events" "github.com/bluesky-social/indigo/events/schedulers" - logging "github.com/ipfs/go-log" "github.com/prometheus/client_golang/prometheus" ) -var log = logging.Logger("parallel-scheduler") - // Scheduler is a parallel scheduler that will run work on a fixed number of workers type Scheduler struct { maxConcurrency int @@ -33,6 +31,8 @@ type Scheduler struct { itemsProcessed prometheus.Counter itemsActive prometheus.Counter workesActive prometheus.Gauge + + log *slog.Logger } func NewScheduler(maxC, maxQ int, ident string, do func(context.Context, *events.XRPCStreamEvent) error) *Scheduler { @@ -52,6 +52,8 @@ func NewScheduler(maxC, maxQ int, ident string, do func(context.Context, *events itemsProcessed: schedulers.WorkItemsProcessed.WithLabelValues(ident, "parallel"), itemsActive: schedulers.WorkItemsActive.WithLabelValues(ident, "parallel"), workesActive: schedulers.WorkersActive.WithLabelValues(ident, "parallel"), + + log: slog.Default().With("system", "parallel-scheduler"), } for i := 0; i < maxC; i++ { @@ -64,7 +66,7 @@ func NewScheduler(maxC, maxQ int, ident string, do func(context.Context, *events } func (p *Scheduler) Shutdown() { - log.Infof("shutting down parallel scheduler for %s", p.ident) + p.log.Info("shutting down parallel scheduler", "ident", p.ident) for i := 0; i < p.maxConcurrency; i++ { p.feeder <- &consumerTask{ @@ -78,7 +80,7 @@ func (p *Scheduler) Shutdown() { <-p.out } - log.Info("parallel scheduler shutdown complete") + p.log.Info("parallel scheduler shutdown complete") } type consumerTask struct { @@ -123,14 +125,14 @@ func (p *Scheduler) worker() { p.itemsActive.Inc() if err := p.do(context.TODO(), work.val); err != nil { - log.Errorf("event handler failed: %s", err) + p.log.Error("event handler failed", "err", err) } p.itemsProcessed.Inc() p.lk.Lock() rem, ok := p.active[work.repo] if !ok { - log.Errorf("should always have an 'active' entry if a worker is processing a job") + p.log.Error("should always have an 'active' entry if a worker is processing a job") } if len(rem) == 0 { diff --git a/events/schedulers/sequential/sequential.go b/events/schedulers/sequential/sequential.go index 30de71b35..77d8087e4 100644 --- a/events/schedulers/sequential/sequential.go +++ b/events/schedulers/sequential/sequential.go @@ -2,14 +2,12 @@ package sequential import ( "context" - "github.com/bluesky-social/indigo/events" "github.com/bluesky-social/indigo/events/schedulers" - logging "github.com/ipfs/go-log" "github.com/prometheus/client_golang/prometheus" ) -var log = logging.Logger("sequential-scheduler") +// var log = 
slog.Default().With("system", "sequential-scheduler") // Scheduler is a sequential scheduler that will run work on a single worker type Scheduler struct { diff --git a/fakedata/accounts.go b/fakedata/accounts.go index 71c35165c..3337fc8f6 100644 --- a/fakedata/accounts.go +++ b/fakedata/accounts.go @@ -20,12 +20,8 @@ type AccountCatalog struct { func (ac *AccountCatalog) Combined() []AccountContext { var combined []AccountContext - for _, c := range ac.Celebs { - combined = append(combined, c) - } - for _, r := range ac.Regulars { - combined = append(combined, r) - } + combined = append(combined, ac.Celebs...) + combined = append(combined, ac.Regulars...) return combined } @@ -72,7 +68,7 @@ func ReadAccountCatalog(path string) (*AccountCatalog, error) { return nil, fmt.Errorf("account index didn't match: %d != %d (%s)", i, u.Index, u.AccountType) } } - log.Infof("loaded account catalog: regular=%d celebrity=%d", len(catalog.Regulars), len(catalog.Celebs)) + log.Info("loaded account catalog", "regular", len(catalog.Regulars), "celebrity", len(catalog.Celebs)) return catalog, nil } diff --git a/fakedata/generators.go b/fakedata/generators.go index 8b13af173..e07d875b6 100644 --- a/fakedata/generators.go +++ b/fakedata/generators.go @@ -7,6 +7,7 @@ import ( "bytes" "context" "fmt" + "log/slog" "math/rand" "time" @@ -16,10 +17,13 @@ import ( "github.com/bluesky-social/indigo/xrpc" "github.com/brianvoe/gofakeit/v6" - logging "github.com/ipfs/go-log" ) -var log = logging.Logger("fakedata") +var log = slog.Default().With("system", "fakedata") + +func SetLogger(logger *slog.Logger) { + log = logger +} func MeasureIterations(name string) func(int) { start := time.Now() @@ -28,7 +32,7 @@ func MeasureIterations(name string) func(int) { return } total := time.Since(start) - log.Infof("%s wall runtime: count=%d total=%s mean=%s", name, count, total, total/time.Duration(count)) + log.Info("wall runtime", "name", name, "count", count, "total", total, "rate", total/time.Duration(count)) } } diff --git a/go.mod b/go.mod index da1a77d84..ff1c8ee77 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b github.com/adrg/xdg v0.5.0 github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de + github.com/bradfitz/gomemcache v0.0.0-20230905024940-24af94b03874 github.com/brianvoe/gofakeit/v6 v6.25.0 github.com/carlmjohnson/versioninfo v0.22.5 github.com/cockroachdb/pebble v1.1.2 @@ -30,8 +31,6 @@ require ( github.com/ipfs/go-ipld-cbor v0.1.0 github.com/ipfs/go-ipld-format v0.6.0 github.com/ipfs/go-libipfs v0.7.0 - github.com/ipfs/go-log v1.0.5 - github.com/ipfs/go-log/v2 v2.5.1 github.com/ipld/go-car v0.6.1-0.20230509095817-92d28eb23ba4 github.com/ipld/go-car/v2 v2.13.1 github.com/jackc/pgx/v5 v5.5.0 @@ -89,6 +88,8 @@ require ( github.com/go-redis/redis v6.15.9+incompatible // indirect github.com/golang/snappy v0.0.4 // indirect github.com/hashicorp/golang-lru v1.0.2 // indirect + github.com/ipfs/go-log v1.0.5 // indirect + github.com/ipfs/go-log/v2 v2.5.1 // indirect github.com/jackc/puddle/v2 v2.2.1 // indirect github.com/klauspost/compress v1.17.3 // indirect github.com/kr/pretty v0.3.1 // indirect diff --git a/go.sum b/go.sum index 446851e37..b72fdce5b 100644 --- a/go.sum +++ b/go.sum @@ -73,6 +73,8 @@ github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 
h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bradfitz/gomemcache v0.0.0-20230905024940-24af94b03874 h1:N7oVaKyGp8bttX0bfZGmcGkjz7DLQXhAn3DNd3T0ous= +github.com/bradfitz/gomemcache v0.0.0-20230905024940-24af94b03874/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= github.com/brianvoe/gofakeit/v6 v6.25.0 h1:ZpFjktOpLZUeF8q223o0rUuXtA+m5qW5srjvVi+JkXk= github.com/brianvoe/gofakeit/v6 v6.25.0/go.mod h1:Xj58BMSnFqcn/fAQeSK+/PLtC5kSb7FJIq4JyGa8vEs= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= diff --git a/indexer/crawler.go b/indexer/crawler.go index 3970bfd7d..526da9bb6 100644 --- a/indexer/crawler.go +++ b/indexer/crawler.go @@ -3,7 +3,9 @@ package indexer import ( "context" "fmt" + "log/slog" "sync" + "time" comatproto "github.com/bluesky-social/indigo/api/atproto" "github.com/bluesky-social/indigo/models" @@ -27,14 +29,18 @@ type CrawlDispatcher struct { doRepoCrawl func(context.Context, *crawlWork) error concurrency int + + log *slog.Logger + + done chan struct{} } -func NewCrawlDispatcher(repoFn func(context.Context, *crawlWork) error, concurrency int) (*CrawlDispatcher, error) { +func NewCrawlDispatcher(repoFn func(context.Context, *crawlWork) error, concurrency int, log *slog.Logger) (*CrawlDispatcher, error) { if concurrency < 1 { return nil, fmt.Errorf("must specify a non-zero positive integer for crawl dispatcher concurrency") } - return &CrawlDispatcher{ + out := &CrawlDispatcher{ ingest: make(chan *models.ActorInfo), repoSync: make(chan *crawlWork), complete: make(chan models.Uid), @@ -43,7 +49,12 @@ func NewCrawlDispatcher(repoFn func(context.Context, *crawlWork) error, concurre concurrency: concurrency, todo: make(map[models.Uid]*crawlWork), inProgress: make(map[models.Uid]*crawlWork), - }, nil + log: log, + done: make(chan struct{}), + } + go out.CatchupRepoGaugePoller() + + return out, nil } func (c *CrawlDispatcher) Run() { @@ -54,6 +65,10 @@ func (c *CrawlDispatcher) Run() { } } +func (c *CrawlDispatcher) Shutdown() { + close(c.done) +} + type catchupJob struct { evt *comatproto.SyncSubscribeRepos_Commit host *models.PDS @@ -173,13 +188,13 @@ func (c *CrawlDispatcher) dequeueJob(job *crawlWork) { } func (c *CrawlDispatcher) addToCatchupQueue(catchup *catchupJob) *crawlWork { - catchupEventsEnqueued.Inc() c.maplk.Lock() defer c.maplk.Unlock() // If the actor crawl is enqueued, we can append to the catchup queue which gets emptied during the crawl job, ok := c.todo[catchup.user.Uid] if ok { + catchupEventsEnqueued.WithLabelValues("todo").Inc() job.catchup = append(job.catchup, catchup) return nil } @@ -187,10 +202,12 @@ func (c *CrawlDispatcher) addToCatchupQueue(catchup *catchupJob) *crawlWork { // If the actor crawl is in progress, we can append to the nextr queue which gets emptied after the crawl job, ok = c.inProgress[catchup.user.Uid] if ok { + catchupEventsEnqueued.WithLabelValues("prog").Inc() job.next = append(job.next, catchup) return nil } + catchupEventsEnqueued.WithLabelValues("new").Inc() // Otherwise, we need to create a new crawl job for this actor and enqueue it cw := &crawlWork{ act: catchup.user, @@ -205,7 +222,7 @@ func (c *CrawlDispatcher) fetchWorker() { select { case job := <-c.repoSync: if err := c.doRepoCrawl(context.TODO(), job); err != nil { - log.Errorf("failed to perform repo crawl of %q: %s", job.act.Did, err) + c.log.Error("failed to perform repo crawl", "did", job.act.Did, "err", err) } // TODO: do we still 
just do this if it errors? @@ -269,3 +286,21 @@ func (c *CrawlDispatcher) RepoInSlowPath(ctx context.Context, uid models.Uid) bo return false } + +func (c *CrawlDispatcher) countReposInSlowPath() int { + c.maplk.Lock() + defer c.maplk.Unlock() + return len(c.inProgress) + len(c.todo) +} + +func (c *CrawlDispatcher) CatchupRepoGaugePoller() { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + for { + select { + case <-c.done: + case <-ticker.C: + catchupReposGauge.Set(float64(c.countReposInSlowPath())) + } + } +} diff --git a/indexer/indexer.go b/indexer/indexer.go index 18c9d3625..e6a324e9e 100644 --- a/indexer/indexer.go +++ b/indexer/indexer.go @@ -5,6 +5,7 @@ import ( "database/sql" "errors" "fmt" + "log/slog" "time" comatproto "github.com/bluesky-social/indigo/api/atproto" @@ -19,14 +20,11 @@ import ( "github.com/bluesky-social/indigo/xrpc" "github.com/ipfs/go-cid" - logging "github.com/ipfs/go-log" "go.opentelemetry.io/otel" "gorm.io/gorm" "gorm.io/gorm/clause" ) -var log = logging.Logger("indexer") - const MaxEventSliceLength = 1000000 const MaxOpsSliceLength = 200 @@ -45,6 +43,8 @@ type Indexer struct { SendRemoteFollow func(context.Context, string, uint) error CreateExternalUser func(context.Context, string) (*models.ActorInfo, error) ApplyPDSClientSettings func(*xrpc.Client) + + log *slog.Logger } func NewIndexer(db *gorm.DB, notifman notifs.NotificationManager, evtman *events.EventManager, didr did.Resolver, fetcher *RepoFetcher, crawl, aggregate, spider bool) (*Indexer, error) { @@ -65,10 +65,11 @@ func NewIndexer(db *gorm.DB, notifman notifs.NotificationManager, evtman *events return nil }, ApplyPDSClientSettings: func(*xrpc.Client) {}, + log: slog.Default().With("system", "indexer"), } if crawl { - c, err := NewCrawlDispatcher(fetcher.FetchAndIndexRepo, fetcher.MaxConcurrency) + c, err := NewCrawlDispatcher(fetcher.FetchAndIndexRepo, fetcher.MaxConcurrency, ix.log) if err != nil { return nil, err } @@ -80,11 +81,17 @@ func NewIndexer(db *gorm.DB, notifman notifs.NotificationManager, evtman *events return ix, nil } +func (ix *Indexer) Shutdown() { + if ix.Crawler != nil { + ix.Crawler.Shutdown() + } +} + func (ix *Indexer) HandleRepoEvent(ctx context.Context, evt *repomgr.RepoEvent) error { ctx, span := otel.Tracer("indexer").Start(ctx, "HandleRepoEvent") defer span.End() - log.Debugw("Handling Repo Event!", "uid", evt.User) + ix.log.Debug("Handling Repo Event!", "uid", evt.User) outops := make([]*comatproto.SyncSubscribeRepos_RepoOp, 0, len(evt.Ops)) for _, op := range evt.Ops { @@ -96,7 +103,7 @@ func (ix *Indexer) HandleRepoEvent(ctx context.Context, evt *repomgr.RepoEvent) }) if err := ix.handleRepoOp(ctx, evt, &op); err != nil { - log.Errorw("failed to handle repo op", "err", err) + ix.log.Error("failed to handle repo op", "err", err) } } @@ -113,7 +120,7 @@ func (ix *Indexer) HandleRepoEvent(ctx context.Context, evt *repomgr.RepoEvent) toobig = true } - log.Debugw("Sending event", "did", did) + ix.log.Debug("Sending event", "did", did) if err := ix.events.AddEvent(ctx, &events.XRPCStreamEvent{ RepoCommit: &comatproto.SyncSubscribeRepos_Commit{ Repo: did, @@ -191,7 +198,7 @@ func (ix *Indexer) crawlRecordReferences(ctx context.Context, op *repomgr.RepoOp if e.Type == "mention" { _, err := ix.GetUserOrMissing(ctx, e.Value) if err != nil { - log.Infow("failed to parse user mention", "ref", e.Value, "err", err) + ix.log.Info("failed to parse user mention", "ref", e.Value, "err", err) } } } @@ -199,13 +206,13 @@ func (ix *Indexer) crawlRecordReferences(ctx 
context.Context, op *repomgr.RepoOp if rec.Reply != nil { if rec.Reply.Parent != nil { if err := ix.crawlAtUriRef(ctx, rec.Reply.Parent.Uri); err != nil { - log.Infow("failed to crawl reply parent", "cid", op.RecCid, "replyuri", rec.Reply.Parent.Uri, "err", err) + ix.log.Info("failed to crawl reply parent", "cid", op.RecCid, "replyuri", rec.Reply.Parent.Uri, "err", err) } } if rec.Reply.Root != nil { if err := ix.crawlAtUriRef(ctx, rec.Reply.Root.Uri); err != nil { - log.Infow("failed to crawl reply root", "cid", op.RecCid, "rooturi", rec.Reply.Root.Uri, "err", err) + ix.log.Info("failed to crawl reply root", "cid", op.RecCid, "rooturi", rec.Reply.Root.Uri, "err", err) } } } @@ -214,27 +221,27 @@ func (ix *Indexer) crawlRecordReferences(ctx context.Context, op *repomgr.RepoOp case *bsky.FeedRepost: if rec.Subject != nil { if err := ix.crawlAtUriRef(ctx, rec.Subject.Uri); err != nil { - log.Infow("failed to crawl repost subject", "cid", op.RecCid, "subjecturi", rec.Subject.Uri, "err", err) + ix.log.Info("failed to crawl repost subject", "cid", op.RecCid, "subjecturi", rec.Subject.Uri, "err", err) } } return nil case *bsky.FeedLike: if rec.Subject != nil { if err := ix.crawlAtUriRef(ctx, rec.Subject.Uri); err != nil { - log.Infow("failed to crawl like subject", "cid", op.RecCid, "subjecturi", rec.Subject.Uri, "err", err) + ix.log.Info("failed to crawl like subject", "cid", op.RecCid, "subjecturi", rec.Subject.Uri, "err", err) } } return nil case *bsky.GraphFollow: _, err := ix.GetUserOrMissing(ctx, rec.Subject) if err != nil { - log.Infow("failed to crawl follow subject", "cid", op.RecCid, "subjectdid", rec.Subject, "err", err) + ix.log.Info("failed to crawl follow subject", "cid", op.RecCid, "subjectdid", rec.Subject, "err", err) } return nil case *bsky.GraphBlock: _, err := ix.GetUserOrMissing(ctx, rec.Subject) if err != nil { - log.Infow("failed to crawl follow subject", "cid", op.RecCid, "subjectdid", rec.Subject, "err", err) + ix.log.Info("failed to crawl follow subject", "cid", op.RecCid, "subjectdid", rec.Subject, "err", err) } return nil case *bsky.ActorProfile: @@ -246,7 +253,7 @@ func (ix *Indexer) crawlRecordReferences(ctx context.Context, op *repomgr.RepoOp case *bsky.FeedGenerator: return nil default: - log.Warnw("unrecognized record type (crawling references)", "record", op.Record, "collection", op.Collection) + ix.log.Warn("unrecognized record type (crawling references)", "record", op.Record, "collection", op.Collection) return nil } } @@ -287,7 +294,7 @@ func (ix *Indexer) createMissingUserRecord(ctx context.Context, did string) (*mo } func (ix *Indexer) addUserToCrawler(ctx context.Context, ai *models.ActorInfo) error { - log.Debugw("Sending user to crawler: ", "did", ai.Did) + ix.log.Debug("Sending user to crawler: ", "did", ai.Did) if ix.Crawler == nil { return nil } @@ -389,7 +396,7 @@ func (ix *Indexer) GetPost(ctx context.Context, uri string) (*models.FeedPost, e } func (ix *Indexer) handleRecordDelete(ctx context.Context, evt *repomgr.RepoEvent, op *repomgr.RepoOp, local bool) error { - log.Debugw("record delete event", "collection", op.Collection) + ix.log.Debug("record delete event", "collection", op.Collection) switch op.Collection { case "app.bsky.feed.post": @@ -405,7 +412,7 @@ func (ix *Indexer) handleRecordDelete(ctx context.Context, evt *repomgr.RepoEven fp, err := ix.GetPost(ctx, uri) if err != nil { if errors.Is(err, gorm.ErrRecordNotFound) { - log.Warnw("deleting post weve never seen before. 
Weird.", "user", evt.User, "rkey", op.Rkey) + ix.log.Warn("deleting post weve never seen before. Weird.", "user", evt.User, "rkey", op.Rkey) return nil } return err @@ -419,7 +426,7 @@ func (ix *Indexer) handleRecordDelete(ctx context.Context, evt *repomgr.RepoEven return err } - log.Warn("TODO: remove notifications on delete") + ix.log.Warn("TODO: remove notifications on delete") /* if err := ix.notifman.RemoveRepost(ctx, fp.Author, rr.ID, evt.User); err != nil { return nil, err @@ -460,7 +467,7 @@ func (ix *Indexer) handleRecordDeleteFeedLike(ctx context.Context, evt *repomgr. return err } - log.Warnf("need to delete vote notification") + ix.log.Warn("need to delete vote notification") return nil } @@ -471,7 +478,7 @@ func (ix *Indexer) handleRecordDeleteGraphFollow(ctx context.Context, evt *repom } if q.RowsAffected == 0 { - log.Warnw("attempted to delete follow we did not have a record for", "user", evt.User, "rkey", op.Rkey) + ix.log.Warn("attempted to delete follow we did not have a record for", "user", evt.User, "rkey", op.Rkey) return nil } @@ -479,7 +486,7 @@ func (ix *Indexer) handleRecordDeleteGraphFollow(ctx context.Context, evt *repom } func (ix *Indexer) handleRecordCreate(ctx context.Context, evt *repomgr.RepoEvent, op *repomgr.RepoOp, local bool) ([]uint, error) { - log.Debugw("record create event", "collection", op.Collection) + ix.log.Debug("record create event", "collection", op.Collection) var out []uint switch rec := op.Record.(type) { @@ -529,8 +536,9 @@ func (ix *Indexer) handleRecordCreate(ctx context.Context, evt *repomgr.RepoEven case *bsky.FeedGenerator: return out, nil case *bsky.ActorProfile: - log.Debugf("TODO: got actor profile record creation, need to do something with this") + ix.log.Debug("TODO: got actor profile record creation, need to do something with this") default: + ix.log.Warn("unrecognized record", "record", op.Record, "collection", op.Collection) return nil, fmt.Errorf("unrecognized record type (creation): %s", op.Collection) } @@ -603,7 +611,7 @@ func (ix *Indexer) handleRecordCreateGraphFollow(ctx context.Context, rec *bsky. } func (ix *Indexer) handleRecordUpdate(ctx context.Context, evt *repomgr.RepoEvent, op *repomgr.RepoOp, local bool) error { - log.Debugw("record update event", "collection", op.Collection) + ix.log.Debug("record update event", "collection", op.Collection) switch rec := op.Record.(type) { case *bsky.FeedPost: @@ -623,7 +631,7 @@ func (ix *Indexer) handleRecordUpdate(ctx context.Context, evt *repomgr.RepoEven if oldReply != newReply { // the 'replyness' of the post was changed... 
that's weird - log.Errorf("need to properly handle case where reply-ness of posts is changed") + ix.log.Error("need to properly handle case where reply-ness of posts is changed") return nil } @@ -634,7 +642,7 @@ func (ix *Indexer) handleRecordUpdate(ctx context.Context, evt *repomgr.RepoEven } if replyto.ID != fp.ReplyTo { - log.Errorf("post was changed to be a reply to a different post") + ix.log.Error("post was changed to be a reply to a different post") return nil } } @@ -687,7 +695,7 @@ func (ix *Indexer) handleRecordUpdate(ctx context.Context, evt *repomgr.RepoEven return ix.handleRecordCreateGraphFollow(ctx, rec, evt, op) case *bsky.ActorProfile: - log.Debugf("TODO: got actor profile record update, need to do something with this") + ix.log.Debug("TODO: got actor profile record update, need to do something with this") default: return fmt.Errorf("unrecognized record type (update): %s", op.Collection) } @@ -761,7 +769,7 @@ func (ix *Indexer) handleRecordCreateFeedPost(ctx context.Context, user models.U // we're likely filling in a missing reference if !maybe.Missing { // TODO: we've already processed this record creation - log.Warnw("potentially erroneous event, duplicate create", "rkey", rkey, "user", user) + ix.log.Warn("potentially erroneous event, duplicate create", "rkey", rkey, "user", user) } if err := ix.db.Clauses(clause.OnConflict{ @@ -785,7 +793,7 @@ func (ix *Indexer) handleRecordCreateFeedPost(ctx context.Context, user models.U } func (ix *Indexer) createMissingPostRecord(ctx context.Context, puri *util.ParsedUri) (*models.FeedPost, error) { - log.Warn("creating missing post record") + ix.log.Warn("creating missing post record") ai, err := ix.GetUserOrMissing(ctx, puri.Did) if err != nil { return nil, err @@ -807,7 +815,7 @@ func (ix *Indexer) addNewPostNotification(ctx context.Context, post *bsky.FeedPo if post.Reply != nil { replyto, err := ix.GetPost(ctx, post.Reply.Parent.Uri) if err != nil { - log.Error("probably shouldn't error when processing a reply to a not-found post") + ix.log.Error("probably shouldn't error when processing a reply to a not-found post") return err } diff --git a/indexer/keymgr.go b/indexer/keymgr.go index b13b980d6..9f9de2cd5 100644 --- a/indexer/keymgr.go +++ b/indexer/keymgr.go @@ -3,6 +3,7 @@ package indexer import ( "context" "fmt" + "log/slog" did "github.com/whyrusleeping/go-did" "go.opentelemetry.io/otel" @@ -12,6 +13,8 @@ type KeyManager struct { didr DidResolver signingKey *did.PrivKey + + log *slog.Logger } type DidResolver interface { @@ -22,6 +25,7 @@ func NewKeyManager(didr DidResolver, k *did.PrivKey) *KeyManager { return &KeyManager{ didr: didr, signingKey: k, + log: slog.Default().With("system", "indexer"), } } @@ -36,7 +40,7 @@ func (km *KeyManager) VerifyUserSignature(ctx context.Context, did string, sig [ err = k.Verify(msg, sig) if err != nil { - log.Warnw("signature failed to verify", "err", err, "did", did, "pubKey", k, "sigBytes", sig, "msgBytes", msg) + km.log.Warn("signature failed to verify", "err", err, "did", did, "pubKey", k, "sigBytes", sig, "msgBytes", msg) } return err } diff --git a/indexer/metrics.go b/indexer/metrics.go index 43c9968bb..8aa1d9799 100644 --- a/indexer/metrics.go +++ b/indexer/metrics.go @@ -25,10 +25,10 @@ var reposFetched = promauto.NewCounterVec(prometheus.CounterOpts{ Help: "Number of repos fetched", }, []string{"status"}) -var catchupEventsEnqueued = promauto.NewCounter(prometheus.CounterOpts{ +var catchupEventsEnqueued = promauto.NewCounterVec(prometheus.CounterOpts{ Name: 
"indexer_catchup_events_enqueued", Help: "Number of catchup events enqueued", -}) +}, []string{"how"}) var catchupEventsProcessed = promauto.NewCounter(prometheus.CounterOpts{ Name: "indexer_catchup_events_processed", @@ -39,3 +39,13 @@ var usersAddedToCatchupQueue = promauto.NewCounter(prometheus.CounterOpts{ Name: "indexer_users_added_to_catchup_queue", Help: "Number of users added to catchup queue", }) + +var catchupEventsFailed = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "indexer_catchup_events_failed", + Help: "Number of catchup events processed", +}, []string{"err"}) + +var catchupReposGauge = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "indexer_catchup_repos", + Help: "Number of repos waiting on catchup", +}) diff --git a/indexer/posts_test.go b/indexer/posts_test.go index ed21ab666..9b9fddb4a 100644 --- a/indexer/posts_test.go +++ b/indexer/posts_test.go @@ -50,7 +50,7 @@ func testIndexer(t *testing.T) *testIx { t.Fatal(err) } - cs, err := carstore.NewCarStore(cardb, cspath) + cs, err := carstore.NewCarStore(cardb, []string{cspath}) if err != nil { t.Fatal(err) } @@ -81,6 +81,7 @@ func (ix *testIx) Cleanup() { if ix.dir != "" { _ = os.RemoveAll(ix.dir) } + ix.ix.Shutdown() } // TODO: dedupe this out into some testing utility package diff --git a/indexer/repofetch.go b/indexer/repofetch.go index 6bc0fa19e..8ce68bb5f 100644 --- a/indexer/repofetch.go +++ b/indexer/repofetch.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "io/fs" + "log/slog" "sync" "github.com/bluesky-social/indigo/api/atproto" @@ -27,6 +28,7 @@ func NewRepoFetcher(db *gorm.DB, rm *repomgr.RepoManager, maxConcurrency int) *R Limiters: make(map[uint]*rate.Limiter), ApplyPDSClientSettings: func(*xrpc.Client) {}, MaxConcurrency: maxConcurrency, + log: slog.Default().With("system", "indexer"), } } @@ -40,6 +42,8 @@ type RepoFetcher struct { MaxConcurrency int ApplyPDSClientSettings func(*xrpc.Client) + + log *slog.Logger } func (rf *RepoFetcher) GetLimiter(pdsID uint) *rate.Limiter { @@ -50,8 +54,8 @@ func (rf *RepoFetcher) GetLimiter(pdsID uint) *rate.Limiter { } func (rf *RepoFetcher) GetOrCreateLimiter(pdsID uint, pdsrate float64) *rate.Limiter { - rf.LimitMux.RLock() - defer rf.LimitMux.RUnlock() + rf.LimitMux.Lock() + defer rf.LimitMux.Unlock() lim, ok := rf.Limiters[pdsID] if !ok { @@ -84,7 +88,7 @@ func (rf *RepoFetcher) fetchRepo(ctx context.Context, c *xrpc.Client, pds *model // Wait to prevent DOSing the PDS when connecting to a new stream with lots of active repos limiter.Wait(ctx) - log.Debugw("SyncGetRepo", "did", did, "since", rev) + rf.log.Debug("SyncGetRepo", "did", did, "since", rev) // TODO: max size on these? 
A malicious PDS could just send us a petabyte sized repo here and kill us repo, err := atproto.SyncGetRepo(ctx, c, did, rev) if err != nil { @@ -107,11 +111,13 @@ func (rf *RepoFetcher) FetchAndIndexRepo(ctx context.Context, job *crawlWork) er var pds models.PDS if err := rf.db.First(&pds, "id = ?", ai.PDS).Error; err != nil { + catchupEventsFailed.WithLabelValues("nopds").Inc() return fmt.Errorf("expected to find pds record (%d) in db for crawling one of their users: %w", ai.PDS, err) } rev, err := rf.repoman.GetRepoRev(ctx, ai.Uid) if err != nil && !isNotFound(err) { + catchupEventsFailed.WithLabelValues("noroot").Inc() return fmt.Errorf("failed to get repo root: %w", err) } @@ -123,7 +129,7 @@ func (rf *RepoFetcher) FetchAndIndexRepo(ctx context.Context, job *crawlWork) er for i, j := range job.catchup { catchupEventsProcessed.Inc() if err := rf.repoman.HandleExternalUserEvent(ctx, pds.ID, ai.Uid, ai.Did, j.evt.Since, j.evt.Rev, j.evt.Blocks, j.evt.Ops); err != nil { - log.Errorw("buffered event catchup failed", "error", err, "did", ai.Did, "i", i, "jobCount", len(job.catchup), "seq", j.evt.Seq) + rf.log.Error("buffered event catchup failed", "error", err, "did", ai.Did, "i", i, "jobCount", len(job.catchup), "seq", j.evt.Seq) resync = true // fall back to a repo sync break } @@ -151,7 +157,7 @@ func (rf *RepoFetcher) FetchAndIndexRepo(ctx context.Context, job *crawlWork) er span.RecordError(err) if ipld.IsNotFound(err) || errors.Is(err, io.EOF) || errors.Is(err, fs.ErrNotExist) { - log.Errorw("partial repo fetch was missing data", "did", ai.Did, "pds", pds.Host, "rev", rev) + rf.log.Error("partial repo fetch was missing data", "did", ai.Did, "pds", pds.Host, "rev", rev) repo, err := rf.fetchRepo(ctx, c, &pds, ai.Did, "") if err != nil { return err diff --git a/lex/type_schema.go b/lex/type_schema.go index fcec3575a..62c5e6702 100644 --- a/lex/type_schema.go +++ b/lex/type_schema.go @@ -223,7 +223,7 @@ func (s *TypeSchema) WriteHandlerStub(w io.Writer, fname, shortname, impname str } returndef = fmt.Sprintf("(*%s.%s, error)", impname, outname) case "application/cbor", "application/vnd.ipld.car", "*/*": - returndef = fmt.Sprintf("(io.Reader, error)") + returndef = "(io.Reader, error)" default: return fmt.Errorf("unrecognized output encoding (handler stub): %q", s.Output.Encoding) } diff --git a/pds/feedgen.go b/pds/feedgen.go index aacb775be..9ad51387a 100644 --- a/pds/feedgen.go +++ b/pds/feedgen.go @@ -3,6 +3,7 @@ package pds import ( "context" "fmt" + "log/slog" "sort" "strings" "time" @@ -22,13 +23,16 @@ type FeedGenerator struct { ix *indexer.Indexer readRecord ReadRecordFunc + + log *slog.Logger } -func NewFeedGenerator(db *gorm.DB, ix *indexer.Indexer, readRecord ReadRecordFunc) (*FeedGenerator, error) { +func NewFeedGenerator(db *gorm.DB, ix *indexer.Indexer, readRecord ReadRecordFunc, log *slog.Logger) (*FeedGenerator, error) { return &FeedGenerator{ db: db, ix: ix, readRecord: readRecord, + log: log, }, nil } @@ -355,7 +359,7 @@ func (fg *FeedGenerator) hydrateVote(ctx context.Context, v *models.VoteRecord) func (fg *FeedGenerator) GetVotes(ctx context.Context, uri string, pcid cid.Cid, limit int, before string) ([]*HydratedVote, error) { if before != "" { - log.Warn("not respecting 'before' yet") + fg.log.Warn("not respecting 'before' yet") } p, err := fg.ix.GetPost(ctx, uri) diff --git a/pds/handlers_test.go b/pds/handlers_test.go index fe2bb14b8..9cecd3f91 100644 --- a/pds/handlers_test.go +++ b/pds/handlers_test.go @@ -29,7 +29,7 @@ func testCarStore(t *testing.T, db 
*gorm.DB) (carstore.CarStore, func()) { t.Fatal(err) } - cs, err := carstore.NewCarStore(db, sharddir) + cs, err := carstore.NewCarStore(db, []string{sharddir}) if err != nil { t.Fatal(err) } diff --git a/pds/server.go b/pds/server.go index 54f1dfed1..c883012ed 100644 --- a/pds/server.go +++ b/pds/server.go @@ -5,6 +5,7 @@ import ( "database/sql" "errors" "fmt" + "log/slog" "net" "net/http" "net/mail" @@ -14,7 +15,6 @@ import ( "github.com/bluesky-social/indigo/api/atproto" comatproto "github.com/bluesky-social/indigo/api/atproto" - bsky "github.com/bluesky-social/indigo/api/bsky" "github.com/bluesky-social/indigo/carstore" "github.com/bluesky-social/indigo/events" "github.com/bluesky-social/indigo/indexer" @@ -30,7 +30,6 @@ import ( gojwt "github.com/golang-jwt/jwt" "github.com/gorilla/websocket" "github.com/ipfs/go-cid" - logging "github.com/ipfs/go-log" "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" "github.com/lestrrat-go/jwx/v2/jwt" @@ -38,8 +37,6 @@ import ( "gorm.io/gorm" ) -var log = logging.Logger("pds") - type Server struct { db *gorm.DB cs carstore.CarStore @@ -57,6 +54,8 @@ type Server struct { serviceUrl string plc plc.PLCClient + + log *slog.Logger } // serverListenerBootTimeout is how long to wait for the requested server socket @@ -97,18 +96,20 @@ func NewServer(db *gorm.DB, cs carstore.CarStore, serkey *did.PrivKey, handleSuf serviceUrl: serviceUrl, jwtSigningKey: jwtkey, enforcePeering: false, + + log: slog.Default().With("system", "pds"), } repoman.SetEventHandler(func(ctx context.Context, evt *repomgr.RepoEvent) { if err := ix.HandleRepoEvent(ctx, evt); err != nil { - log.Errorw("handle repo event failed", "user", evt.User, "err", err) + s.log.Error("handle repo event failed", "user", evt.User, "err", err) } }, true) //ix.SendRemoteFollow = s.sendRemoteFollow ix.CreateExternalUser = s.createExternalUser - feedgen, err := NewFeedGenerator(db, ix, s.readRecordFunc) + feedgen, err := NewFeedGenerator(db, ix, s.readRecordFunc, s.log) if err != nil { return nil, err } @@ -200,15 +201,6 @@ func (s *Server) createExternalUser(ctx context.Context, did string) (*models.Ac handle = hurl.Host } - profile, err := bsky.ActorGetProfile(ctx, c, did) - if err != nil { - return nil, err - } - - if handle != profile.Handle { - return nil, fmt.Errorf("mismatch in handle between did document and pds profile (%s != %s)", handle, profile.Handle) - } - // TODO: request this users info from their server to fill out our data... u := User{ Handle: handle, @@ -225,7 +217,7 @@ func (s *Server) createExternalUser(ctx context.Context, did string) (*models.Ac subj := &models.ActorInfo{ Uid: u.ID, Handle: sql.NullString{String: handle, Valid: true}, - DisplayName: *profile.DisplayName, + DisplayName: "missing display name", Did: did, Type: "", PDS: peering.ID, @@ -434,7 +426,7 @@ type HealthStatus struct { func (s *Server) HandleHealthCheck(c echo.Context) error { if err := s.db.Exec("SELECT 1").Error; err != nil { - log.Errorf("healthcheck can't connect to database: %v", err) + s.log.Error("healthcheck can't connect to database", "err", err) return c.JSON(500, HealthStatus{Status: "error", Message: "can't connect to database"}) } else { return c.JSON(200, HealthStatus{Status: "ok"}) @@ -726,7 +718,7 @@ func (s *Server) EventsHandler(c echo.Context) error { func (s *Server) UpdateUserHandle(ctx context.Context, u *User, handle string) error { if u.Handle == handle { // no change? 
move on - log.Warnw("attempted to change handle to current handle", "did", u.Did, "handle", handle) + s.log.Warn("attempted to change handle to current handle", "did", u.Did, "handle", handle) return nil } diff --git a/plc/memcached.go b/plc/memcached.go new file mode 100644 index 000000000..50b0e2925 --- /dev/null +++ b/plc/memcached.go @@ -0,0 +1,90 @@ +package plc + +import ( + "context" + "encoding/json" + "github.com/bradfitz/gomemcache/memcache" + "go.opentelemetry.io/otel/attribute" + "time" + + "github.com/bluesky-social/indigo/did" + "go.opentelemetry.io/otel" +) + +type MemcachedDidResolver struct { + mcd *memcache.Client + res did.Resolver + maxAge int32 +} + +func NewMemcachedDidResolver(res did.Resolver, maxAge time.Duration, servers []string) *MemcachedDidResolver { + expiry := int32(0) + if maxAge.Seconds() > (30 * 24 * 60 * 60) { + // clamp expiry at 30 days minus a minute for memcached + expiry = (30 * 24 * 60 * 60) - 60 + } else { + expiry = int32(maxAge.Seconds()) + } + client := memcache.New(servers...) + return &MemcachedDidResolver{ + mcd: client, + res: res, + maxAge: expiry, + } +} + +func (r *MemcachedDidResolver) FlushCacheFor(didstr string) { + r.mcd.Delete(didstr) + r.res.FlushCacheFor(didstr) +} + +func (r *MemcachedDidResolver) tryCache(didstr string) (*did.Document, bool) { + ob, err := r.mcd.Get(didstr) + if (ob == nil) || (err != nil) { + return nil, false + } + var doc did.Document + err = json.Unmarshal(ob.Value, &doc) + if err != nil { + // TODO: log error? + return nil, false + } + + return &doc, true +} + +func (r *MemcachedDidResolver) putCache(did string, doc *did.Document) { + blob, err := json.Marshal(doc) + if err != nil { + // TODO: log error + return + } + item := memcache.Item{ + Key: did, + Value: blob, + Expiration: int32(r.maxAge), + } + r.mcd.Set(&item) +} + +func (r *MemcachedDidResolver) GetDocument(ctx context.Context, didstr string) (*did.Document, error) { + ctx, span := otel.Tracer("cacheResolver").Start(ctx, "getDocument") + defer span.End() + + doc, ok := r.tryCache(didstr) + if ok { + span.SetAttributes(attribute.Bool("cache", true)) + memcacheHitsTotal.Inc() + return doc, nil + } + memcacheMissesTotal.Inc() + span.SetAttributes(attribute.Bool("cache", false)) + + doc, err := r.res.GetDocument(ctx, didstr) + if err != nil { + return nil, err + } + + r.putCache(didstr, doc) + return doc, nil +} diff --git a/plc/metrics.go b/plc/metrics.go index 3a2521bd9..ea2af2022 100644 --- a/plc/metrics.go +++ b/plc/metrics.go @@ -14,3 +14,13 @@ var cacheMissesTotal = promauto.NewCounter(prometheus.CounterOpts{ Name: "plc_cache_misses_total", Help: "Total number of cache misses", }) + +var memcacheHitsTotal = promauto.NewCounter(prometheus.CounterOpts{ + Name: "plc_memcache_hits_total", + Help: "Total number of cache hits", +}) + +var memcacheMissesTotal = promauto.NewCounter(prometheus.CounterOpts{ + Name: "plc_memcache_misses_total", + Help: "Total number of cache misses", +}) diff --git a/repomgr/bench_test.go b/repomgr/bench_test.go index 271813909..1137474f0 100644 --- a/repomgr/bench_test.go +++ b/repomgr/bench_test.go @@ -54,12 +54,20 @@ func BenchmarkRepoMgrCreates(b *testing.B) { b.Fatal(err) } - cs, err := carstore.NewCarStore(cardb, cspath) + // TODO: constructor for 'either type' + /* + cs, err := carstore.NewCarStore(cardb, []string{cspath}) + if err != nil { + b.Fatal(err) + } + */ + cs, err := carstore.NewNonArchivalCarstore(cardb) if err != nil { b.Fatal(err) } repoman := NewRepoManager(cs, &util.FakeKeyManager{}) + 
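The new `plc/memcached.go` above wraps any existing `did.Resolver` in a memcached-backed cache. A rough usage sketch, assuming a memcached instance on `localhost:11211` and using a bare `MultiResolver` as a stand-in for however the underlying resolver is really constructed:

```go
package main

import (
	"context"
	"log"
	"time"

	"github.com/bluesky-social/indigo/did"
	"github.com/bluesky-social/indigo/plc"
)

func main() {
	// Stand-in for the real underlying resolver; constructing it and
	// registering handlers on it is out of scope for this sketch.
	base := did.NewMultiResolver()

	// Cache resolved DID documents in memcached for up to an hour;
	// the constructor clamps TTLs near memcached's ~30-day limit.
	res := plc.NewMemcachedDidResolver(base, time.Hour, []string{"localhost:11211"})

	// The DID value here is a placeholder for illustration.
	if _, err := res.GetDocument(context.TODO(), "did:plc:example"); err != nil {
		log.Fatal(err)
	}
	log.Println("DID document resolved and cached")
}
```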
repoman.noArchive = true ctx := context.TODO() if err := repoman.InitNewActor(ctx, 1, "hello.world", "did:foo:bar", "catdog", "", ""); err != nil { diff --git a/repomgr/ingest_test.go b/repomgr/ingest_test.go index dcb9097ac..602a06c38 100644 --- a/repomgr/ingest_test.go +++ b/repomgr/ingest_test.go @@ -50,7 +50,7 @@ func TestLoadNewRepo(t *testing.T) { t.Fatal(err) } - cs, err := carstore.NewCarStore(cardb, cspath) + cs, err := carstore.NewCarStore(cardb, []string{cspath}) if err != nil { t.Fatal(err) } @@ -69,7 +69,7 @@ func TestLoadNewRepo(t *testing.T) { } } -func testCarstore(t *testing.T, dir string) carstore.CarStore { +func testCarstore(t *testing.T, dir string, archive bool) carstore.CarStore { cardb, err := gorm.Open(sqlite.Open(filepath.Join(dir, "car.sqlite"))) if err != nil { t.Fatal(err) @@ -80,12 +80,20 @@ func testCarstore(t *testing.T, dir string) carstore.CarStore { t.Fatal(err) } - cs, err := carstore.NewCarStore(cardb, cspath) - if err != nil { - t.Fatal(err) - } + if archive { + cs, err := carstore.NewCarStore(cardb, []string{cspath}) + if err != nil { + t.Fatal(err) + } + return cs + } else { + cs, err := carstore.NewNonArchivalCarstore(cardb) + if err != nil { + t.Fatal(err) + } - return cs + return cs + } } func TestIngestWithGap(t *testing.T) { @@ -106,7 +114,7 @@ func TestIngestWithGap(t *testing.T) { Uid: 1, }) - cs := testCarstore(t, dir) + cs := testCarstore(t, dir, true) repoman := NewRepoManager(cs, &util.FakeKeyManager{}) @@ -114,7 +122,7 @@ func TestIngestWithGap(t *testing.T) { if err != nil { t.Fatal(err) } - cs2 := testCarstore(t, dir2) + cs2 := testCarstore(t, dir2, true) var since *string ctx := context.TODO() @@ -198,7 +206,7 @@ func TestDuplicateRecord(t *testing.T) { Uid: 1, }) - cs := testCarstore(t, dir) + cs := testCarstore(t, dir, true) repoman := NewRepoManager(cs, &util.FakeKeyManager{}) @@ -208,14 +216,14 @@ func TestDuplicateRecord(t *testing.T) { } p1, _, err := repoman.CreateRecord(ctx, 1, "app.bsky.feed.post", &bsky.FeedPost{ - Text: fmt.Sprintf("hello friend"), + Text: "hello friend", }) if err != nil { t.Fatal(err) } p2, _, err := repoman.CreateRecord(ctx, 1, "app.bsky.feed.post", &bsky.FeedPost{ - Text: fmt.Sprintf("hello friend"), + Text: "hello friend", }) if err != nil { t.Fatal(err) diff --git a/repomgr/metrics.go b/repomgr/metrics.go index a92e3091d..df3e4bea1 100644 --- a/repomgr/metrics.go +++ b/repomgr/metrics.go @@ -9,3 +9,21 @@ var repoOpsImported = promauto.NewCounter(prometheus.CounterOpts{ Name: "repomgr_repo_ops_imported", Help: "Number of repo ops imported", }) + +var openAndSigCheckDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "repomgr_open_and_sig_check_duration", + Help: "Duration of opening and signature check", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), +}) + +var calcDiffDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "repomgr_calc_diff_duration", + Help: "Duration of calculating diff", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), +}) + +var writeCarSliceDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "repomgr_write_car_slice_duration", + Help: "Duration of writing car slice", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), +}) diff --git a/repomgr/repomgr.go b/repomgr/repomgr.go index ad90a43b2..d2c3766f3 100644 --- a/repomgr/repomgr.go +++ b/repomgr/repomgr.go @@ -6,8 +6,10 @@ import ( "errors" "fmt" "io" + "log/slog" "strings" "sync" + "time" atproto "github.com/bluesky-social/indigo/api/atproto" bsky 
"github.com/bluesky-social/indigo/api/bsky" @@ -23,7 +25,6 @@ import ( "github.com/ipfs/go-datastore" blockstore "github.com/ipfs/go-ipfs-blockstore" ipld "github.com/ipfs/go-ipld-format" - logging "github.com/ipfs/go-log/v2" "github.com/ipld/go-car" cbg "github.com/whyrusleeping/cbor-gen" "go.opentelemetry.io/otel" @@ -31,14 +32,19 @@ import ( "gorm.io/gorm" ) -var log = logging.Logger("repomgr") - func NewRepoManager(cs carstore.CarStore, kmgr KeyManager) *RepoManager { + var noArchive bool + if _, ok := cs.(*carstore.NonArchivalCarstore); ok { + noArchive = true + } + return &RepoManager{ cs: cs, userLocks: make(map[models.Uid]*userLock), kmgr: kmgr, + log: slog.Default().With("system", "repomgr"), + noArchive: noArchive, } } @@ -61,6 +67,9 @@ type RepoManager struct { events func(context.Context, *RepoEvent) hydrateRecords bool + + log *slog.Logger + noArchive bool } type ActorInfo struct { @@ -528,16 +537,122 @@ func (rm *RepoManager) CheckRepoSig(ctx context.Context, r *repo.Repo, expdid st } func (rm *RepoManager) HandleExternalUserEvent(ctx context.Context, pdsid uint, uid models.Uid, did string, since *string, nrev string, carslice []byte, ops []*atproto.SyncSubscribeRepos_RepoOp) error { + if rm.noArchive { + return rm.handleExternalUserEventNoArchive(ctx, pdsid, uid, did, since, nrev, carslice, ops) + } else { + return rm.handleExternalUserEventArchive(ctx, pdsid, uid, did, since, nrev, carslice, ops) + } +} + +func (rm *RepoManager) handleExternalUserEventNoArchive(ctx context.Context, pdsid uint, uid models.Uid, did string, since *string, nrev string, carslice []byte, ops []*atproto.SyncSubscribeRepos_RepoOp) error { ctx, span := otel.Tracer("repoman").Start(ctx, "HandleExternalUserEvent") defer span.End() span.SetAttributes(attribute.Int64("uid", int64(uid))) - log.Debugw("HandleExternalUserEvent", "pds", pdsid, "uid", uid, "since", since, "nrev", nrev) + rm.log.Debug("HandleExternalUserEvent", "pds", pdsid, "uid", uid, "since", since, "nrev", nrev) unlock := rm.lockUser(ctx, uid) defer unlock() + start := time.Now() + root, ds, err := rm.cs.ImportSlice(ctx, uid, since, carslice) + if err != nil { + return fmt.Errorf("importing external carslice: %w", err) + } + + r, err := repo.OpenRepo(ctx, ds, root) + if err != nil { + return fmt.Errorf("opening external user repo (%d, root=%s): %w", uid, root, err) + } + + if err := rm.CheckRepoSig(ctx, r, did); err != nil { + return fmt.Errorf("check repo sig: %w", err) + } + openAndSigCheckDuration.Observe(time.Since(start).Seconds()) + + evtops := make([]RepoOp, 0, len(ops)) + for _, op := range ops { + parts := strings.SplitN(op.Path, "/", 2) + if len(parts) != 2 { + return fmt.Errorf("invalid rpath in mst diff, must have collection and rkey") + } + + switch EventKind(op.Action) { + case EvtKindCreateRecord: + rop := RepoOp{ + Kind: EvtKindCreateRecord, + Collection: parts[0], + Rkey: parts[1], + RecCid: (*cid.Cid)(op.Cid), + } + + if rm.hydrateRecords { + _, rec, err := r.GetRecord(ctx, op.Path) + if err != nil { + return fmt.Errorf("reading changed record from car slice: %w", err) + } + rop.Record = rec + } + + evtops = append(evtops, rop) + case EvtKindUpdateRecord: + rop := RepoOp{ + Kind: EvtKindUpdateRecord, + Collection: parts[0], + Rkey: parts[1], + RecCid: (*cid.Cid)(op.Cid), + } + + if rm.hydrateRecords { + _, rec, err := r.GetRecord(ctx, op.Path) + if err != nil { + return fmt.Errorf("reading changed record from car slice: %w", err) + } + + rop.Record = rec + } + + evtops = append(evtops, rop) + case EvtKindDeleteRecord: + 
evtops = append(evtops, RepoOp{ + Kind: EvtKindDeleteRecord, + Collection: parts[0], + Rkey: parts[1], + }) + default: + return fmt.Errorf("unrecognized external user event kind: %q", op.Action) + } + } + + if rm.events != nil { + rm.events(ctx, &RepoEvent{ + User: uid, + //OldRoot: prev, + NewRoot: root, + Rev: nrev, + Since: since, + Ops: evtops, + RepoSlice: carslice, + PDS: pdsid, + }) + } + + return nil +} + +func (rm *RepoManager) handleExternalUserEventArchive(ctx context.Context, pdsid uint, uid models.Uid, did string, since *string, nrev string, carslice []byte, ops []*atproto.SyncSubscribeRepos_RepoOp) error { + ctx, span := otel.Tracer("repoman").Start(ctx, "HandleExternalUserEvent") + defer span.End() + + span.SetAttributes(attribute.Int64("uid", int64(uid))) + + rm.log.Debug("HandleExternalUserEvent", "pds", pdsid, "uid", uid, "since", since, "nrev", nrev) + + unlock := rm.lockUser(ctx, uid) + defer unlock() + + start := time.Now() root, ds, err := rm.cs.ImportSlice(ctx, uid, since, carslice) if err != nil { return fmt.Errorf("importing external carslice: %w", err) @@ -551,6 +666,7 @@ func (rm *RepoManager) HandleExternalUserEvent(ctx context.Context, pdsid uint, if err := rm.CheckRepoSig(ctx, r, did); err != nil { return err } + openAndSigCheckDuration.Observe(time.Since(start).Seconds()) var skipcids map[cid.Cid]bool if ds.BaseCid().Defined() { @@ -571,10 +687,11 @@ func (rm *RepoManager) HandleExternalUserEvent(ctx context.Context, pdsid uint, } } + start = time.Now() if err := ds.CalcDiff(ctx, skipcids); err != nil { return fmt.Errorf("failed while calculating mst diff (since=%v): %w", since, err) - } + calcDiffDuration.Observe(time.Since(start).Seconds()) evtops := make([]RepoOp, 0, len(ops)) @@ -631,10 +748,12 @@ func (rm *RepoManager) HandleExternalUserEvent(ctx context.Context, pdsid uint, } } + start = time.Now() rslice, err := ds.CloseWithRoot(ctx, root, nrev) if err != nil { return fmt.Errorf("close with root: %w", err) } + writeCarSliceDuration.Observe(time.Since(start).Seconds()) if rm.events != nil { rm.events(ctx, &RepoEvent{ @@ -829,9 +948,9 @@ func (rm *RepoManager) ImportNewRepo(ctx context.Context, user models.Uid, repoD ops := make([]RepoOp, 0, len(diffops)) for _, op := range diffops { repoOpsImported.Inc() - out, err := processOp(ctx, bs, op, rm.hydrateRecords) + out, err := rm.processOp(ctx, bs, op, rm.hydrateRecords) if err != nil { - log.Errorw("failed to process repo op", "err", err, "path", op.Rpath, "repo", repoDid) + rm.log.Error("failed to process repo op", "err", err, "path", op.Rpath, "repo", repoDid) } if out != nil { @@ -865,7 +984,7 @@ func (rm *RepoManager) ImportNewRepo(ctx context.Context, user models.Uid, repoD return nil } -func processOp(ctx context.Context, bs blockstore.Blockstore, op *mst.DiffOp, hydrateRecords bool) (*RepoOp, error) { +func (rm *RepoManager) processOp(ctx context.Context, bs blockstore.Blockstore, op *mst.DiffOp, hydrateRecords bool) (*RepoOp, error) { parts := strings.SplitN(op.Rpath, "/", 2) if len(parts) != 2 { return nil, fmt.Errorf("repo mst had invalid rpath: %q", op.Rpath) @@ -898,7 +1017,7 @@ func processOp(ctx context.Context, bs blockstore.Blockstore, op *mst.DiffOp, hy return nil, err } - log.Warnf("failed processing repo diff: %s", err) + rm.log.Warn("failed processing repo diff", "err", err) } else { outop.Record = rec } @@ -954,7 +1073,7 @@ func (rm *RepoManager) processNewRepo(ctx context.Context, user models.Uid, r io // the repos lifecycle, this will end up erroneously not including // them. 
We should compute the set of blocks needed to read any repo // ops that happened in the commit and use that for our 'output' blocks - cids, err := walkTree(ctx, seen, root, membs, true) + cids, err := rm.walkTree(ctx, seen, root, membs, true) if err != nil { return fmt.Errorf("walkTree: %w", err) } @@ -995,7 +1114,7 @@ func stringOrNil(s *string) string { // walkTree returns all cids linked recursively by the root, skipping any cids // in the 'skip' map, and not erroring on 'not found' if prevMissing is set -func walkTree(ctx context.Context, skip map[cid.Cid]bool, root cid.Cid, bs blockstore.Blockstore, prevMissing bool) ([]cid.Cid, error) { +func (rm *RepoManager) walkTree(ctx context.Context, skip map[cid.Cid]bool, root cid.Cid, bs blockstore.Blockstore, prevMissing bool) ([]cid.Cid, error) { // TODO: what if someone puts non-cbor links in their repo? if root.Prefix().Codec != cid.DagCBOR { return nil, fmt.Errorf("can only handle dag-cbor objects in repos (%s is %d)", root, root.Prefix().Codec) @@ -1009,7 +1128,7 @@ func walkTree(ctx context.Context, skip map[cid.Cid]bool, root cid.Cid, bs block var links []cid.Cid if err := cbg.ScanForLinks(bytes.NewReader(blk.RawData()), func(c cid.Cid) { if c.Prefix().Codec == cid.Raw { - log.Debugw("skipping 'raw' CID in record", "recordCid", root, "rawCid", c) + rm.log.Debug("skipping 'raw' CID in record", "recordCid", root, "rawCid", c) return } if skip[c] { @@ -1029,7 +1148,7 @@ func walkTree(ctx context.Context, skip map[cid.Cid]bool, root cid.Cid, bs block // TODO: should do this non-recursive since i expect these may get deep for _, c := range links { - sub, err := walkTree(ctx, skip, c, bs, prevMissing) + sub, err := rm.walkTree(ctx, skip, c, bs, prevMissing) if err != nil { if prevMissing && !ipld.IsNotFound(err) { return nil, err diff --git a/search/firehose.go b/search/firehose.go index 0400b9648..6db633cc0 100644 --- a/search/firehose.go +++ b/search/firehose.go @@ -139,6 +139,7 @@ func (idx *Indexer) RunIndexer(ctx context.Context) error { idx.relayhost, rsc.EventHandler, ), + idx.logger, ) } diff --git a/search/handlers.go b/search/handlers.go index 735080dbd..a5d2b27ce 100644 --- a/search/handlers.go +++ b/search/handlers.go @@ -39,7 +39,7 @@ func parseCursorLimit(e echo.Context) (int, int, error) { if offset > 10000 { return 0, 0, &echo.HTTPError{ Code: 400, - Message: fmt.Sprintf("invalid value for 'cursor' (can't paginate so deep)"), + Message: "invalid value for 'cursor' (can't paginate so deep)", } } diff --git a/search/indexing.go b/search/indexing.go index 1a599be5a..2d2082a21 100644 --- a/search/indexing.go +++ b/search/indexing.go @@ -130,7 +130,7 @@ func NewIndexer(db *gorm.DB, escli *es.Client, dir identity.Directory, config In opts.SyncRequestsPerSecond = 8 } - opts.CheckoutPath = fmt.Sprintf("%s/xrpc/com.atproto.sync.getRepo", relayHTTP) + opts.RelayHost = relayHTTP if config.IndexMaxConcurrency > 0 { opts.ParallelRecordCreates = config.IndexMaxConcurrency } else { @@ -145,6 +145,8 @@ func NewIndexer(db *gorm.DB, escli *es.Client, dir identity.Directory, config In idx.handleDelete, opts, ) + // reuse identity directory (for efficient caching) + bf.Directory = dir idx.bfs = bfstore idx.bf = bf diff --git a/search/transform.go b/search/transform.go index c06cea46c..917b622d8 100644 --- a/search/transform.go +++ b/search/transform.go @@ -267,9 +267,7 @@ func parsePostTags(p *appbsky.FeedPost) []string { } } } - for _, t := range p.Tags { - ret = append(ret, t) - } + ret = append(ret, p.Tags...) 
if len(ret) == 0 { return nil } diff --git a/splitter/splitter.go b/splitter/splitter.go index ad55d226d..9859d147f 100644 --- a/splitter/splitter.go +++ b/splitter/splitter.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "io" + "log/slog" "math/rand" "net" "net/http" @@ -25,7 +26,6 @@ import ( "github.com/bluesky-social/indigo/util" "github.com/bluesky-social/indigo/xrpc" "github.com/gorilla/websocket" - logging "github.com/ipfs/go-log" "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" promclient "github.com/prometheus/client_golang/prometheus" @@ -33,8 +33,6 @@ import ( dto "github.com/prometheus/client_model/go" ) -var log = logging.Logger("splitter") - type Splitter struct { erb *EventRingBuffer pp *events.PebblePersist @@ -49,6 +47,8 @@ type Splitter struct { httpC *http.Client nextCrawlers []*url.URL + + log *slog.Logger } type SplitterConfig struct { @@ -59,6 +59,7 @@ type SplitterConfig struct { func NewSplitter(conf SplitterConfig, nextCrawlers []string) (*Splitter, error) { var nextCrawlerURLs []*url.URL + log := slog.Default().With("system", "splitter") if len(nextCrawlers) > 0 { nextCrawlerURLs = make([]*url.URL, len(nextCrawlers)) for i, tu := range nextCrawlers { @@ -67,7 +68,7 @@ func NewSplitter(conf SplitterConfig, nextCrawlers []string) (*Splitter, error) if err != nil { return nil, fmt.Errorf("failed to parse next-crawler url: %w", err) } - log.Infow("configuring relay for requestCrawl", "host", nextCrawlerURLs[i]) + log.Info("configuring relay for requestCrawl", "host", nextCrawlerURLs[i]) } } @@ -76,11 +77,13 @@ func NewSplitter(conf SplitterConfig, nextCrawlers []string) (*Splitter, error) consumers: make(map[uint64]*SocketConsumer), httpC: util.RobustHTTPClient(), nextCrawlers: nextCrawlerURLs, + log: log, } if conf.PebbleOptions == nil { // mem splitter erb := NewEventRingBuffer(20_000, 10_000) + s.erb = erb s.events = events.NewEventManager(erb) } else { pp, err := events.NewPebblePersistance(conf.PebbleOptions) @@ -88,6 +91,7 @@ func NewSplitter(conf SplitterConfig, nextCrawlers []string) (*Splitter, error) return nil, err } go pp.GCThread(context.Background()) + s.pp = pp s.events = events.NewEventManager(pp) } @@ -117,6 +121,7 @@ func NewDiskSplitter(host, path string, persistHours float64, maxBytes int64) (* pp: pp, events: em, consumers: make(map[uint64]*SocketConsumer), + log: slog.Default().With("system", "splitter"), }, nil } @@ -175,7 +180,7 @@ func (s *Splitter) StartWithListener(listen net.Listener) error { if err2 := ctx.JSON(err.Code, map[string]any{ "error": err.Message, }); err2 != nil { - log.Errorf("Failed to write http error: %s", err2) + s.log.Error("Failed to write http error", "err", err2) } default: sendHeader := true @@ -183,7 +188,7 @@ func (s *Splitter) StartWithListener(listen net.Listener) error { sendHeader = false } - log.Warnf("HANDLER ERROR: (%s) %s", ctx.Path(), err) + s.log.Warn("HANDLER ERROR", "path", ctx.Path(), "err", err) if strings.HasPrefix(ctx.Path(), "/admin/") { ctx.JSON(500, map[string]any{ @@ -202,7 +207,10 @@ func (s *Splitter) StartWithListener(listen net.Listener) error { e.POST("/xrpc/com.atproto.sync.requestCrawl", s.RequestCrawlHandler) e.GET("/xrpc/com.atproto.sync.subscribeRepos", s.EventsHandler) + e.GET("/xrpc/_health", s.HandleHealthCheck) + e.GET("/_health", s.HandleHealthCheck) + e.GET("/", s.HandleHomeMessage) // In order to support booting on random ports in tests, we need to tell the // Echo instance it's already got a port, and then use its StartServer @@ -278,7 +286,7 @@ func (s 
*Splitter) RequestCrawlHandler(c echo.Context) error { if len(s.nextCrawlers) != 0 { blob, err := json.Marshal(body) if err != nil { - log.Warnw("could not forward requestCrawl, json err", "err", err) + s.log.Warn("could not forward requestCrawl, json err", "err", err) } else { go func(bodyBlob []byte) { for _, remote := range s.nextCrawlers { @@ -292,11 +300,11 @@ func (s *Splitter) RequestCrawlHandler(c echo.Context) error { response.Body.Close() } if err != nil || response == nil { - log.Warnw("requestCrawl forward failed", "host", remote, "err", err) + s.log.Warn("requestCrawl forward failed", "host", remote, "err", err) } else if response.StatusCode != http.StatusOK { - log.Warnw("requestCrawl forward failed", "host", remote, "status", response.Status) + s.log.Warn("requestCrawl forward failed", "host", remote, "status", response.Status) } else { - log.Infow("requestCrawl forward successful", "host", remote) + s.log.Info("requestCrawl forward successful", "host", remote) } } }(blob) @@ -306,6 +314,21 @@ func (s *Splitter) RequestCrawlHandler(c echo.Context) error { return c.JSON(200, HealthStatus{Status: "ok"}) } +var homeMessage string = ` + _ _ + _ _ __ _(_)_ _ | |__ _____ __ __ +| '_/ _' | | ' \| '_ \/ _ \ V V / +|_| \__,_|_|_||_|_.__/\___/\_/\_/ + +This is an atproto [https://atproto.com] firehose fanout service, running the 'rainbow' codebase [https://github.com/bluesky-social/indigo] + +The firehose WebSocket path is at: /xrpc/com.atproto.sync.subscribeRepos +` + +func (s *Splitter) HandleHomeMessage(c echo.Context) error { + return c.String(http.StatusOK, homeMessage) +} + func (s *Splitter) EventsHandler(c echo.Context) error { var since *int64 if sinceVal := c.QueryParam("cursor"); sinceVal != "" { @@ -347,7 +370,7 @@ func (s *Splitter) EventsHandler(c echo.Context) error { } if err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(5*time.Second)); err != nil { - log.Errorf("failed to ping client: %s", err) + s.log.Error("failed to ping client", "err", err) cancel() return } @@ -372,7 +395,7 @@ func (s *Splitter) EventsHandler(c echo.Context) error { for { _, _, err := conn.ReadMessage() if err != nil { - log.Errorf("failed to read message from client: %s", err) + s.log.Error("failed to read message from client", "err", err) cancel() return } @@ -399,7 +422,7 @@ func (s *Splitter) EventsHandler(c echo.Context) error { consumerID := s.registerConsumer(&consumer) defer s.cleanupConsumer(consumerID) - log.Infow("new consumer", + s.log.Info("new consumer", "remote_addr", consumer.RemoteAddr, "user_agent", consumer.UserAgent, "cursor", since, @@ -412,13 +435,13 @@ func (s *Splitter) EventsHandler(c echo.Context) error { select { case evt, ok := <-evts: if !ok { - log.Error("event stream closed unexpectedly") + s.log.Error("event stream closed unexpectedly") return nil } wc, err := conn.NextWriter(websocket.BinaryMessage) if err != nil { - log.Errorf("failed to get next writer: %s", err) + s.log.Error("failed to get next writer", "err", err) return err } @@ -432,7 +455,7 @@ func (s *Splitter) EventsHandler(c echo.Context) error { } if err := wc.Close(); err != nil { - log.Warnf("failed to flush-close our event write: %s", err) + s.log.Warn("failed to flush-close our event write", "err", err) return nil } @@ -473,10 +496,10 @@ func (s *Splitter) cleanupConsumer(id uint64) { var m = &dto.Metric{} if err := c.EventsSent.Write(m); err != nil { - log.Errorf("failed to get sent counter: %s", err) + s.log.Error("failed to get sent counter", "err", err) } - 
log.Infow("consumer disconnected", + s.log.Info("consumer disconnected", "consumer_id", id, "remote_addr", c.RemoteAddr, "user_agent", c.UserAgent, @@ -522,17 +545,17 @@ func (s *Splitter) subscribeWithRedialer(ctx context.Context, host string, curso } con, res, err := d.DialContext(ctx, url, header) if err != nil { - log.Warnw("dialing failed", "host", host, "err", err, "backoff", backoff) + s.log.Warn("dialing failed", "host", host, "err", err, "backoff", backoff) time.Sleep(sleepForBackoff(backoff)) backoff++ continue } - log.Info("event subscription response code: ", res.StatusCode) + s.log.Info("event subscription response", "code", res.StatusCode) if err := s.handleConnection(ctx, host, con, &cursor); err != nil { - log.Warnf("connection to %q failed: %s", host, err) + s.log.Warn("connection failed", "host", host, "err", err) } } } @@ -555,7 +578,7 @@ func (s *Splitter) handleConnection(ctx context.Context, host string, con *webso if seq%5000 == 0 { // TODO: don't need this after we move to getting seq from pebble if err := s.writeCursor(seq); err != nil { - log.Errorf("write cursor failed: %s", err) + s.log.Error("write cursor failed", "err", err) } } @@ -563,19 +586,19 @@ func (s *Splitter) handleConnection(ctx context.Context, host string, con *webso return nil }) - return events.HandleRepoStream(ctx, con, sched) + return events.HandleRepoStream(ctx, con, sched, nil) } func (s *Splitter) getLastCursor() (int64, error) { if s.pp != nil { seq, millis, _, err := s.pp.GetLast(context.Background()) if err == nil { - log.Debugw("got last cursor from pebble", "seq", seq, "millis", millis) + s.log.Debug("got last cursor from pebble", "seq", seq, "millis", millis) return seq, nil } else if errors.Is(err, events.ErrNoLast) { - log.Info("pebble no last") + s.log.Info("pebble no last") } else { - log.Errorw("pebble seq fail", "err", err) + s.log.Error("pebble seq fail", "err", err) } } diff --git a/testing/integ_test.go b/testing/integ_test.go index cef32d3a4..fd5533932 100644 --- a/testing/integ_test.go +++ b/testing/integ_test.go @@ -15,16 +15,21 @@ import ( "github.com/bluesky-social/indigo/repo" "github.com/bluesky-social/indigo/xrpc" "github.com/ipfs/go-cid" - "github.com/ipfs/go-log/v2" car "github.com/ipld/go-car" "github.com/stretchr/testify/assert" ) -func init() { - log.SetAllLoggers(log.LevelInfo) +func TestRelayBasic(t *testing.T) { + t.Helper() + testRelayBasic(t, true) } -func TestRelayBasic(t *testing.T) { +func TestRelayBasicNonArchive(t *testing.T) { + t.Helper() + testRelayBasic(t, false) +} + +func testRelayBasic(t *testing.T, archive bool) { if testing.Short() { t.Skip("skipping Relay test in 'short' test mode") } @@ -33,7 +38,7 @@ func TestRelayBasic(t *testing.T) { p1 := MustSetupPDS(t, ".tpds", didr) p1.Run(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, archive) b1.Run(t) b1.tr.TrialHosts = []string{p1.RawHost()} @@ -116,6 +121,16 @@ func socialSim(t *testing.T, users []*TestUser, postiter, likeiter int) []*atpro } func TestRelayMultiPDS(t *testing.T) { + t.Helper() + testRelayMultiPDS(t, true) +} + +func TestRelayMultiPDSNonArchive(t *testing.T) { + t.Helper() + testRelayMultiPDS(t, false) +} + +func testRelayMultiPDS(t *testing.T, archive bool) { if testing.Short() { t.Skip("skipping Relay test in 'short' test mode") } @@ -130,7 +145,7 @@ func TestRelayMultiPDS(t *testing.T) { p2 := MustSetupPDS(t, ".pdsdos", didr) p2.Run(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, archive) b1.Run(t) b1.tr.TrialHosts = []string{p1.RawHost(), 
p2.RawHost()} @@ -198,7 +213,7 @@ func TestRelayMultiGap(t *testing.T) { p2 := MustSetupPDS(t, ".pdsdos", didr) p2.Run(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, true) b1.Run(t) b1.tr.TrialHosts = []string{p1.RawHost(), p2.RawHost()} @@ -256,7 +271,7 @@ func TestHandleChange(t *testing.T) { p1 := MustSetupPDS(t, ".pdsuno", didr) p1.Run(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, true) b1.Run(t) b1.tr.TrialHosts = []string{p1.RawHost()} @@ -293,7 +308,7 @@ func TestAccountEvent(t *testing.T) { p1 := MustSetupPDS(t, ".pdsuno", didr) p1.Run(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, true) b1.Run(t) b1.tr.TrialHosts = []string{p1.RawHost()} @@ -391,6 +406,14 @@ func TestAccountEvent(t *testing.T) { } func TestRelayTakedown(t *testing.T) { + testRelayTakedown(t, true) +} + +func TestRelayTakedownNonArchive(t *testing.T) { + testRelayTakedown(t, false) +} + +func testRelayTakedown(t *testing.T, archive bool) { if testing.Short() { t.Skip("skipping Relay test in 'short' test mode") } @@ -401,7 +424,7 @@ func TestRelayTakedown(t *testing.T) { p1 := MustSetupPDS(t, ".tpds", didr) p1.Run(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, true) b1.Run(t) b1.tr.TrialHosts = []string{p1.RawHost()} @@ -480,7 +503,7 @@ func TestDomainBans(t *testing.T) { } didr := TestPLC(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, true) b1.Run(t) b1.BanDomain(t, "foo.com") @@ -523,7 +546,7 @@ func TestRelayHandleEmptyEvent(t *testing.T) { p1 := MustSetupPDS(t, ".tpds", didr) p1.Run(t) - b1 := MustSetupRelay(t, didr) + b1 := MustSetupRelay(t, didr, true) b1.Run(t) b1.tr.TrialHosts = []string{p1.RawHost()} @@ -541,6 +564,7 @@ func TestRelayHandleEmptyEvent(t *testing.T) { e1 := evts.Next() assert.NotNil(e1.RepoCommit) assert.Equal(e1.RepoCommit.Repo, bob.DID()) + fmt.Println(e1.RepoCommit.Ops[0]) ctx := context.TODO() rm := p1.server.Repoman() @@ -549,6 +573,7 @@ func TestRelayHandleEmptyEvent(t *testing.T) { } e2 := evts.Next() + //fmt.Println(e2.RepoCommit.Ops[0]) assert.Equal(len(e2.RepoCommit.Ops), 0) assert.Equal(e2.RepoCommit.Repo, bob.DID()) } diff --git a/testing/utils.go b/testing/utils.go index 9b076ef17..d2847d026 100644 --- a/testing/utils.go +++ b/testing/utils.go @@ -117,7 +117,7 @@ func SetupPDS(ctx context.Context, suffix string, plc plc.PLCClient) (*TestPDS, return nil, err } - cs, err := carstore.NewCarStore(cardb, cspath) + cs, err := carstore.NewCarStore(cardb, []string{cspath}) if err != nil { return nil, err } @@ -518,10 +518,10 @@ func (t *TestRelay) Host() string { return t.listener.Addr().String() } -func MustSetupRelay(t *testing.T, didr plc.PLCClient) *TestRelay { +func MustSetupRelay(t *testing.T, didr plc.PLCClient, archive bool) *TestRelay { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - tbgs, err := SetupRelay(ctx, didr) + tbgs, err := SetupRelay(ctx, didr, archive) if err != nil { t.Fatal(err) } @@ -529,7 +529,7 @@ func MustSetupRelay(t *testing.T, didr plc.PLCClient) *TestRelay { return tbgs } -func SetupRelay(ctx context.Context, didr plc.PLCClient) (*TestRelay, error) { +func SetupRelay(ctx context.Context, didr plc.PLCClient, archive bool) (*TestRelay, error) { dir, err := os.MkdirTemp("", "integtest") if err != nil { return nil, err @@ -550,9 +550,19 @@ func SetupRelay(ctx context.Context, didr plc.PLCClient) (*TestRelay, error) { return nil, err } - cs, err := carstore.NewCarStore(cardb, cspath) - if err != nil { - return nil, err + 
var cs carstore.CarStore
+	if archive {
+		arccs, err := carstore.NewCarStore(cardb, []string{cspath})
+		if err != nil {
+			return nil, err
+		}
+		cs = arccs
+	} else {
+		nacs, err := carstore.NewNonArchivalCarstore(cardb)
+		if err != nil {
+			return nil, err
+		}
+		cs = nacs
 	}
 
 	//kmgr := indexer.NewKeyManager(didr, nil)
@@ -691,7 +701,7 @@ func (b *TestRelay) Events(t *testing.T, since int64) *EventStream {
 		},
 	}
 	seqScheduler := sequential.NewScheduler("test", rsc.EventHandler)
-	if err := events.HandleRepoStream(ctx, con, seqScheduler); err != nil {
+	if err := events.HandleRepoStream(ctx, con, seqScheduler, nil); err != nil {
 		fmt.Println(err)
 	}
 }()
diff --git a/util/cliutil/ipfslog.go b/util/cliutil/ipfslog.go
new file mode 100644
index 000000000..4f3a0f1c9
--- /dev/null
+++ b/util/cliutil/ipfslog.go
@@ -0,0 +1,35 @@
+package cliutil
+
+import (
+	"io"
+
+	ipfslog "github.com/ipfs/go-log/v2"
+	"go.uber.org/zap/zapcore"
+)
+
+func SetIpfsWriter(out io.Writer, format string, level string) {
+	var ze zapcore.Encoder
+	switch format {
+	case "json":
+		ze = zapcore.NewJSONEncoder(zapcore.EncoderConfig{})
+	case "text":
+		ze = zapcore.NewConsoleEncoder(zapcore.EncoderConfig{})
+	default:
+		ze = zapcore.NewConsoleEncoder(zapcore.EncoderConfig{})
+	}
+	var zl zapcore.LevelEnabler
+	switch level {
+	case "debug":
+		zl = zapcore.DebugLevel
+	case "info":
+		zl = zapcore.InfoLevel
+	case "warn":
+		zl = zapcore.WarnLevel
+	case "error":
+		zl = zapcore.ErrorLevel
+	default:
+		zl = zapcore.InfoLevel
+	}
+	nc := zapcore.NewCore(ze, zapcore.AddSync(out), zl)
+	ipfslog.SetPrimaryCore(nc)
+}
diff --git a/util/cliutil/util.go b/util/cliutil/util.go
index 758643eff..8991945ca 100644
--- a/util/cliutil/util.go
+++ b/util/cliutil/util.go
@@ -2,10 +2,17 @@ package cliutil
 
 import (
 	"encoding/json"
+	"errors"
 	"fmt"
+	"io"
+	"io/fs"
+	"log/slog"
 	"net/http"
 	"os"
 	"path/filepath"
+	"regexp"
+	"sort"
+	"strconv"
 	"strings"
 	"time"
 
@@ -230,3 +237,345 @@ func SetupDatabase(dburl string, maxConnections int) (*gorm.DB, error) {
 
 	return db, nil
 }
+
+type LogOptions struct {
+	// e.g. 1_000_000_000
+	LogRotateBytes int64
+
+	// path to write to, if rotating, %T gets UnixMilli at file open time
+	// NOTE: substitution is simple replace("%T", "")
+	LogPath string
+
+	// text|json
+	LogFormat string
+
+	// info|debug|warn|error
+	LogLevel string
+
+	// Keep N old logs (not including current); <0 disables removal, 0==remove all old log files immediately
+	KeepOld int
+}
+
+func firstenv(env_var_names ...string) string {
+	for _, env_var_name := range env_var_names {
+		val := os.Getenv(env_var_name)
+		if val != "" {
+			return val
+		}
+	}
+	return ""
+}
+
+// SetupSlog integrates passed in options and env vars.
+//
+// passing default cliutil.LogOptions{} is ok.
+//
+// BSKYLOG_LOG_LEVEL=info|debug|warn|error
+//
+// BSKYLOG_LOG_FMT=text|json
+//
+// BSKYLOG_FILE=path (or "-" or "" for stdout), %T gets UnixMilli; if a path with '/', {prefix}/current becomes a link to active log file
+//
+// BSKYLOG_ROTATE_BYTES=int maximum size of log chunk before rotating
+//
+// BSKYLOG_ROTATE_KEEP=int keep N old logs (not including current)
+//
+// The env vars were derived from ipfs logging library, and also respond to some GOLOG_ vars from that library,
+// but BSKYLOG_ variables are preferred because imported code still using the ipfs log library may misbehave
+// if some GOLOG values are set, especially GOLOG_FILE.
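The doc comment above describes an environment-driven configuration path alongside the explicit LogOptions path. The following is a minimal, illustrative sketch of the env-driven path; the variable names and the SetupSlog/LogOptions API come from this patch, while the specific values and the tiny program around them are invented for the example.

	package main

	import (
		"os"

		"github.com/bluesky-social/indigo/util/cliutil"
	)

	func main() {
		// Illustrative values; any LogOptions field left at its zero value
		// falls back to the corresponding BSKYLOG_*/GOLOG_* variable.
		os.Setenv("BSKYLOG_LOG_LEVEL", "debug")
		os.Setenv("BSKYLOG_LOG_FMT", "text")
		os.Setenv("BSKYLOG_FILE", "logs/bigsky_%T.log") // %T becomes UnixMilli at file-open time
		os.Setenv("BSKYLOG_ROTATE_BYTES", "250000000")
		os.Setenv("BSKYLOG_ROTATE_KEEP", "4")

		// Zero-value options: every setting above is read from the environment.
		logger, err := cliutil.SetupSlog(cliutil.LogOptions{})
		if err != nil {
			panic(err)
		}
		logger.Debug("debug logging enabled via BSKYLOG_LOG_LEVEL")
	}
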
+func SetupSlog(options LogOptions) (*slog.Logger, error) { + fmt.Fprintf(os.Stderr, "SetupSlog\n") + var hopts slog.HandlerOptions + hopts.Level = slog.LevelInfo + hopts.AddSource = true + if options.LogLevel == "" { + options.LogLevel = firstenv("BSKYLOG_LOG_LEVEL", "GOLOG_LOG_LEVEL") + } + if options.LogLevel == "" { + hopts.Level = slog.LevelInfo + options.LogLevel = "info" + } else { + level := strings.ToLower(options.LogLevel) + switch level { + case "debug": + hopts.Level = slog.LevelDebug + case "info": + hopts.Level = slog.LevelInfo + case "warn": + hopts.Level = slog.LevelWarn + case "error": + hopts.Level = slog.LevelError + default: + return nil, fmt.Errorf("unknown log level: %#v", options.LogLevel) + } + } + if options.LogFormat == "" { + options.LogFormat = firstenv("BSKYLOG_LOG_FMT", "GOLOG_LOG_FMT") + } + if options.LogFormat == "" { + options.LogFormat = "text" + } else { + format := strings.ToLower(options.LogFormat) + if format == "json" || format == "text" { + // ok + } else { + return nil, fmt.Errorf("invalid log format: %#v", options.LogFormat) + } + options.LogFormat = format + } + + if options.LogPath == "" { + options.LogPath = firstenv("BSKYLOG_FILE", "GOLOG_FILE") + } + if options.LogRotateBytes == 0 { + rotateBytesStr := os.Getenv("BSKYLOG_ROTATE_BYTES") // no GOLOG equivalent + if rotateBytesStr != "" { + rotateBytes, err := strconv.ParseInt(rotateBytesStr, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid BSKYLOG_ROTATE_BYTES value: %w", err) + } + options.LogRotateBytes = rotateBytes + } + } + if options.KeepOld == 0 { + keepOldUnset := true + keepOldStr := os.Getenv("BSKYLOG_ROTATE_KEEP") // no GOLOG equivalent + if keepOldStr != "" { + keepOld, err := strconv.ParseInt(keepOldStr, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid BSKYLOG_ROTATE_KEEP value: %w", err) + } + keepOldUnset = false + options.KeepOld = int(keepOld) + } + if keepOldUnset { + options.KeepOld = 2 + } + } + logaround := make(chan string, 100) + go logbouncer(logaround) + var out io.Writer + if (options.LogPath == "") || (options.LogPath == "-") { + out = os.Stdout + } else if options.LogRotateBytes != 0 { + out = &logRotateWriter{ + rotateBytes: options.LogRotateBytes, + outPathTemplate: options.LogPath, + keep: options.KeepOld, + logaround: logaround, + } + } else { + var err error + out, err = os.Create(options.LogPath) + if err != nil { + return nil, fmt.Errorf("%s: %w", options.LogPath, err) + } + fmt.Fprintf(os.Stderr, "SetupSlog create %#v\n", options.LogPath) + } + var handler slog.Handler + switch options.LogFormat { + case "text": + handler = slog.NewTextHandler(out, &hopts) + case "json": + handler = slog.NewJSONHandler(out, &hopts) + default: + return nil, fmt.Errorf("unknown log format: %#v", options.LogFormat) + } + logger := slog.New(handler) + slog.SetDefault(logger) + templateDirPart, _ := filepath.Split(options.LogPath) + ents, _ := os.ReadDir(templateDirPart) + for _, ent := range ents { + fmt.Fprintf(os.Stdout, "%s\n", filepath.Join(templateDirPart, ent.Name())) + } + SetIpfsWriter(out, options.LogFormat, options.LogLevel) + return logger, nil +} + +type logRotateWriter struct { + currentWriter io.WriteCloser + + // how much has been written to current log file + currentBytes int64 + + // e.g. path/to/logs/foo%T + currentPath string + + // e.g. 
path/to/logs/current + currentPathCurrent string + + rotateBytes int64 + + outPathTemplate string + + // keep the most recent N log files (not including current) + keep int + + // write strings to this from inside the log system, a task outside the log system hands them to slog.Info() + logaround chan<- string +} + +func logbouncer(out <-chan string) { + var logger *slog.Logger + for line := range out { + fmt.Fprintf(os.Stderr, "ll %s\n", line) + if logger == nil { + // lazy to make sure it crops up after slog Default has been set + logger = slog.Default().With("system", "logging") + } + logger.Info(line) + } +} + +var currentMatcher = regexp.MustCompile("current_\\d+") + +func (w *logRotateWriter) cleanOldLogs() { + if w.keep < 0 { + // old log removal is disabled + return + } + // w.currentPath was recently set as the new log + dirpart, _ := filepath.Split(w.currentPath) + // find old logs + templateDirPart, templateNamePart := filepath.Split(w.outPathTemplate) + if dirpart != templateDirPart { + w.logaround <- fmt.Sprintf("current dir part %#v != template dir part %#v\n", w.currentPath, w.outPathTemplate) + return + } + // build a regexp that is string literal parts with \d+ replacing the UnixMilli part + templateNameParts := strings.Split(templateNamePart, "%T") + var sb strings.Builder + first := true + for _, part := range templateNameParts { + if first { + first = false + } else { + sb.WriteString("\\d+") + } + sb.WriteString(regexp.QuoteMeta(part)) + } + tmre, err := regexp.Compile(sb.String()) + if err != nil { + w.logaround <- fmt.Sprintf("failed to compile old log template regexp: %#v\n", err) + return + } + dir, err := os.ReadDir(dirpart) + if err != nil { + w.logaround <- fmt.Sprintf("failed to read old log template dir: %#v\n", err) + return + } + var found []fs.FileInfo + for _, ent := range dir { + name := ent.Name() + if tmre.MatchString(name) || currentMatcher.MatchString(name) { + fi, err := ent.Info() + if err != nil { + continue + } + found = append(found, fi) + } + } + if len(found) <= w.keep { + // not too many, nothing to do + return + } + foundMtimeLess := func(i, j int) bool { + return found[i].ModTime().Before(found[j].ModTime()) + } + sort.Slice(found, foundMtimeLess) + drops := found[:len(found)-w.keep] + for _, fi := range drops { + fullpath := filepath.Join(dirpart, fi.Name()) + err = os.Remove(fullpath) + if err != nil { + w.logaround <- fmt.Sprintf("failed to rm old log: %#v\n", err) + // but keep going + } + // maybe it would be safe to debug-log old log removal from within the logging infrastructure? 
+ } +} + +func (w *logRotateWriter) closeOldLog() []error { + if w.currentWriter == nil { + return nil + } + var earlyWeakErrors []error + err := w.currentWriter.Close() + if err != nil { + earlyWeakErrors = append(earlyWeakErrors, err) + } + w.currentWriter = nil + w.currentBytes = 0 + w.currentPath = "" + if w.currentPathCurrent != "" { + err = os.Remove(w.currentPathCurrent) // not really an error until something else goes wrong + if err != nil { + earlyWeakErrors = append(earlyWeakErrors, err) + } + w.currentPathCurrent = "" + } + return earlyWeakErrors +} + +func (w *logRotateWriter) openNewLog(earlyWeakErrors []error) (badErr error, weakErrors []error) { + nowMillis := time.Now().UnixMilli() + nows := strconv.FormatInt(nowMillis, 10) + w.currentPath = strings.Replace(w.outPathTemplate, "%T", nows, -1) + var err error + w.currentWriter, err = os.Create(w.currentPath) + if err != nil { + earlyWeakErrors = append(earlyWeakErrors, err) + return errors.Join(earlyWeakErrors...), nil + } + w.logaround <- fmt.Sprintf("new log file %#v", w.currentPath) + w.cleanOldLogs() + dirpart, _ := filepath.Split(w.currentPath) + if dirpart != "" { + w.currentPathCurrent = filepath.Join(dirpart, "current") + fi, err := os.Stat(w.currentPathCurrent) + if err == nil && fi.Mode().IsRegular() { + // move aside unknown "current" from a previous run + // see also currentMatcher regexp current_\d+ + err = os.Rename(w.currentPathCurrent, w.currentPathCurrent+"_"+nows) + if err != nil { + // not crucial if we can't move aside "current" + // TODO: log warning ... but not from inside log writer? + earlyWeakErrors = append(earlyWeakErrors, err) + } + } + err = os.Link(w.currentPath, w.currentPathCurrent) + if err != nil { + // not crucial if we can't make "current" link + // TODO: log warning ... but not from inside log writer? + earlyWeakErrors = append(earlyWeakErrors, err) + } + } + return nil, earlyWeakErrors +} + +func (w *logRotateWriter) Write(p []byte) (n int, err error) { + var earlyWeakErrors []error + if int64(len(p))+w.currentBytes > w.rotateBytes { + // next write would be over the limit + earlyWeakErrors = w.closeOldLog() + } + if w.currentWriter == nil { + // start new log file + var err error + err, earlyWeakErrors = w.openNewLog(earlyWeakErrors) + if err != nil { + return 0, err + } + } + var wrote int + wrote, err = w.currentWriter.Write(p) + w.currentBytes += int64(wrote) + if err != nil { + earlyWeakErrors = append(earlyWeakErrors, err) + return wrote, errors.Join(earlyWeakErrors...) + } + if earlyWeakErrors != nil { + w.logaround <- fmt.Sprintf("ok, but: %s", errors.Join(earlyWeakErrors...).Error()) + } + return wrote, nil +}
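
Taken together, the logging pieces added in util/cliutil can be wired up from a service's main function. The sketch below uses the SetupSlog/LogOptions API from this patch with explicit options; the path, sizes, and messages are illustrative assumptions, not values from the patch (and the logs/ directory is assumed to already exist).

	package main

	import (
		"log/slog"

		"github.com/bluesky-social/indigo/util/cliutil"
	)

	func main() {
		logger, err := cliutil.SetupSlog(cliutil.LogOptions{
			LogPath:        "logs/relay_%T.log", // %T is replaced with UnixMilli when each file is opened
			LogRotateBytes: 1_000_000_000,       // start a new file after ~1GB
			LogFormat:      "json",
			LogLevel:       "info",
			KeepOld:        2, // keep two rotated files in addition to the "current" link
		})
		if err != nil {
			panic(err)
		}
		logger.Info("logging configured")

		// SetupSlog also installs this handler as the slog default and points
		// ipfs/go-log output at the same writer via SetIpfsWriter, so both
		// slog-based and go-log-based packages share the rotating file.
		slog.Debug("dropped at the 'info' level")
	}
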