Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

automod: identical reply rule #466

Merged
merged 8 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions automod/countstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const (
type CountStore interface {
GetCount(ctx context.Context, name, val, period string) (int, error)
Increment(ctx context.Context, name, val string) error
IncrementPeriod(ctx context.Context, name, val, period string) error
// TODO: batch increment method
GetCountDistinct(ctx context.Context, name, bucket, period string) (int, error)
IncrementDistinct(ctx context.Context, name, bucket, val string) error
Expand Down Expand Up @@ -60,14 +61,19 @@ func (s MemCountStore) GetCount(ctx context.Context, name, val, period string) (

func (s MemCountStore) Increment(ctx context.Context, name, val string) error {
for _, p := range []string{PeriodTotal, PeriodDay, PeriodHour} {
k := PeriodBucket(name, val, p)
v, ok := s.Counts[k]
if !ok {
v = 0
}
v = v + 1
s.Counts[k] = v
s.IncrementPeriod(ctx, name, val, p)
}
return nil
}

func (s MemCountStore) IncrementPeriod(ctx context.Context, name, val, period string) error {
k := PeriodBucket(name, val, period)
v, ok := s.Counts[k]
if !ok {
v = 0
}
v = v + 1
s.Counts[k] = v
return nil
}

Expand Down
22 changes: 17 additions & 5 deletions automod/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ type ModReport struct {
}

type CounterRef struct {
Name string
Val string
Name string
Val string
Period *string
}

type CounterDistinctRef struct {
Expand Down Expand Up @@ -55,6 +56,10 @@ func (e *RepoEvent) Increment(name, val string) {
e.CounterIncrements = append(e.CounterIncrements, CounterRef{Name: name, Val: val})
}

func (e *RepoEvent) IncrementPeriod(name, val string, period string) {
e.CounterIncrements = append(e.CounterIncrements, CounterRef{Name: name, Val: val, Period: &period})
}

func (e *RepoEvent) GetCountDistinct(name, bucket, period string) int {
v, err := e.Engine.GetCountDistinct(name, bucket, period)
if err != nil {
Expand Down Expand Up @@ -247,9 +252,16 @@ func (e *RepoEvent) PersistActions(ctx context.Context) error {
func (e *RepoEvent) PersistCounters(ctx context.Context) error {
// TODO: dedupe this array
for _, ref := range e.CounterIncrements {
err := e.Engine.Counters.Increment(ctx, ref.Name, ref.Val)
if err != nil {
return err
if ref.Period != nil {
err := e.Engine.Counters.IncrementPeriod(ctx, ref.Name, ref.Val, *ref.Period)
if err != nil {
return err
}
} else {
err := e.Engine.Counters.Increment(ctx, ref.Name, ref.Val)
if err != nil {
return err
}
}
}
for _, ref := range e.CounterDistinctIncrements {
Expand Down
20 changes: 20 additions & 0 deletions automod/redis_counters.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,26 @@ func (s *RedisCountStore) Increment(ctx context.Context, name, val string) error
return err
}

// Variant of Increment() which only acts on a single specified time period. The intended us of this variant is to control the total number of counters persisted, by using a relatively short time period, for which the counters will expire.
func (s *RedisCountStore) IncrementPeriod(ctx context.Context, name, val, period string) error {

// multiple ops in a single redis round-trip
multi := s.Client.Pipeline()

key := redisCountPrefix + PeriodBucket(name, val, period)
multi.Incr(ctx, key)

switch period {
case PeriodHour:
multi.Expire(ctx, key, 2*time.Hour)
case PeriodDay:
multi.Expire(ctx, key, 48*time.Hour)
}

_, err := multi.Exec(ctx)
return err
}

func (s *RedisCountStore) GetCountDistinct(ctx context.Context, name, val, period string) (int, error) {
key := redisDistinctPrefix + PeriodBucket(name, val, period)
c, err := s.Client.PFCount(ctx, key).Result()
Expand Down
1 change: 1 addition & 0 deletions automod/rules/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ func DefaultRules() automod.RuleSet {
KeywordPostRule,
ReplySingleKeywordPostRule,
AggressivePromotionRule,
IdenticalReplyPostRule,
},
ProfileRules: []automod.ProfileRuleFunc{
GtubeProfileRule,
Expand Down
10 changes: 10 additions & 0 deletions automod/rules/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
appbsky "github.com/bluesky-social/indigo/api/bsky"
"github.com/bluesky-social/indigo/atproto/syntax"
"github.com/bluesky-social/indigo/automod"

"github.com/spaolacci/murmur3"
)

func dedupeStrings(in []string) []string {
Expand Down Expand Up @@ -177,3 +179,11 @@ func IsSelfThread(evt *automod.RecordEvent, post *appbsky.FeedPost) bool {
}
return false
}

// returns a fast, compact hash of a string
//
// current implementation uses murmur3, default seed, and hex encoding
func HashOfString(s string) string {
val := murmur3.Sum64([]byte(s))
return fmt.Sprintf("%016x", val)
}
7 changes: 7 additions & 0 deletions automod/rules/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ func TestExtractURL(t *testing.T) {
assert.Equal(fix.out, ExtractTextURLs(fix.s))
}
}

func TestHashOfString(t *testing.T) {
assert := assert.New(t)

// hashing function should be consistent over time
assert.Equal("4e6f69c0e3d10992", HashOfString("dummy-value"))
}
21 changes: 21 additions & 0 deletions automod/rules/replies.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,24 @@ func ReplyCountPostRule(evt *automod.RecordEvent, post *appbsky.FeedPost) error
evt.IncrementDistinct("reply-to", did, parentURI.Authority().String())
return nil
}

var identicalReplyLimit = 5

// Looks for accounts posting the exact same text multiple times. Does not currently count the number of distinct accounts replied to, just counts replies at all.
//
// There can be legitimate situations that trigger this rule, so in most situations should be a "report" not "label" action.
func IdenticalReplyPostRule(evt *automod.RecordEvent, post *appbsky.FeedPost) error {
if post.Reply == nil || IsSelfThread(evt, post) {
return nil
}

// use a specific period (IncrementPeriod()) to reduce the number of counters (one per unique post text)
period := automod.PeriodDay
bucket := evt.Account.Identity.DID.String() + "/" + HashOfString(post.Text)
if evt.GetCount("reply-text", bucket, period) >= identicalReplyLimit {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now, this is proceeding to act immediately on a hash collision. Do you think it would be reasonable to do a more expensive check to see if things are actually identical?

I don't know how bad a false positive is here. Maybe if the threshhold for identical in sheer count is moderate, it's unlikely to trigger on reasonable real human behavior?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the naive false positive rate (for the 64bit variant of murmur3) is low enough to not worry about it and not do secondary network requests to check for exact matches.

This isn't a cryptographic hash, so attacks could be a concern for some rules.

Generally, I feel like all the counters we are using here should be treated as a bit fuzzy, at least for record-level counts. It is totally possible for events to get partially-processed (and partially persisted) and then re-processed again after a crash. I think the semantics and kinds of rules and actions we write are generally resilient to this: doing things like reporting for human review, or having large margins before taking fully-automated action.

evt.AddAccountFlag("multi-identical-reply")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not new to this PR, but I think some docs on the semantic distinctions between AccountLabels, AccountFlags, and AccountReports are needed somewhere (and then perhaps most methods touching them should have a quick pointer to that doc). The last one of the group is close to self-explanatory, but the first two are both just []string in the code and don't autounpack their meaning very readily.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mmm! added doc comments to all the fields for most of the RepoEvent variants

}

evt.IncrementPeriod("reply-text", bucket, period)
return nil
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ require (
github.com/samber/lo v1.38.1 // indirect
github.com/scylladb/go-reflectx v1.0.1 // indirect
github.com/segmentio/asm v1.2.0 // indirect
github.com/spaolacci/murmur3 v1.1.0 // indirect
github.com/spaolacci/murmur3 v1.1.0
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.2.2 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
Expand Down
Loading