Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

automod: rules engine framework #434

Merged
merged 35 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d98fd5d
automod: start sketching package API
bnewbold Nov 11, 2023
86161b4
hepa: initial skelton of daemon
bnewbold Nov 11, 2023
5541d2c
automod: building
bnewbold Nov 13, 2023
eb7f8b8
automod: significantly simplify firehose consumption, for now
bnewbold Nov 14, 2023
e4744a3
automod: basic set storage API
bnewbold Nov 14, 2023
85d4cd2
automod: rules sub-package, a bit of tests
bnewbold Nov 14, 2023
0eb79b7
automod: misleading URL and mention rules
bnewbold Nov 14, 2023
f9c1fe6
automod: helper to load sets from JSON file
bnewbold Nov 14, 2023
f9af94d
automod: persist mod actions
bnewbold Nov 14, 2023
1d120a5
automod: more persisting and logging
bnewbold Nov 14, 2023
79df1f0
hepa: wire up mod config
bnewbold Nov 14, 2023
b53bcd4
automod: fix trivial err
bnewbold Nov 14, 2023
fbec7f6
hepa: load set config from JSON file
bnewbold Nov 15, 2023
01ae2e9
automod: redis identity cache; early parallelism; fix rules
bnewbold Nov 15, 2023
b195b04
automod: fix label creation/action
bnewbold Nov 15, 2023
1045d05
automod: redis counters; some refactors
bnewbold Nov 16, 2023
d46ed60
rules: refactor with some helpers
bnewbold Nov 16, 2023
ef07c1e
automod: add generic caching, and hydrate some account meta
bnewbold Nov 16, 2023
6b1c597
automod: fix tests (no XRPC in testing)
bnewbold Nov 17, 2023
c0a1092
HACK: remove did doc from createSession to un-break it (temporarily)
bnewbold Nov 17, 2023
f67fb98
automod: private admin state hydration
bnewbold Nov 17, 2023
37042b1
hepa: persist cursor state in redis
bnewbold Nov 17, 2023
21b494d
syntax: fix AT-URI Path() impl
bnewbold Nov 17, 2023
cc9ed96
automod: process individual pre-existing records (by AT-URI)
bnewbold Nov 17, 2023
c54a5e4
hepa: dockerfile and github build actions
bnewbold Nov 18, 2023
aef1908
makefile and HACKING entries for hepa
bnewbold Nov 18, 2023
a70cd1c
automod brief docs/context
bnewbold Nov 18, 2023
499d1fa
automod: update docs
bnewbold Nov 20, 2023
6aee771
automod: refactor event types
bnewbold Nov 20, 2023
4e599db
automod: refactor out RelayClient, connect to account's PDS directly
bnewbold Nov 20, 2023
a2b29a8
automod: don't actually label, just flag
bnewbold Nov 20, 2023
3539c95
automod: tweak misleading URL processing
bnewbold Nov 20, 2023
695ed76
automod: more URL tweaks
bnewbold Nov 20, 2023
ce3afdd
automod: fix nil error on IdentityEvent logging
bnewbold Nov 20, 2023
c4aefa7
automod: yet more URL cleaning
bnewbold Nov 20, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions .github/workflows/container-hepa-aws.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
name: container-hepa-aws
on: [push]
env:
REGISTRY: ${{ secrets.AWS_ECR_REGISTRY_USEAST2_PACKAGES_REGISTRY }}
USERNAME: ${{ secrets.AWS_ECR_REGISTRY_USEAST2_PACKAGES_USERNAME }}
PASSWORD: ${{ secrets.AWS_ECR_REGISTRY_USEAST2_PACKAGES_PASSWORD }}
# fixed image name within the ECR registry (not derived from github.repository)
IMAGE_NAME: hepa

jobs:
container-hepa-aws:
if: github.repository == 'bluesky-social/indigo'
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
id-token: write

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Setup Docker buildx
uses: docker/setup-buildx-action@v1

- name: Log into registry ${{ env.REGISTRY }}
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ env.USERNAME }}
password: ${{ env.PASSWORD }}

- name: Extract Docker metadata
id: meta
uses: docker/metadata-action@v4
with:
images: |
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=sha,enable=true,priority=100,prefix=,suffix=,format=long

- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: ./cmd/hepa/Dockerfile
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
54 changes: 54 additions & 0 deletions .github/workflows/container-hepa-ghcr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
name: container-hepa-ghcr
on:
push:
branches:
- main
- bnewbold/automod
env:
REGISTRY: ghcr.io
# github.repository as <account>/<repo>
IMAGE_NAME: ${{ github.repository }}

jobs:
container-hepa-ghcr:
if: github.repository == 'bluesky-social/indigo'
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
id-token: write

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Setup Docker buildx
uses: docker/setup-buildx-action@v1

- name: Log into registry ${{ env.REGISTRY }}
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract Docker metadata
id: meta
uses: docker/metadata-action@v4
with:
images: |
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=sha,enable=true,priority=100,prefix=hepa:,suffix=,format=long

- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: ./cmd/hepa/Dockerfile
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
2 changes: 2 additions & 0 deletions HACKING.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Run with, eg, `go run ./cmd/bigsky`):
- `cmd/fakermaker`: helper to generate fake accounts and content for testing
- `cmd/supercollider`: event stream load generation tool
- `cmd/sonar`: event stream monitoring tool
- `cmd/hepa`: auto-moderation rule engine service
- `gen`: dev tool to run CBOR type codegen

Packages:
Expand All @@ -23,6 +24,7 @@ Packages:
- `atproto/crypto`: cryptographic helpers (signing, key generation and serialization)
- `atproto/syntax`: string types and parsers for identifiers, datetimes, etc
- `atproto/identity`: DID and handle resolution
- `automod`: moderation and anti-spam rules engine
- `bgs`: server implementation for crawling, etc
- `carstore`: library for storing repo data in CAR files on disk, plus a metadata SQL db
- `events`: types, codegen CBOR helpers, and persistence for event feeds
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ build: ## Build all executables
go build ./cmd/stress
go build ./cmd/fakermaker
go build ./cmd/labelmaker
go build ./cmd/hepa
go build ./cmd/supercollider
go build -o ./sonar-cli ./cmd/sonar
go build ./cmd/palomar
Expand Down
15 changes: 7 additions & 8 deletions api/atproto/servercreateSession.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions atproto/syntax/aturi.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ func (n ATURI) Authority() AtIdentifier {
// Returns path segment, without leading slash, as would be used in an atproto repository key. Or empty string if there is no path.
func (n ATURI) Path() string {
parts := strings.SplitN(string(n), "/", 5)
if len(parts) < 3 {
if len(parts) < 4 {
// something has gone wrong (would not validate)
return ""
}
if len(parts) == 3 {
return parts[2]
if len(parts) == 4 {
return parts[3]
}
return parts[2] + "/" + parts[3]
return parts[3] + "/" + parts[4]
}

// Returns a valid NSID if there is one in the appropriate part of the path, otherwise empty.
Expand Down
27 changes: 26 additions & 1 deletion atproto/syntax/aturi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,20 @@ func TestInteropATURIsValid(t *testing.T) {
if len(line) == 0 || line[0] == '#' {
continue
}
_, err := ParseATURI(line)
aturi, err := ParseATURI(line)
if err != nil {
fmt.Println("FAILED, GOOD: " + line)
}
assert.NoError(err)

// check that Path() is working
col := aturi.Collection()
rkey := aturi.RecordKey()
if rkey != "" {
assert.Equal(col.String()+"/"+rkey.String(), aturi.Path())
} else if col != "" {
assert.Equal(col.String(), aturi.Path())
}
}
assert.NoError(scanner.Err())
}
Expand Down Expand Up @@ -67,7 +76,22 @@ func TestATURIParts(t *testing.T) {
rkey := uri.RecordKey()
assert.Equal(parts[3], rkey.String())
}
}

// TestATURIPath checks Path() against AT-URIs with a collection and record
// key, with a collection only, and with no path at all (empty string expected).
func TestATURIPath(t *testing.T) {
	assert := assert.New(t)

	cases := []struct {
		raw  string
		path string
	}{
		{raw: "at://did:abc:123/io.nsid.someFunc/record-key", path: "io.nsid.someFunc/record-key"},
		{raw: "at://did:abc:123/io.nsid.someFunc", path: "io.nsid.someFunc"},
		{raw: "at://did:abc:123", path: ""},
	}

	for _, tc := range cases {
		uri, err := ParseATURI(tc.raw)
		assert.NoError(err)
		assert.Equal(tc.path, uri.Path())
	}
}

func TestATURINormalize(t *testing.T) {
Expand All @@ -93,5 +117,6 @@ func TestATURINoPanic(t *testing.T) {
_ = bad.RecordKey()
_ = bad.Normalize()
_ = bad.String()
_ = bad.Path()
}
}
23 changes: 23 additions & 0 deletions automod/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
indigo/automod
==============

This package (`github.com/bluesky-social/indigo/automod`) contains a "rules engine" to augment human moderators in the atproto network. Batches of rules are processed for novel "events" such as a new post or update of an account handle. Counters and other statistics are collected, which can drive subsequent rule invocations. The outcome of rules can be moderation events like "report account for human review" or "label post". A lot of what this package does is collect and maintain caches of relevant metadata about accounts and pieces of content, so that rules have efficient access to this information.

A primary design goal is to have a flexible framework to allow new rules to be written and deployed rapidly in response to new patterns of spam and abuse.

Some example rules are included in the `automod/rules` package, but the expectation is that some real-world rules will be kept secret.

Code for subscribing to a firehose is not included here; see `cmd/hepa` for a complete service built on this library.


## Design

Prior art and inspiration:

* The [SQRL language](https://sqrl-lang.github.io/sqrl/) and runtime was originally developed by an industry vendor named Smyte, then acquired by Twitter, with some core Javascript components released open source in 2023. The SQRL documentation is extensive and describes many of the design trade-offs and features specific to rules engines. Bluesky considered adopting SQRL but decided to start with a simpler runtime with rules in a known language (golang).

* Reddit's [automod system](https://www.reddit.com/wiki/automoderator/) is simple and accessible for non-technical sub-reddit community moderators. Discord has a large ecosystem of bots which can help communities manage some moderation tasks, in particular mitigating spam and brigading.

* Facebook's FXL and Haxl rule languages have been in use for over a decade. The 2012 paper ["The Facebook Immune System"](https://css.csail.mit.edu/6.858/2012/readings/facebook-immune.pdf) gives a good overview of design goals and how a rules engine fits into an overall anti-spam/anti-abuse pipeline.

* Email anti-spam systems like SpamAssassin and rspamd have been modular and configurable for several decades.
126 changes: 126 additions & 0 deletions automod/account_meta.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package automod

import (
"context"
"encoding/json"
"fmt"
"time"

comatproto "github.com/bluesky-social/indigo/api/atproto"
appbsky "github.com/bluesky-social/indigo/api/bsky"
"github.com/bluesky-social/indigo/atproto/identity"
"github.com/bluesky-social/indigo/atproto/syntax"
)

// ProfileSummary is the subset of an account's public profile that is kept in
// AccountMeta. GetAccountMeta fills it from the fetched profile view.
type ProfileSummary struct {
// HasAvatar is true when the fetched profile view has a non-nil avatar.
HasAvatar bool
// Description is copied as-is from the profile view; may be nil.
Description *string
// DisplayName is copied as-is from the profile view; may be nil.
DisplayName *string
}

// AccountPrivate holds non-public account details. GetAccountMeta only
// populates it when the engine has an AdminClient configured.
type AccountPrivate struct {
// Email is the account email from the admin account info, or empty if none was returned.
Email string
// EmailConfirmed is true when the admin account info carries a non-empty EmailConfirmedAt.
EmailConfirmed bool
// IndexedAt is the parsed IndexedAt datetime from the admin account info.
IndexedAt time.Time
}

// information about a repo/account/identity, always pre-populated and relevant to many rules
//
// AccountMeta is built by GetAccountMeta, which also stores it JSON-encoded in
// the engine cache.
type AccountMeta struct {
// Identity is the identity passed to GetAccountMeta (also re-attached on cache hits).
Identity *identity.Identity
// Profile summarizes the public profile; zero value when hydration is skipped.
Profile ProfileSummary
// Private is nil unless private account info was fetched via the admin client.
Private *AccountPrivate
// AccountLabels holds the de-duplicated label values from the profile view.
AccountLabels []string
// Counts are taken from the profile view; they stay zero when the view omits them.
FollowersCount int64
FollowsCount int64
PostsCount int64
}

// GetAccountMeta returns summary metadata for the account described by ident.
//
// When the engine has no BskyClient configured (eg, testing), hydration is
// skipped and a stub containing only the identity is returned. Otherwise the
// engine cache is consulted with ("acct", DID). On a hit the cached JSON is
// decoded and ident is re-attached. On a miss the public profile is fetched,
// private account info is added when an AdminClient is configured, and the
// JSON-encoded result is written back to the cache before returning.
//
// Note that ident is mutated: its ParsedPublicKey field is set to nil.
//
// Any failure (cache read/write, JSON encode/decode, API calls, datetime
// parsing) is returned wrapped with %w, so errors.Is / errors.As still see
// the underlying error.
func (e *Engine) GetAccountMeta(ctx context.Context, ident *identity.Identity) (*AccountMeta, error) {

	// wipe parsed public key; it's a waste of space and can't serialize
	ident.ParsedPublicKey = nil

	// fallback in case client wasn't configured (eg, testing)
	if e.BskyClient == nil {
		e.Logger.Warn("skipping account meta hydration")
		am := AccountMeta{
			Identity: ident,
			Profile:  ProfileSummary{},
		}
		return &am, nil
	}

	did := ident.DID.String()

	existing, err := e.Cache.Get(ctx, "acct", did)
	if err != nil {
		return nil, fmt.Errorf("reading AccountMeta from cache: %w", err)
	}
	if existing != "" {
		var am AccountMeta
		if err := json.Unmarshal([]byte(existing), &am); err != nil {
			return nil, fmt.Errorf("parsing AccountMeta from cache: %w", err)
		}
		// always use the identity supplied by the caller, not the cached copy
		am.Identity = ident
		return &am, nil
	}

	// fetch account metadata
	profile, err := appbsky.ActorGetProfile(ctx, e.BskyClient, did)
	if err != nil {
		return nil, fmt.Errorf("fetching account profile: %w", err)
	}

	// left as a nil slice when there are no labels, as before
	var labels []string
	for _, lbl := range profile.Labels {
		labels = append(labels, lbl.Val)
	}

	am := AccountMeta{
		Identity: ident,
		Profile: ProfileSummary{
			HasAvatar:   profile.Avatar != nil,
			Description: profile.Description,
			DisplayName: profile.DisplayName,
		},
		AccountLabels: dedupeStrings(labels),
	}
	// counts are optional in the profile view; leave zero when absent
	if profile.PostsCount != nil {
		am.PostsCount = *profile.PostsCount
	}
	if profile.FollowersCount != nil {
		am.FollowersCount = *profile.FollowersCount
	}
	if profile.FollowsCount != nil {
		am.FollowsCount = *profile.FollowsCount
	}

	// private details are only available with an admin client
	if e.AdminClient != nil {
		info, err := comatproto.AdminGetAccountInfo(ctx, e.AdminClient, did)
		if err != nil {
			return nil, fmt.Errorf("fetching private account info: %w", err)
		}
		private := AccountPrivate{}
		if info.Email != nil && *info.Email != "" {
			private.Email = *info.Email
		}
		if info.EmailConfirmedAt != nil && *info.EmailConfirmedAt != "" {
			private.EmailConfirmed = true
		}
		indexedAt, err := syntax.ParseDatetimeTime(info.IndexedAt)
		if err != nil {
			return nil, fmt.Errorf("parsing account IndexedAt: %w", err)
		}
		private.IndexedAt = indexedAt
		am.Private = &private
	}

	val, err := json.Marshal(&am)
	if err != nil {
		return nil, fmt.Errorf("serializing AccountMeta: %w", err)
	}

	if err := e.Cache.Set(ctx, "acct", did, string(val)); err != nil {
		return nil, fmt.Errorf("writing AccountMeta to cache: %w", err)
	}
	return &am, nil
}
Loading
Loading