Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: otel tracing #494

Merged
merged 1 commit into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions .docker/docker-compose-infra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,23 @@ services:
- IMGPROXY_USE_ETAG=true
- IMGPROXY_ENABLE_WEBP_DETECTION=true

# Optional for rate-limiting
redis:
image: redis:6.2-alpine
restart: always
ports:
- '6379:6379'
# Optional for rate-limiting
# redis:
# image: redis:6.2-alpine
# restart: always
# ports:
# - '6379:6379'

# Optional for tracing
# otel:
# extends:
# service: otel-collector
# file: ./.docker/docker-compose-monitoring.yml
#
# jaeger:
# extends:
# service: jaeger
# file: ./.docker/docker-compose-monitoring.yml

configs:
init.sql:
Expand Down
25 changes: 24 additions & 1 deletion .docker/docker-compose-monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,27 @@ services:
- GF_SECURITY_ADMIN_PASSWORD=grafana
volumes:
- ../monitoring/grafana/config:/etc/grafana/provisioning
- ../monitoring/grafana/dashboards:/var/lib/grafana/dashboards
- ../monitoring/grafana/dashboards:/var/lib/grafana/dashboards

# Jaeger all-in-one container, pinned to image tag 1.57.0.
# NOTE(review): the per-port purposes in parentheses below are taken from the
# Jaeger deployment docs, not from this repository - confirm for the pinned tag.
jaeger:
image: jaegertracing/all-in-one:1.57.0
ports:
- "16686:16686" # Jaeger UI
- "14250:14250" # GRPC (presumably collector, model.proto - TODO confirm)
- "14268:14268" # HTTP (presumably collector, jaeger.thrift - TODO confirm)
- "14269:14269" # HTTP (presumably admin: health check and metrics - TODO confirm)
- "6831:6831/udp" # UDP (presumably agent, compact thrift - TODO confirm)
- "6832:6832/udp" # UDP (presumably agent, binary thrift - TODO confirm)
- "5778:5778" # HTTP (presumably agent configuration/sampling endpoint - TODO confirm)

# OpenTelemetry Collector (contrib image, pinned to 0.100.0).
# Its configuration is read from the directory mounted at /etc/otel.
otel-collector:
image: otel/opentelemetry-collector-contrib:0.100.0
ports:
- "4317:4317" # OTLP gRPC receiver
- "4318:4318" # OTLP HTTP receiver
- "55680:55680" # NOTE(review): labelled "OTLP HTTP receiver" originally, but 55680 looks like the legacy OTLP gRPC port - confirm a receiver actually listens here
# Point the collector at the config file provided by the volume mount below
command: [ "--config=/etc/otel/otel-collector-config.yml" ]
# Start jaeger before the collector
depends_on:
- jaeger
volumes:
- ../monitoring/otel/config:/etc/otel
10 changes: 10 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,14 @@ services:
# prometheus:
# extends:
# service: prometheus
# file: ./.docker/docker-compose-monitoring.yml
#
# otel:
# extends:
# service: otel-collector
# file: ./.docker/docker-compose-monitoring.yml
#
# jaeger:
# extends:
# service: jaeger
# file: ./.docker/docker-compose-monitoring.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Trigger function: announce the deletion of a tenant row.
-- Sends the deleted row's id, wrapped in double quotes, as the payload of a
-- NOTIFY on the 'tenants_update' channel.
-- NOTE(review): deletions are published on the 'tenants_update' channel name;
-- confirm that listeners on that channel expect delete events.
CREATE FUNCTION tenants_delete_notify_trigger ()
    RETURNS TRIGGER
    AS $$
BEGIN
    PERFORM
        pg_notify('tenants_update', '"' || OLD.id || '"');
    -- Nothing to hand back to the executor from this AFTER trigger.
    RETURN NULL;
END;
$$
LANGUAGE plpgsql;

-- Run the notification once for every row deleted from tenants.
CREATE TRIGGER tenants_delete_notify_trigger
    AFTER DELETE ON tenants
    FOR EACH ROW
    EXECUTE PROCEDURE tenants_delete_notify_trigger ();
2 changes: 2 additions & 0 deletions migrations/multitenant/0011-tracing-mode-column.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

-- Add a mandatory per-tenant tracing mode; rows default to 'basic'.
ALTER TABLE tenants
    ADD COLUMN tracing_mode text NOT NULL DEFAULT 'basic';
183 changes: 183 additions & 0 deletions monitoring/otel/config/otel-collector-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
# OpenTelemetry Collector configuration.
# A single traces pipeline: OTLP in -> memory_limiter -> tail sampling -> batch
# -> OTLP out to the "jaeger" host on port 4317.

receivers:
  # OTLP receiver with both protocols enabled and no overrides
  otlp:
    protocols:
      grpc:
      http:

processors:
  memory_limiter:
    check_interval: 1s
    limit_percentage: 70
    spike_limit_percentage: 20
  batch:
    send_batch_size: 10000
    timeout: 10s
  tail_sampling/storage:
    decision_wait: 10s
    expected_new_traces_per_sec: 10000
    num_traces: 50000
    policies:
      # Exclude probe URLs (regex match on http.route, inverted)
      - name: exclude-urls
        type: string_attribute
        string_attribute:
          key: http.route
          values:
            - '\/health.*'
            - '\/metrics'
            - '\/tenants'
            - '\/version'
            - '\/status'
          enabled_regex_matching: true
          invert_match: true

      # All errors are sampled (HTTP status 500-599)
      - name: error-status-codes
        type: numeric_attribute
        numeric_attribute:
          key: http.status_code
          min_value: 500
          max_value: 599

      # Always sample high latency traces (>= 5 s) that are not uploads
      - name: high-latency-excluding-uploads
        type: and
        and:
          and_sub_policy:
            - type: latency
              latency:
                threshold_ms: 5000
            # Exclude upload operations
            - type: string_attribute
              string_attribute:
                key: http.operation
                values:
                  - '.*upload.*'
                enabled_regex_matching: true
                invert_match: true

      # Always sample high latency uploads (>= 300000 ms, i.e. 5 minutes)
      - name: high-latency-uploads
        type: and
        and:
          and_sub_policy:
            - type: latency
              latency:
                threshold_ms: 300000
            # Only upload operations
            - type: string_attribute
              string_attribute:
                key: http.operation
                values:
                  - '.*upload.*'
                enabled_regex_matching: true

      # Sample traces for tenants with the default mode.
      # Default mode is the mode where the trace.mode attribute is set to
      # basic; only 5% of successful (2xx/3xx) traces are sampled.
      - name: sampling-basic-tenants
        type: and
        and:
          and_sub_policy:
            # must have tenant.ref attribute
            - name: has-tenant-ref
              type: string_attribute
              string_attribute:
                key: tenant.ref
                values:
                  - '.*'
                enabled_regex_matching: true
            # trace.mode = basic
            - name: trace-mode-default
              type: string_attribute
              string_attribute:
                key: trace.mode
                values:
                  - basic
            - name: success-status-codes
              type: numeric_attribute
              numeric_attribute:
                key: http.status_code
                min_value: 200
                max_value: 399
            - name: basic-sampling
              type: probabilistic
              probabilistic:
                sampling_percentage: 5

      # Sample traces for tenants with the premium mode (trace.mode = full).
      # Premium mode samples 100% of successful (2xx/3xx) traces.
      - name: sampling-premium-tenants
        type: and
        and:
          and_sub_policy:
            # must have tenant.ref attribute
            - name: has-tenant-ref
              type: string_attribute
              string_attribute:
                key: tenant.ref
                values:
                  - '.*'
                enabled_regex_matching: true
            # trace.mode = full
            - name: trace-mode-full
              type: string_attribute
              string_attribute:
                key: trace.mode
                values:
                  - full
            - name: success-status-codes
              type: numeric_attribute
              numeric_attribute:
                key: http.status_code
                min_value: 200
                max_value: 399
            - name: full-sampling
              type: probabilistic
              probabilistic:
                sampling_percentage: 100

exporters:
  # Plain-text (non-TLS) OTLP export to the jaeger host
  otlp/jaeger:
    endpoint: "jaeger:4317"
    tls:
      insecure: true

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, tail_sampling/storage, batch]
      exporters: [otlp/jaeger]
Loading
Loading