# The name of the Storm DRPC function
topology.function: "tracer"
# The name of the Storm topology
topology.name: "bullet-topology"
# The number of topology workers to use. Applicable when the scheduler is not ras.
topology.workers: 92
# Enable Storm debug logging
topology.debug: false
# The scheduling strategy to use. Only "ras" is currently supported.
topology.scheduler: "ras"
# Enable metrics collection for the topology. This uses the LoggingMetricsConsumer
# and collects a CPU metric using org.apache.storm.metrics.sigar.CPUMetric
topology.metrics.enable: false
# The following CPU load and on/off heap memory settings control each component's CPU
# and memory configuration. These settings are only used when the scheduler is "ras".
# The parallelism setting controls the number of executors used for each component.
topology.drpc.spout.cpu.load: 20.0
topology.drpc.spout.memory.on.heap.load: 256.0
topology.drpc.spout.memory.off.heap.load: 160.0
topology.drpc.spout.parallelism: 20
topology.prepare.bolt.cpu.load: 20.0
topology.prepare.bolt.memory.on.heap.load: 128.0
topology.prepare.bolt.memory.off.heap.load: 160.0
topology.prepare.bolt.parallelism: 5
topology.return.bolt.cpu.load: 20.0
topology.return.bolt.memory.on.heap.load: 128.0
topology.return.bolt.memory.off.heap.load: 160.0
topology.return.bolt.parallelism: 10
topology.filter.bolt.cpu.load: 100.0
topology.filter.bolt.memory.on.heap.load: 256.0
topology.filter.bolt.memory.off.heap.load: 160.0
topology.filter.bolt.parallelism: 35
topology.join.bolt.cpu.load: 100.0
topology.join.bolt.memory.on.heap.load: 512.0
topology.join.bolt.memory.off.heap.load: 160.0
topology.join.bolt.parallelism: 20
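# As an illustrative aside (assuming the usual Storm RAS behavior of multiplying per-executor resources by the
# component's parallelism): with the defaults above, the Filter Bolts alone would request 35 * 100.0 = 3500.0 CPU
# and 35 * (256.0 + 160.0) = 14560.0 MB of memory from the scheduler.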
# Bullet uses tick tuples under the hood as a "clock" mechanism to do metadata and query updates (checking if rules
# have expired), etc. This setting controls how frequently a tick happens - the number of seconds between ticks.
topology.tick.interval.secs: 5
# This is the number of ticks for which an error caused by receiving a bad rule will be buffered, if the
# return information has not yet been received, before being thrown away.
topology.join.bolt.error.tick.timeout: 3
# This is the number of ticks for which a rule will be buffered past its expiry in order to wait for
# aggregations to trickle in from the Filter Bolts.
topology.join.bolt.rule.tick.timeout: 3
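# For example (illustrative, using the defaults above): with topology.tick.interval.secs at 5 and a tick timeout
# of 3, a rule lingers for up to 3 * 5 = 15 seconds past its expiry, and a buffered error is likewise kept for
# up to 15 seconds while waiting for its return information.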
# The default duration in milliseconds for a rule if one has not been specified.
rule.default.duration: 30000
# The maximum duration in milliseconds allowed for a rule. Anything greater will be clamped to this value.
rule.max.duration: 120000
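# For example (illustrative): with these defaults, a rule that specifies no duration runs for 30000 ms, and a
# rule that asks for 300000 ms is clamped down to 120000 ms.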
# The default number of records that can be aggregated for a rule if one has not been specified.
rule.aggregation.default.size: 1
# The maximum number of records that will be aggregated per rule. Anything greater will be clamped to this value.
rule.aggregation.max.size: 512
# This is the separator that is used when a set of fields has to be considered as a single String.
# This is relevant when hashing a set of fields (for example, in a GROUP operation) for uniqueness purposes, such
# as when inserting into a Sketch. Without it, if you were considering two fields together as a group, with values
# ab and cd, simply concatenating them would produce abcd. This is ambiguous when compared with another record that
# had values a and bcd for those two fields. Using this separator distinguishes them for this purpose.
# If the default separator occurs in your fields, you should change it to something else.
rule.aggregation.composite.field.separator: "|"
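# For example (illustrative): with this separator, the group with field values ab and cd is hashed as "ab|cd"
# and the group with values a and bcd as "a|bcd", so the two stay distinct, whereas plain concatenation would
# turn both into "abcd".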
# The maximum number of records that will be collected in the Filter Bolt before they are emitted - i.e. a micro-batch.
# Leaving this at 1 emits your raw aggregation records as soon as they are received in the Filter Bolt. This makes
# your raw aggregation query run snappier if the total number of matched records across the Filter Bolts exceeds
# the number of records your query is looking for, but each Filter Bolt individually does not find enough records to
# satisfy the query. Since the records are emitted immediately, the Join Bolt will terminate your query as soon
# as the total number of records is received instead of waiting for the micro-batch size to be reached.
# If you set this too high (for example, higher than the query size), you will wait the entire duration of the query,
# plus the number of ticks specified in topology.join.bolt.rule.tick.timeout.
rule.aggregation.raw.micro.batch.size: 1
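# For example (illustrative): suppose a RAW query asks for 30 records and each of the 35 Filter Bolts matches only
# one record. With a micro-batch size of 1, every match is emitted immediately and the Join Bolt finishes as soon
# as 30 records arrive. With a micro-batch size of 2, no Filter Bolt ever fills its batch, so the query waits out
# its full duration plus the rule tick timeout before the records are returned.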
# The maximum number of records that will be collected for Raw aggregations. This number should be <= the
# rule.aggregation.max.size value. If it is greater, rule.aggregation.max.size will be used instead.
rule.aggregation.raw.max.size: 30
# The maximum number of entries stored by a Sketch created for doing COUNT DISTINCTS. Decreasing this number
# (rounded to powers of 2) can decrease the accuracy for high cardinality dimensions while decreasing the total
# memory used by the Sketch. The error for a Theta Sketch is fixed at a maximum when this number is chosen - in other
# words, the error will be no worse than this maximum regardless of how many unique entries are inserted into the
# Sketch. Refer to: https://datasketches.github.io/docs/Theta/ThetaErrorTable.html
rule.aggregation.count.distinct.sketch.entries: 16384
# Controls how much sampling is done by the Sketch for COUNT DISTINCTS. A value of 1.0 means no sampling is done.
# A value of 0.5 means the Sketch will throw out half the data coming into it.
# You can leave this at 1.0 since it will really only matter once COUNT DISTINCTS are supported as GROUP operations.
# https://datasketches.github.io/docs/Theta/ThetaPSampling.html
rule.aggregation.count.distinct.sketch.sampling: 1.0
# This can either be QuickSelect or Alpha (CaSe SeNsiTive). You can leave this at the default.
# Alpha Sketches are 30% more accurate if their estimates are queried directly but since we union them, their accuracy
# reverts back to the QuickSelect accuracy. Alpha Sketches are also faster when updating.
# https://datasketches.github.io/docs/Theta/ThetaUpdateSpeed.html
rule.aggregation.count.distinct.sketch.family: "Alpha"
# A Sketch does not start at the maximum size specified by the sketch.entries setting. It grows toward it and can be
# at most 2x that size at the maximum. This factor controls by how much the size grows when the threshold is reached.
# Valid values are 1 (no resize, start at the maximum), 2 (double), 4 (quadruple) and 8 (octuple). Any other value defaults to 8.
# https://datasketches.github.io/docs/Theta/ThetaUpdateSpeed.html
rule.aggregation.count.distinct.sketch.resize.factor: 8
# The maximum number of entries stored by a Sketch created for doing GROUP BY. Sketches are used to do a uniform
# sample across your unique groups. So, this value should be set to a power of 2 approximately equal to your value for
# rule.aggregation.max.size. Anything greater will still work but the aggregation max size will limit your result
# anyway, so it's just a waste of resources to do so. If you have a count or sum as a metric for the group, summing them
# across the groups and dividing by your Sketch Theta (in the metadata) gives you an approximate estimate of the real
# sum/count across all your actual groups. The error is defined by the QuickSelect Sketch error. Refer to:
# https://datasketches.github.io/docs/Theta/ThetaErrorTable.html
rule.aggregation.group.sketch.entries: 512
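# For example (illustrative): if the Sketch sampled your groups with a theta of 0.25 and the counts summed across
# the sampled groups add up to 1000, then 1000 / 0.25 = 4000 is an approximate estimate of the total count across
# all your actual groups.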
# Controls how much sampling is done by the Sketch for GROUP BY. A value of 1.0 means no sampling is done.
# A value of 0.5 means the Sketch will throw out half the data coming into it.
# You can leave this at 1.0 since it will really only matter once COUNT DISTINCTS are supported as GROUP operations.
# https://datasketches.github.io/docs/Theta/ThetaPSampling.html
rule.aggregation.group.sketch.sampling: 1.0
# A Sketch does not start at the maximum size specified by the sketch.entries setting. It grows toward it and can be
# at most 2x that size at the maximum. This factor controls by how much the size grows when the threshold is reached.
# Valid values are 1 (no resize, start at the maximum), 2 (double), 4 (quadruple) and 8 (octuple). Any other value defaults to 8.
# https://datasketches.github.io/docs/Theta/ThetaUpdateSpeed.html
rule.aggregation.group.sketch.resize.factor: 8
# Enable logging meta information in the results. Configured metadata will be added to the meta section of the
# results: {"meta": {}, "records": []}
result.metadata.enable: true
# Each entry in this list indicates which metadata to collect (the name) and what key to add it as (the key) to the meta
# Rule Identifier adds the original DRPC ID that was generated for the rule.
# Rule Body adds the received rule definition. This is useful for diagnosing syntax exceptions when errors are received.
# Creation Time adds the timestamp in milliseconds when the AbstractRule was received by the Join Bolt
# Termination Time adds the timestamp in milliseconds when the Records were emitted by the Join Bolt
# Aggregation Metadata adds additional nested metadata about the aggregation if set. These are listed below.
# Estimated Result adds a boolean denoting whether the result was estimated. (COUNT DISTINCT, GROUP)
# Standard Deviations adds an object inside the Aggregation Metadata object where the keys are the standard deviations
# and the values are objects containing upper and lower bounds (COUNT DISTINCT, GROUP)
# Sketch Family adds the family of Sketches used to produce the result, if one was used. (COUNT DISTINCT)
# Sketch Size adds the size of the final Sketch used to produce the result, if one was used. (COUNT DISTINCT)
# Sketch Theta adds the theta value of the Sketch for Theta and Tuple Sketches, if one was used. (COUNT DISTINCT, GROUP)
# Uniques Estimate adds the approximate unique values seen for Tuple Sketches. (GROUP)
result.metadata.metrics:
- name: "Rule Identifier"
key: "rule_id"
- name: "Rule Body"
key: "rule_body"
- name: "Creation Time"
key: "rule_receive_time"
- name: "Termination Time"
key: "rule_finish_time"
- name: "Aggregation Metadata"
key: "aggregation"
- name: "Estimated Result"
key: "wasEstimated"
- name: "Standard Deviations"
key: "standardDeviations"
- name: "Sketch Family"
key: "sketchFamily"
- name: "Sketch Size"
key: "sketchSize"
- name: "Sketch Theta"
key: "sketchTheta"
- name: "Uniques Estimate"
key: "uniquesEstimate"
# Controls whether each record should have a new key added to it denoting when the Filter Bolt saw it
record.inject.timestamp.enable: true
# This is the key that is used to add the timestamp in milliseconds to the record, if record.inject.timestamp.enable is true
record.inject.timestamp.key: "__receive_timestamp"
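# For example (illustrative, timestamp value made up): a record that looked like {"field": "foo"} when it entered
# the Filter Bolt would be emitted as {"field": "foo", "__receive_timestamp": 1480723799257} with the defaults above.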