-
Notifications
You must be signed in to change notification settings - Fork 3
/
bcd.go
605 lines (494 loc) · 16.3 KB
/
bcd.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
// Package bt provides integration with out-of-process tracers. Using the
// provided Tracer interface, applications may invoke tracer execution on
// demand. Panic and signal handling integrations are provided.
//
// The Tracer interface is generic and will support any out-of-process tracer
// implementing it. A default Tracer implementation, which uses the
// Backtrace I/O platform, is provided.
package bt
import (
"errors"
"fmt"
"os"
"os/exec"
"os/signal"
"reflect"
"runtime"
"strconv"
"strings"
"sync"
"time"
)
var (
// traceLock serializes tracer execution across every Tracer in the
// process. It is a one-slot channel used as a lock: init() places a single
// token in it, Trace() receives that token to acquire the lock, and
// traceUnlockRL() sends the token back to release it.
//
// Only one tracer should be running on a process at any time; this is
// global to all created tracers.
traceLock chan struct{}
// Configuration applicable to all tracer invocations. The configuration
// itself lives in state.c and is guarded by state.m.
state globalState
)
// globalState bundles the package-wide configuration with the lock that
// guards it.
type globalState struct {
// c is the active configuration; UpdateConfig replaces it wholesale.
c GlobalConfig
// m guards c. UpdateConfig takes the write lock; Register and Trace take
// the read lock while copying out the fields they need.
m sync.RWMutex
}
// GlobalConfig holds the settings shared by all tracer invocations.
//
// The defaults listed below are installed by this package's init().
// UpdateConfig replaces the entire struct, so any field left at its zero
// value in the struct passed to UpdateConfig overrides the default rather
// than preserving it.
type GlobalConfig struct {
// If the tracer's timeout expires and the tracer cannot be killed,
// generate a run-time panic.
//
// Defaults to true.
PanicOnKillFailure bool
// Upon receipt of a signal and execution of the tracer, re-sends the
// signal to the default Go signal handler for the signal and stops
// listening for the signal.
// Note: this will call signal.Reset(signal) on the received signal,
// which undoes the effect of any signal.Notify() calls for the signal.
//
// Register reads this value once, when the handler is registered; later
// UpdateConfig calls do not affect handlers that are already registered.
//
// Defaults to true.
ResendSignal bool
// Length of time to wait after completion of a tracer's
// execution before allowing the next tracer to run.
//
// Defaults to 3 seconds.
RateLimit time.Duration
// If bt.Trace() has been configured to attempt an upload immediately,
// wait for the Tracer to finish uploading its results (instead of
// asynchronously uploading in a new goroutine) before returning
// from bt.Trace().
//
// Defaults to true.
SynchronousPut bool
}
// init seeds the trace lock with its single token and installs the default
// global configuration.
func init() {
	// A one-slot buffered channel stands in for a mutex here because lock
	// acquisition must be combinable with a timeout inside a select, which
	// sync.Mutex cannot offer.
	lock := make(chan struct{}, 1)
	lock <- struct{}{}
	traceLock = lock

	defaults := GlobalConfig{
		PanicOnKillFailure: true,
		ResendSignal:       true,
		RateLimit:          3 * time.Second,
		SynchronousPut:     true,
	}
	state = globalState{c: defaults}
}
// UpdateConfig replaces the global Tracer configuration with c.
//
// The whole struct is swapped in under the write lock, so fields left at
// their zero value in c take effect as zero values.
func UpdateConfig(c GlobalConfig) {
	state.m.Lock()
	state.c = c
	state.m.Unlock()
}
// Tracer is a generic out-of-process tracer interface.
//
// This is used primarily by the top-level functions of the bt package,
// like bt.Trace, to handle execution and synchronization of various
// generic tracers.
//
// Tracers are not limited to this interface and may provide additional
// utility methods; see specific tracer implementation (e.g. BTTracer)
// documentation for details.
//
// The methods in this interface are expected to be goroutine safe; multiple
// trace requests (which ultimately call into these methods) from different
// goroutines may run concurrently.
type Tracer interface {
// Store the options provided by v.
//
// If the options slice is non-nil, the provided options should be
// stored in it; otherwise, the options are added to the Tracer's
// base set of options.
// Returns the final options slice if the provided options slice is
// non-nil.
AddOptions(options []string, v ...string) []string
// Add a key-value attribute.
//
// See AddOptions for rules regarding the specified options slice and
// the return value.
AddKV(options []string, key, val string) []string
// Add a thread filter option using the specified tid. If any thread
// filter options are added, all non-matching threads and goroutines
// are expected to be excluded from the generated snapshot.
//
// See AddOptions for rules regarding the specified options slice and
// the return value.
AddThreadFilter(options []string, tid int) []string
// Add a faulted thread option using the specified tid. Threads and
// goroutines matching any faulted thread options are marked as faulted
// and subject to analysis and grouping.
//
// See AddOptions for rules regarding the specified options slice and
// the return value.
AddFaultedThread(options []string, tid int) []string
// Add a caller goroutine option using the specified goid.
//
// See AddOptions for rules regarding the specified options slice and
// the return value.
AddCallerGo(options []string, goid int) []string
// Add a classification to the generated snapshot.
//
// See AddOptions for rules regarding the specified options slice and
// the return value.
AddClassifier(options []string, classifier string) []string
// Returns a copy of the base set of options for the Tracer.
// bt.Trace relies on this being a copy: it appends per-invocation
// options to the returned slice.
Options() []string
// Clears the base set of options for the Tracer.
ClearOptions()
// Returns the default TraceOptions used in bt.Trace() if an override
// is not specified as an argument to it. bt.Trace() also consults the
// returned Timeout when the caller's TraceOptions.Timeout is 0.
DefaultTraceOptions() *TraceOptions
// Accepts a final set of options and returns a Command object
// representing a tracer that is ready to run. This will be executed
// on the current process.
//
// bt.Trace() runs the returned command with Output().
Finalize(options []string) *exec.Cmd
// Determines when and to what the Tracer will log (Logf and
// SetLogLevel).
Log
// String representation of a Tracer.
fmt.Stringer
// Returns whether the Tracer should upload its results to a remote
// server after successful tracer execution.
PutOnTrace() bool
// Uploads Tracer results given by the snapshot argument, which is
// the stdout of the Tracer process, to the configured remote server.
//
// As this is part of the generic Tracer interface, callers know
// nothing about the contents of the output; thus, it is passed
// unfiltered to the specific underlying implementation.
Put(snapshot []byte) error
}
// TraceOptions determines the actions taken during a single Tracer
// execution requested through bt.Trace().
type TraceOptions struct {
// If true, the calling thread/goroutine will be marked as faulted
// (i.e. the cause of the error or trace request).
//
// This is a Linux-specific option; it results in a noop on other
// systems.
Faulted bool
// If true, only the calling thread/goroutine will be traced; all others
// will be excluded from the generated snapshot.
//
// This is a Linux-specific option; it results in a noop on other
// systems.
CallerOnly bool
// If true and a non-nil error object is passed to bt.Trace(), a
// classifier will be added based on the specified error's type
// (the string form of its dynamic type, via reflect.TypeOf).
ErrClassification bool
// If non-nil, all contained strings will be added as classifiers to
// the generated snapshot.
Classifications []string
// Amount of time to wait for the tracer to finish execution.
// If 0 is specified, Tracer.DefaultTraceOptions()'s timeout will be
// used. If <0 is specified, no timeout will be used; the Tracer command
// will run until it exits.
//
// The same timer also bounds the wait to acquire the global trace lock.
Timeout time.Duration
// If non-nil, any goroutines spawned during the Trace() request will
// be added to the wait group. This facilitates waiting for things like
// asynchronous snapshot uploads to complete before exiting the
// application.
SpawnedGs *sync.WaitGroup
}
// Log is the logging contract embedded in Tracer; the package-level
// functions report their progress and failures through it.
type Log interface {
// Logs the specified message if the specified log level is enabled.
Logf(level LogPriority, format string, v ...interface{})
// Sets the log level to the specified bitmask of LogPriorities; all
// priorities excluded from the mask are ignored.
SetLogLevel(level LogPriority)
}
// LogPriority identifies a log level. The predefined levels (LogDebug,
// LogWarning, LogError) are distinct bits, so several can be combined into
// the mask accepted by Log.SetLogLevel.
type LogPriority int
// Log levels. Each level occupies its own bit so that levels can be
// combined into a mask. The constants are declared without an explicit
// type, so they are untyped integer constants that convert to LogPriority
// where one is expected.
const (
// LogDebug is bit 0 (value 1).
LogDebug = 1 << iota
// LogWarning is bit 1 (value 2).
LogWarning
// LogError is bit 2 (value 4).
LogError
// LogMax is the mask with every level above enabled (value 7).
LogMax = (1 << iota) - 1
)
// TracerSig is a superset of the generic Tracer interface for those that
// wish to support signal handling. The methods unique to this interface are
// not expected to be goroutine-safe.
//
// bt.Register and bt.Unregister use these accessors to read the signal set
// and to store or clear the channel on which signals are delivered.
type TracerSig interface {
Tracer
// Sets the desired set of signals for which to invoke the Tracer upon
// receipt of the signal.
SetSigset(sigs ...os.Signal)
// Returns the desired signal set.
Sigset() []os.Signal
// Sets the channel through which the Tracer will respond to signals.
SetSigchan(sc chan os.Signal)
// Returns the channel through which the Tracer will respond to signals.
// A nil result is treated by bt.Register/bt.Unregister as "not
// currently registered".
Sigchan() chan os.Signal
}
// signalError is a distinct error type wrapping a received signal. The
// signal handler started by Register passes it to Trace so that the trace
// records which signal triggered it.
type signalError struct {
// s is the signal that was received.
s os.Signal
}
// Error implements the error interface by returning the wrapped signal's
// string form.
func (se *signalError) Error() string {
	sig := se.s
	return sig.String()
}
// Register installs a signal handler that executes the specified Tracer upon
// receipt of any signal in the set returned by TracerSig.Sigset().
//
// If t already has a signal channel (Sigchan() is non-nil), that earlier
// registration is torn down first. A new channel with capacity len(sigset)
// is then stored on t via SetSigchan, subscribed with signal.Notify, and
// serviced by a goroutine that exits once the channel is closed (see
// Unregister).
//
// An empty signal set (nil or zero-length) is rejected with an error log and
// nothing is registered. This matters because signal.Notify relays every
// incoming signal when it is given no signals, which would silently hijack
// all signal handling for the process.
//
// If the GlobalConfig value ResendSignal is true, then when a signal is
// received through this handler, all handlers for that signal will be reset
// with signal.Reset(s) after tracer execution completes. The signal will then
// be resent to the default Go handler for that signal. ResendSignal is read
// once, at registration time; later UpdateConfig calls do not affect a
// handler that is already registered.
func Register(t TracerSig) {
	ss := t.Sigset()
	if len(ss) == 0 {
		t.Logf(LogError, "Failed to register signal handler: empty "+
			"sigset\n")
		return
	}

	// Replace any previous registration rather than stacking handlers.
	c := t.Sigchan()
	if c != nil {
		unregisterInternal(t, c)
	}

	c = make(chan os.Signal, len(ss))
	t.SetSigchan(c)
	signal.Notify(c, ss...)

	t.Logf(LogDebug, "Registered tracer %s (signal set: %v)\n", t, ss)

	// Snapshot the setting under the read lock so the handler goroutine
	// never touches the shared configuration.
	state.m.RLock()
	rs := state.c.ResendSignal
	state.m.RUnlock()

	go func(t TracerSig) {
		// The loop ends when unregisterInternal closes the channel.
		for s := range c {
			t.Logf(LogDebug, "Received %v; executing tracer\n", s)

			// Trace logs its own failures; the handler has no caller
			// to report the error to.
			_ = Trace(t, &signalError{s}, nil)

			if !rs {
				continue
			}

			t.Logf(LogDebug, "Resending %v to default handler\n", s)

			// Re-handle the signal with the default Go behavior.
			signal.Reset(s)

			p, err := os.FindProcess(os.Getpid())
			if err != nil {
				t.Logf(LogError, "Failed to resend signal: "+
					"cannot find process object")
				return
			}

			// Best effort: if the resend fails there is nothing further
			// the handler can do.
			_ = p.Signal(s)
		}

		t.Logf(LogDebug, "Signal channel closed; exiting goroutine\n")
	}(t)
}
// Unregister stops the specified TracerSig from handling any signals it was
// previously registered to handle via bt.Register(). It is a no-op when the
// tracer has no signal channel (Sigchan() returns nil).
func Unregister(t TracerSig) {
	if c := t.Sigchan(); c != nil {
		unregisterInternal(t, c)
	}
}
// unregisterInternal tears down the signal registration of t whose signal
// channel is c: it stops signal delivery to c, closes c, and clears the
// channel stored on t. The caller must pass a non-nil c.
//
// The order of the steps is significant; see the comments below.
func unregisterInternal(t TracerSig, c chan os.Signal) {
t.Logf(LogDebug, "Stopping signal channel...\n")
// Stop delivery first: once signal.Stop returns, no more signals are sent
// to c, which makes the close below safe.
signal.Stop(c)
t.Logf(LogDebug, "Closing signal channel...\n")
// Closing c ends the range loop of the handler goroutine started by
// Register, letting that goroutine exit.
close(c)
// Clearing the stored channel marks t as unregistered, so a later
// Register/Unregister will not try to close c a second time.
t.SetSigchan(nil)
t.Logf(LogDebug, "Tracer unregistered\n")
}
// tracerResult carries the outcome of one tracer process run from the
// goroutine that executes it back to Trace.
type tracerResult struct {
// stdOut is the tracer command's captured standard output.
stdOut []byte
// err is the error returned by running the tracer command, if any.
err error
}
// Trace executes the specified Tracer on the current process.
//
// If e is non-nil, it will be used to augment the trace according to the
// TraceOptions: its message is attached as the "error" attribute and, when
// TraceOptions.ErrClassification is set, its dynamic type is added as a
// classifier.
// If traceOptions is non-nil, it will be used instead of the Tracer's
// DefaultTraceOptions(). See TraceOptions for details on the various options.
//
// Timeouts: a TraceOptions.Timeout of 0 selects the Tracer's default
// timeout. If the resulting timeout is not positive, no timeout is applied
// and Trace waits for the lock and for the tracer for as long as they take.
// A single timer, started before option setup, covers both the wait for the
// global trace lock and the tracer's execution.
//
// This is goroutine-safe; multiple goroutines may share the same Tracer and
// execute Trace() concurrently. Only one tracer will be allowed to run at
// any point; others will wait to acquire resources (locks) or timeout (if
// timeouts are not disabled). Trace execution will be rate-limited according
// to the GlobalConfig settings.
//
// This may also be called in a new goroutine via go Trace(...). In that case,
// ensure TraceOptions.CallerOnly is false (you will likely also want to set
// TraceOptions.Faulted to false); otherwise, only the newly spawned goroutine
// will be traced.
//
// Output of specific Tracer execution depends on the implementation; most
// Tracers will have options for specifying output paths.
//
// Trace returns a non-nil error if the lock could not be acquired in time,
// if the tracer timed out or failed to run, or if a synchronous upload
// failed. If the tracer cannot be killed after a timeout and
// GlobalConfig.PanicOnKillFailure is set, Trace panics with the kill error.
func Trace(t Tracer, e error, traceOptions *TraceOptions) (err error) {
	if traceOptions == nil {
		traceOptions = t.DefaultTraceOptions()
	}

	// Resolve the effective timeout; zero means "use the Tracer's default".
	to := traceOptions.Timeout
	if to == 0 {
		to = t.DefaultTraceOptions().Timeout
	}

	// We create the timer first to account for the work below, but
	// we won't wrap setup in a timeout as it's unlikely to be
	// a bottleneck.
	//
	// Without a positive timeout the channel stays nil. Receiving from a
	// nil channel blocks forever, so the selects below then simply wait
	// for the lock and for the tracer to exit. Arming the timer only for
	// positive durations also keeps a non-positive default from firing
	// immediately.
	var timeout <-chan time.Time
	if to > 0 {
		timeout = time.After(to)
		t.Logf(LogDebug, "Tracer timeout: %v\n", to)
	}

	// Options() returns a copy of the base options, so the additions made
	// for this invocation do not modify the Tracer's base set.
	options := t.Options()

	// If the caller has requested a trace with thread-specific options,
	// then add the relevant thread specifications to the options list.
	if traceOptions.CallerOnly || traceOptions.Faulted {
		// Pin the goroutine to its OS thread for the rest of the call so
		// the tid retrieved here keeps referring to the caller.
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()

		if tid, tidErr := gettid(); tidErr == nil {
			t.Logf(LogDebug, "Retrieved tid: %v\n", tid)

			if traceOptions.CallerOnly {
				options = t.AddThreadFilter(options, tid)
			}

			if traceOptions.Faulted {
				options = t.AddFaultedThread(options, tid)
			}
		} else {
			t.Logf(LogWarning, "Failed to retrieve tid: %v\n", tidErr)
		}
	}

	// Report caller's goid. It is parsed from the stack header, which
	// starts with "goroutine <id> "; 64 bytes are plenty for that prefix.
	var buf [64]byte
	n := runtime.Stack(buf[:], false)
	idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0]
	if goid, convErr := strconv.Atoi(idField); convErr == nil {
		t.Logf(LogDebug, "Retrieved goid: %v\n", goid)
		options = t.AddCallerGo(options, goid)
	} else {
		t.Logf(LogWarning, "Failed to retrieve goid: %v\n", convErr)
	}

	if e != nil {
		options = t.AddKV(options, "error", e.Error())
		if traceOptions.ErrClassification {
			options = t.AddClassifier(options,
				reflect.TypeOf(e).String())
		}
	}

	for _, c := range traceOptions.Classifications {
		options = t.AddClassifier(options, c)
	}

	// Snapshot the global settings once so the rest of the call does not
	// hold the configuration lock.
	state.m.RLock()
	kfPanic := state.c.PanicOnKillFailure
	rl := state.c.RateLimit
	synchronousPut := state.c.SynchronousPut
	state.m.RUnlock()

	select {
	case <-timeout:
		err = errors.New("Tracer lock acquisition timed out")
		t.Logf(LogError, "%v\n", err)
		return err
	case <-traceLock:
	}

	// We now hold the trace lock.
	// Allow another tracer to execute (i.e. by re-populating the
	// traceLock channel) as long as the current tracer has
	// exited. The release happens in its own goroutine because it first
	// sleeps for the rate-limit interval.
	defer func() {
		go traceUnlockRL(t, rl)
	}()

	// Buffered so the runner goroutine can always deliver its result and
	// exit, even if we have already given up on it after a timeout.
	done := make(chan tracerResult, 1)
	tracer := t.Finalize(options)

	if traceOptions.SpawnedGs != nil {
		traceOptions.SpawnedGs.Add(1)
	}

	go func() {
		if traceOptions.SpawnedGs != nil {
			defer traceOptions.SpawnedGs.Done()
		}

		t.Logf(LogDebug, "Starting tracer %v\n", tracer)

		var res tracerResult
		res.stdOut, res.err = tracer.Output()
		done <- res

		t.Logf(LogDebug, "Tracer finished execution\n")
	}()

	t.Logf(LogDebug, "Waiting for tracer completion...\n")

	var res tracerResult

	select {
	case <-timeout:
		err = errors.New("Tracer execution timed out")

		// tracer.Process is only populated once the runner goroutine has
		// started the command. The timer has been running since the top
		// of this function (including the lock wait), so it can fire
		// before that happens; it is also nil if the command failed to
		// start. Killing a nil process would dereference a nil pointer.
		//
		// NOTE(review): this read is not synchronized with the runner
		// goroutine, so a process started after this check is left
		// running. Starting the command synchronously would close that
		// gap.
		proc := tracer.Process
		if proc == nil {
			t.Logf(LogError, "%v; no tracer process to kill\n", err)
			return err
		}

		if killErr := proc.Kill(); killErr != nil {
			t.Logf(LogError,
				"Failed to kill tracer upon timeout: %v\n",
				killErr)

			if kfPanic {
				t.Logf(LogWarning,
					"PanicOnKillFailure set; "+
						"panicking\n")
				panic(killErr)
			}
		}

		t.Logf(LogError, "%v; process killed\n", err)
		return err
	case res = <-done:
	}

	// Tracer execution has completed by this point.
	if res.err != nil {
		t.Logf(LogError, "Tracer failed to run: %v\n",
			res.err)
		return res.err
	}

	if !t.PutOnTrace() {
		t.Logf(LogDebug, "Trace request complete\n")
		return nil
	}

	// putFn uploads the tracer's stdout and logs the outcome.
	putFn := func() error {
		t.Logf(LogDebug, "Uploading snapshot...")
		if putErr := t.Put(res.stdOut); putErr != nil {
			t.Logf(LogError, "Failed to upload snapshot: %s",
				putErr)
			return putErr
		}

		t.Logf(LogDebug, "Successfully uploaded snapshot\n")
		return nil
	}

	if synchronousPut {
		err = putFn()
	} else {
		t.Logf(LogDebug, "Starting asynchronous put...\n")

		if traceOptions.SpawnedGs != nil {
			traceOptions.SpawnedGs.Add(1)
		}

		go func() {
			if traceOptions.SpawnedGs != nil {
				defer traceOptions.SpawnedGs.Done()
			}

			// The upload's failure is already logged by putFn; an
			// asynchronous upload has no caller to return it to.
			_ = putFn()
		}()
	}

	t.Logf(LogDebug, "Trace request complete\n")

	return err
}
// traceUnlockRL releases the global trace lock after the rate-limit
// interval rl has elapsed, by returning the token to traceLock. Trace runs
// it in its own goroutine so that the caller is not held up by the delay.
func traceUnlockRL(t Tracer, rl time.Duration) {
	t.Logf(LogDebug, "Waiting for ratelimit (%v)\n", rl)
	time.Sleep(rl)

	t.Logf(LogDebug, "Unlocking traceLock\n")
	var token struct{}
	traceLock <- token
}
// panicError is a distinct error type used during panic recovery. Recover
// wraps a recovered value in it when that value does not itself implement
// error, so that it can be passed to Trace.
type panicError struct {
// v is the value returned by recover().
v interface{}
}
// Error implements the error interface by rendering the recovered value in
// its default format.
func (pe *panicError) Error() string {
	return fmt.Sprint(pe.v)
}
// Recover is a panic handler that executes the specified Tracer in response
// to a panic. If repanic is true, it panics again after Tracer execution
// completes, with the original value returned by recover().
//
// It does nothing when there is no panic in progress. It must be invoked
// directly by a defer statement (defer bt.Recover(...)), following Go's
// defer, panic, and recover pattern; see
// https://blog.golang.org/defer-panic-and-recover.
func Recover(t Tracer, repanic bool, options *TraceOptions) {
	r := recover()
	if r == nil {
		return
	}

	// We use the runtime type of the error object for classification
	// (and thus potential grouping). A recovered value that is already
	// an error is passed through as-is; anything else is wrapped, since
	// *bt.panicError is a more descriptive classifier than something
	// like *errors.errorString.
	var traceErr error
	if asErr, isErr := r.(error); isErr {
		traceErr = asErr
	} else {
		traceErr = &panicError{r}
	}

	// Trace logs its own failures; a deferred handler has no caller to
	// return the error to.
	_ = Trace(t, traceErr, options)

	if repanic {
		panic(r)
	}
}