forked from tidwall/buntdb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
buntdb.go
2370 lines (2225 loc) · 69 KB
/
buntdb.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Package buntdb implements a low-level in-memory key/value store in pure Go.
// It persists to disk, is ACID compliant, and uses locking for multiple
// readers and a single writer. Bunt is ideal for projects that need a
// dependable database, and favor speed over data size.
package buntdb
import (
"bufio"
"errors"
"io"
"os"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/tidwall/btree"
"github.com/tidwall/gjson"
"github.com/tidwall/grect"
"github.com/tidwall/match"
"github.com/tidwall/rtred"
)
var (
// ErrTxNotWritable is returned when performing a write operation on a
// read-only transaction.
ErrTxNotWritable = errors.New("tx not writable")
// ErrTxClosed is returned when committing or rolling back a transaction
// that has already been committed or rolled back.
ErrTxClosed = errors.New("tx closed")
// ErrNotFound is returned when an item or index is not in the database.
ErrNotFound = errors.New("not found")
// ErrInvalid is returned when the database file is an invalid format.
ErrInvalid = errors.New("invalid database")
// ErrDatabaseClosed is returned when the database is closed.
ErrDatabaseClosed = errors.New("database closed")
// ErrIndexExists is returned when an index already exists in the database.
ErrIndexExists = errors.New("index exists")
// ErrInvalidOperation is returned when an operation cannot be completed.
ErrInvalidOperation = errors.New("invalid operation")
// ErrInvalidSyncPolicy is returned for an invalid SyncPolicy value.
ErrInvalidSyncPolicy = errors.New("invalid sync policy")
// ErrShrinkInProcess is returned when a shrink operation is in-process.
ErrShrinkInProcess = errors.New("shrink is in-process")
// ErrPersistenceActive is returned when post-loading data from an database
// not opened with Open(":memory:").
ErrPersistenceActive = errors.New("persistence active")
// ErrTxIterating is returned when Set or Delete are called while iterating.
ErrTxIterating = errors.New("tx is iterating")
)
// DB represents a collection of key-value pairs that persist on disk.
// Transactions are used for all forms of data access to the DB.
type DB struct {
mu sync.RWMutex // the gatekeeper for all fields
file *os.File // the underlying file
buf []byte // a buffer to write to
keys *btree.BTree // a tree of all item ordered by key
exps *btree.BTree // a tree of items ordered by expiration
idxs map[string]*index // the index trees.
insIdxs []*index // a reuse buffer for gathering indexes
flushes int // a count of the number of disk flushes
closed bool // set when the database has been closed
config Config // the database configuration
persist bool // do we write to disk
shrinking bool // when an aof shrink is in-process.
lastaofsz int // the size of the last shrink aof size
}
// SyncPolicy represents how often data is synced to disk.
type SyncPolicy int
const (
// Never is used to disable syncing data to disk.
// The faster and less safe method.
Never SyncPolicy = 0
// EverySecond is used to sync data to disk every second.
// It's pretty fast and you can lose 1 second of data if there
// is a disaster.
// This is the recommended setting.
EverySecond = 1
// Always is used to sync data after every write to disk.
// Slow. Very safe.
Always = 2
)
// Config represents database configuration options. These
// options are used to change various behaviors of the database.
type Config struct {
// SyncPolicy adjusts how often the data is synced to disk.
// This value can be Never, EverySecond, or Always.
// The default is EverySecond.
SyncPolicy SyncPolicy
// AutoShrinkPercentage is used by the background process to trigger
// a shrink of the aof file when the size of the file is larger than the
// percentage of the result of the previous shrunk file.
// For example, if this value is 100, and the last shrink process
// resulted in a 100mb file, then the new aof file must be 200mb before
// a shrink is triggered.
AutoShrinkPercentage int
// AutoShrinkMinSize defines the minimum size of the aof file before
// an automatic shrink can occur.
AutoShrinkMinSize int
// AutoShrinkDisabled turns off automatic background shrinking
AutoShrinkDisabled bool
// OnExpired is used to custom handle the deletion option when a key
// has been expired.
OnExpired func(keys []string)
// OnExpiredSync will be called inside the same transaction that is
// performing the deletion of expired items. If OnExpired is present then
// this callback will not be called. If this callback is present, then the
// deletion of the timeed-out item is the explicit responsibility of this
// callback.
OnExpiredSync func(key, value string, tx *Tx) error
}
// exctx is a simple b-tree context for ordering by expiration.
type exctx struct {
db *DB
}
// Open opens a database at the provided path.
// If the file does not exist then it will be created automatically.
func Open(path string) (*DB, error) {
db := &DB{}
// initialize trees and indexes
db.keys = btreeNew(lessCtx(nil))
db.exps = btreeNew(lessCtx(&exctx{db}))
db.idxs = make(map[string]*index)
// initialize default configuration
db.config = Config{
SyncPolicy: EverySecond,
AutoShrinkPercentage: 100,
AutoShrinkMinSize: 32 * 1024 * 1024,
}
// turn off persistence for pure in-memory
db.persist = path != ":memory:"
if db.persist {
var err error
// hardcoding 0666 as the default mode.
db.file, err = os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0666)
if err != nil {
return nil, err
}
// load the database from disk
if err := db.load(); err != nil {
// close on error, ignore close error
_ = db.file.Close()
return nil, err
}
}
// start the background manager.
go db.backgroundManager()
return db, nil
}
// Close releases all database resources.
// All transactions must be closed before closing the database.
func (db *DB) Close() error {
db.mu.Lock()
defer db.mu.Unlock()
if db.closed {
return ErrDatabaseClosed
}
db.closed = true
if db.persist {
db.file.Sync() // do a sync but ignore the error
if err := db.file.Close(); err != nil {
return err
}
}
// Let's release all references to nil. This will help both with debugging
// late usage panics and it provides a hint to the garbage collector
db.keys, db.exps, db.idxs, db.file = nil, nil, nil, nil
return nil
}
// Save writes a snapshot of the database to a writer. This operation blocks all
// writes, but not reads. This can be used for snapshots and backups for pure
// in-memory databases using the ":memory:". Database that persist to disk
// can be snapshotted by simply copying the database file.
func (db *DB) Save(wr io.Writer) error {
var err error
db.mu.RLock()
defer db.mu.RUnlock()
// use a buffered writer and flush every 4MB
var buf []byte
now := time.Now()
// iterated through every item in the database and write to the buffer
btreeAscend(db.keys, func(item interface{}) bool {
dbi := item.(*dbItem)
buf = dbi.writeSetTo(buf, now)
if len(buf) > 1024*1024*4 {
// flush when buffer is over 4MB
_, err = wr.Write(buf)
if err != nil {
return false
}
buf = buf[:0]
}
return true
})
if err != nil {
return err
}
// one final flush
if len(buf) > 0 {
_, err = wr.Write(buf)
if err != nil {
return err
}
}
return nil
}
// Load loads commands from reader. This operation blocks all reads and writes.
// Note that this can only work for fully in-memory databases opened with
// Open(":memory:").
func (db *DB) Load(rd io.Reader) error {
db.mu.Lock()
defer db.mu.Unlock()
if db.persist {
// cannot load into databases that persist to disk
return ErrPersistenceActive
}
_, err := db.readLoad(rd, time.Now())
return err
}
// index represents a b-tree or r-tree index and also acts as the
// b-tree/r-tree context for itself.
type index struct {
btr *btree.BTree // contains the items
rtr *rtred.RTree // contains the items
name string // name of the index
pattern string // a required key pattern
less func(a, b string) bool // less comparison function
rect func(item string) (min, max []float64) // rect from string function
db *DB // the origin database
opts IndexOptions // index options
}
// match matches the pattern to the key
func (idx *index) match(key string) bool {
if idx.pattern == "*" {
return true
}
if idx.opts.CaseInsensitiveKeyMatching {
for i := 0; i < len(key); i++ {
if key[i] >= 'A' && key[i] <= 'Z' {
key = strings.ToLower(key)
break
}
}
}
return match.Match(key, idx.pattern)
}
// clearCopy creates a copy of the index, but with an empty dataset.
func (idx *index) clearCopy() *index {
// copy the index meta information
nidx := &index{
name: idx.name,
pattern: idx.pattern,
db: idx.db,
less: idx.less,
rect: idx.rect,
opts: idx.opts,
}
// initialize with empty trees
if nidx.less != nil {
nidx.btr = btreeNew(lessCtx(nidx))
}
if nidx.rect != nil {
nidx.rtr = rtred.New(nidx)
}
return nidx
}
// rebuild rebuilds the index
func (idx *index) rebuild() {
// initialize trees
if idx.less != nil {
idx.btr = btreeNew(lessCtx(idx))
}
if idx.rect != nil {
idx.rtr = rtred.New(idx)
}
// iterate through all keys and fill the index
btreeAscend(idx.db.keys, func(item interface{}) bool {
dbi := item.(*dbItem)
if !idx.match(dbi.key) {
// does not match the pattern, continue
return true
}
if idx.less != nil {
idx.btr.Set(dbi)
}
if idx.rect != nil {
idx.rtr.Insert(dbi)
}
return true
})
}
// CreateIndex builds a new index and populates it with items.
// The items are ordered in an b-tree and can be retrieved using the
// Ascend* and Descend* methods.
// An error will occur if an index with the same name already exists.
//
// When a pattern is provided, the index will be populated with
// keys that match the specified pattern. This is a very simple pattern
// match where '*' matches on any number characters and '?' matches on
// any one character.
// The less function compares if string 'a' is less than string 'b'.
// It allows for indexes to create custom ordering. It's possible
// that the strings may be textual or binary. It's up to the provided
// less function to handle the content format and comparison.
// There are some default less function that can be used such as
// IndexString, IndexBinary, etc.
func (db *DB) CreateIndex(name, pattern string,
less ...func(a, b string) bool) error {
return db.Update(func(tx *Tx) error {
return tx.CreateIndex(name, pattern, less...)
})
}
// ReplaceIndex builds a new index and populates it with items.
// The items are ordered in an b-tree and can be retrieved using the
// Ascend* and Descend* methods.
// If a previous index with the same name exists, that index will be deleted.
func (db *DB) ReplaceIndex(name, pattern string,
less ...func(a, b string) bool) error {
return db.Update(func(tx *Tx) error {
err := tx.CreateIndex(name, pattern, less...)
if err != nil {
if err == ErrIndexExists {
err := tx.DropIndex(name)
if err != nil {
return err
}
return tx.CreateIndex(name, pattern, less...)
}
return err
}
return nil
})
}
// CreateSpatialIndex builds a new index and populates it with items.
// The items are organized in an r-tree and can be retrieved using the
// Intersects method.
// An error will occur if an index with the same name already exists.
//
// The rect function converts a string to a rectangle. The rectangle is
// represented by two arrays, min and max. Both arrays may have a length
// between 1 and 20, and both arrays must match in length. A length of 1 is a
// one dimensional rectangle, and a length of 4 is a four dimension rectangle.
// There is support for up to 20 dimensions.
// The values of min must be less than the values of max at the same dimension.
// Thus min[0] must be less-than-or-equal-to max[0].
// The IndexRect is a default function that can be used for the rect
// parameter.
func (db *DB) CreateSpatialIndex(name, pattern string,
rect func(item string) (min, max []float64)) error {
return db.Update(func(tx *Tx) error {
return tx.CreateSpatialIndex(name, pattern, rect)
})
}
// ReplaceSpatialIndex builds a new index and populates it with items.
// The items are organized in an r-tree and can be retrieved using the
// Intersects method.
// If a previous index with the same name exists, that index will be deleted.
func (db *DB) ReplaceSpatialIndex(name, pattern string,
rect func(item string) (min, max []float64)) error {
return db.Update(func(tx *Tx) error {
err := tx.CreateSpatialIndex(name, pattern, rect)
if err != nil {
if err == ErrIndexExists {
err := tx.DropIndex(name)
if err != nil {
return err
}
return tx.CreateSpatialIndex(name, pattern, rect)
}
return err
}
return nil
})
}
// DropIndex removes an index.
func (db *DB) DropIndex(name string) error {
return db.Update(func(tx *Tx) error {
return tx.DropIndex(name)
})
}
// Indexes returns a list of index names.
func (db *DB) Indexes() ([]string, error) {
var names []string
var err = db.View(func(tx *Tx) error {
var err error
names, err = tx.Indexes()
return err
})
return names, err
}
// ReadConfig returns the database configuration.
func (db *DB) ReadConfig(config *Config) error {
db.mu.RLock()
defer db.mu.RUnlock()
if db.closed {
return ErrDatabaseClosed
}
*config = db.config
return nil
}
// SetConfig updates the database configuration.
func (db *DB) SetConfig(config Config) error {
db.mu.Lock()
defer db.mu.Unlock()
if db.closed {
return ErrDatabaseClosed
}
switch config.SyncPolicy {
default:
return ErrInvalidSyncPolicy
case Never, EverySecond, Always:
}
db.config = config
return nil
}
// insertIntoDatabase performs inserts an item in to the database and updates
// all indexes. If a previous item with the same key already exists, that item
// will be replaced with the new one, and return the previous item.
func (db *DB) insertIntoDatabase(item *dbItem) *dbItem {
var pdbi *dbItem
// Generate a list of indexes that this item will be inserted in to.
idxs := db.insIdxs
for _, idx := range db.idxs {
if idx.match(item.key) {
idxs = append(idxs, idx)
}
}
prev := db.keys.Set(item)
if prev != nil {
// A previous item was removed from the keys tree. Let's
// fully delete this item from all indexes.
pdbi = prev.(*dbItem)
if pdbi.opts != nil && pdbi.opts.ex {
// Remove it from the expires tree.
db.exps.Delete(pdbi)
}
for _, idx := range idxs {
if idx.btr != nil {
// Remove it from the btree index.
idx.btr.Delete(pdbi)
}
if idx.rtr != nil {
// Remove it from the rtree index.
idx.rtr.Remove(pdbi)
}
}
}
if item.opts != nil && item.opts.ex {
// The new item has eviction options. Add it to the
// expires tree
db.exps.Set(item)
}
for i, idx := range idxs {
if idx.btr != nil {
// Add new item to btree index.
idx.btr.Set(item)
}
if idx.rtr != nil {
// Add new item to rtree index.
idx.rtr.Insert(item)
}
// clear the index
idxs[i] = nil
}
// reuse the index list slice
db.insIdxs = idxs[:0]
// we must return the previous item to the caller.
return pdbi
}
// deleteFromDatabase removes and item from the database and indexes. The input
// item must only have the key field specified thus "&dbItem{key: key}" is all
// that is needed to fully remove the item with the matching key. If an item
// with the matching key was found in the database, it will be removed and
// returned to the caller. A nil return value means that the item was not
// found in the database
func (db *DB) deleteFromDatabase(item *dbItem) *dbItem {
var pdbi *dbItem
prev := db.keys.Delete(item)
if prev != nil {
pdbi = prev.(*dbItem)
if pdbi.opts != nil && pdbi.opts.ex {
// Remove it from the exipres tree.
db.exps.Delete(pdbi)
}
for _, idx := range db.idxs {
if !idx.match(pdbi.key) {
continue
}
if idx.btr != nil {
// Remove it from the btree index.
idx.btr.Delete(pdbi)
}
if idx.rtr != nil {
// Remove it from the rtree index.
idx.rtr.Remove(pdbi)
}
}
}
return pdbi
}
// backgroundManager runs continuously in the background and performs various
// operations such as removing expired items and syncing to disk.
func (db *DB) backgroundManager() {
flushes := 0
t := time.NewTicker(time.Second)
defer t.Stop()
for range t.C {
var shrink bool
// Open a standard view. This will take a full lock of the
// database thus allowing for access to anything we need.
var onExpired func([]string)
var expired []*dbItem
var onExpiredSync func(key, value string, tx *Tx) error
err := db.Update(func(tx *Tx) error {
onExpired = db.config.OnExpired
if onExpired == nil {
onExpiredSync = db.config.OnExpiredSync
}
if db.persist && !db.config.AutoShrinkDisabled {
pos, err := db.file.Seek(0, 1)
if err != nil {
return err
}
aofsz := int(pos)
if aofsz > db.config.AutoShrinkMinSize {
prc := float64(db.config.AutoShrinkPercentage) / 100.0
shrink = aofsz > db.lastaofsz+int(float64(db.lastaofsz)*prc)
}
}
// produce a list of expired items that need removing
btreeAscendLessThan(db.exps, &dbItem{
opts: &dbItemOpts{ex: true, exat: time.Now()},
}, func(item interface{}) bool {
expired = append(expired, item.(*dbItem))
return true
})
if onExpired == nil && onExpiredSync == nil {
for _, itm := range expired {
if _, err := tx.Delete(itm.key); err != nil {
// it's ok to get a "not found" because the
// 'Delete' method reports "not found" for
// expired items.
if err != ErrNotFound {
return err
}
}
}
} else if onExpiredSync != nil {
for _, itm := range expired {
if err := onExpiredSync(itm.key, itm.val, tx); err != nil {
return err
}
}
}
return nil
})
if err == ErrDatabaseClosed {
break
}
// send expired event, if needed
if onExpired != nil && len(expired) > 0 {
keys := make([]string, 0, 32)
for _, itm := range expired {
keys = append(keys, itm.key)
}
onExpired(keys)
}
// execute a disk sync, if needed
func() {
db.mu.Lock()
defer db.mu.Unlock()
if db.persist && db.config.SyncPolicy == EverySecond &&
flushes != db.flushes {
_ = db.file.Sync()
flushes = db.flushes
}
}()
if shrink {
if err = db.Shrink(); err != nil {
if err == ErrDatabaseClosed {
break
}
}
}
}
}
// Shrink will make the database file smaller by removing redundant
// log entries. This operation does not block the database.
func (db *DB) Shrink() error {
db.mu.Lock()
if db.closed {
db.mu.Unlock()
return ErrDatabaseClosed
}
if !db.persist {
// The database was opened with ":memory:" as the path.
// There is no persistence, and no need to do anything here.
db.mu.Unlock()
return nil
}
if db.shrinking {
// The database is already in the process of shrinking.
db.mu.Unlock()
return ErrShrinkInProcess
}
db.shrinking = true
defer func() {
db.mu.Lock()
db.shrinking = false
db.mu.Unlock()
}()
fname := db.file.Name()
tmpname := fname + ".tmp"
// the endpos is used to return to the end of the file when we are
// finished writing all of the current items.
endpos, err := db.file.Seek(0, 2)
if err != nil {
return err
}
db.mu.Unlock()
time.Sleep(time.Second / 4) // wait just a bit before starting
f, err := os.Create(tmpname)
if err != nil {
return err
}
defer func() {
_ = f.Close()
_ = os.RemoveAll(tmpname)
}()
// we are going to read items in as chunks as to not hold up the database
// for too long.
var buf []byte
pivot := ""
done := false
for !done {
err := func() error {
db.mu.RLock()
defer db.mu.RUnlock()
if db.closed {
return ErrDatabaseClosed
}
done = true
var n int
now := time.Now()
btreeAscendGreaterOrEqual(db.keys, &dbItem{key: pivot},
func(item interface{}) bool {
dbi := item.(*dbItem)
// 1000 items or 64MB buffer
if n > 1000 || len(buf) > 64*1024*1024 {
pivot = dbi.key
done = false
return false
}
buf = dbi.writeSetTo(buf, now)
n++
return true
},
)
if len(buf) > 0 {
if _, err := f.Write(buf); err != nil {
return err
}
buf = buf[:0]
}
return nil
}()
if err != nil {
return err
}
}
// We reached this far so all of the items have been written to a new tmp
// There's some more work to do by appending the new line from the aof
// to the tmp file and finally swap the files out.
return func() error {
// We're wrapping this in a function to get the benefit of a defered
// lock/unlock.
db.mu.Lock()
defer db.mu.Unlock()
if db.closed {
return ErrDatabaseClosed
}
// We are going to open a new version of the aof file so that we do
// not change the seek position of the previous. This may cause a
// problem in the future if we choose to use syscall file locking.
aof, err := os.Open(fname)
if err != nil {
return err
}
defer func() { _ = aof.Close() }()
if _, err := aof.Seek(endpos, 0); err != nil {
return err
}
// Just copy all of the new commands that have occurred since we
// started the shrink process.
if _, err := io.Copy(f, aof); err != nil {
return err
}
// Close all files
if err := aof.Close(); err != nil {
return err
}
if err := f.Close(); err != nil {
return err
}
if err := db.file.Close(); err != nil {
return err
}
// Any failures below here is really bad. So just panic.
if err := os.Rename(tmpname, fname); err != nil {
panic(err)
}
db.file, err = os.OpenFile(fname, os.O_CREATE|os.O_RDWR, 0666)
if err != nil {
panic(err)
}
pos, err := db.file.Seek(0, 2)
if err != nil {
return err
}
db.lastaofsz = int(pos)
return nil
}()
}
// readLoad reads from the reader and loads commands into the database.
// modTime is the modified time of the reader, should be no greater than
// the current time.Now().
// Returns the number of bytes of the last command read and the error if any.
func (db *DB) readLoad(rd io.Reader, modTime time.Time) (n int64, err error) {
defer func() {
if err == io.EOF {
err = io.ErrUnexpectedEOF
}
}()
totalSize := int64(0)
data := make([]byte, 4096)
parts := make([]string, 0, 8)
r := bufio.NewReader(rd)
for {
// peek at the first byte. If it's a 'nul' control character then
// ignore it and move to the next byte.
c, err := r.ReadByte()
if err != nil {
if err == io.EOF {
err = nil
}
return totalSize, err
}
if c == 0 {
// ignore nul control characters
n += 1
continue
}
if err := r.UnreadByte(); err != nil {
return totalSize, err
}
// read a single command.
// first we should read the number of parts that the of the command
cmdByteSize := int64(0)
line, err := r.ReadBytes('\n')
if err != nil {
return totalSize, err
}
if line[0] != '*' {
return totalSize, ErrInvalid
}
cmdByteSize += int64(len(line))
// convert the string number to and int
var n int
if len(line) == 4 && line[len(line)-2] == '\r' {
if line[1] < '0' || line[1] > '9' {
return totalSize, ErrInvalid
}
n = int(line[1] - '0')
} else {
if len(line) < 5 || line[len(line)-2] != '\r' {
return totalSize, ErrInvalid
}
for i := 1; i < len(line)-2; i++ {
if line[i] < '0' || line[i] > '9' {
return totalSize, ErrInvalid
}
n = n*10 + int(line[i]-'0')
}
}
// read each part of the command.
parts = parts[:0]
for i := 0; i < n; i++ {
// read the number of bytes of the part.
line, err := r.ReadBytes('\n')
if err != nil {
return totalSize, err
}
if line[0] != '$' {
return totalSize, ErrInvalid
}
cmdByteSize += int64(len(line))
// convert the string number to and int
var n int
if len(line) == 4 && line[len(line)-2] == '\r' {
if line[1] < '0' || line[1] > '9' {
return totalSize, ErrInvalid
}
n = int(line[1] - '0')
} else {
if len(line) < 5 || line[len(line)-2] != '\r' {
return totalSize, ErrInvalid
}
for i := 1; i < len(line)-2; i++ {
if line[i] < '0' || line[i] > '9' {
return totalSize, ErrInvalid
}
n = n*10 + int(line[i]-'0')
}
}
// resize the read buffer
if len(data) < n+2 {
dataln := len(data)
for dataln < n+2 {
dataln *= 2
}
data = make([]byte, dataln)
}
if _, err = io.ReadFull(r, data[:n+2]); err != nil {
return totalSize, err
}
if data[n] != '\r' || data[n+1] != '\n' {
return totalSize, ErrInvalid
}
// copy string
parts = append(parts, string(data[:n]))
cmdByteSize += int64(n + 2)
}
// finished reading the command
if len(parts) == 0 {
continue
}
if (parts[0][0] == 's' || parts[0][0] == 'S') &&
(parts[0][1] == 'e' || parts[0][1] == 'E') &&
(parts[0][2] == 't' || parts[0][2] == 'T') {
// SET
if len(parts) < 3 || len(parts) == 4 || len(parts) > 5 {
return totalSize, ErrInvalid
}
if len(parts) == 5 {
if strings.ToLower(parts[3]) != "ex" {
return totalSize, ErrInvalid
}
ex, err := strconv.ParseUint(parts[4], 10, 64)
if err != nil {
return totalSize, err
}
now := time.Now()
dur := (time.Duration(ex) * time.Second) - now.Sub(modTime)
if dur > 0 {
db.insertIntoDatabase(&dbItem{
key: parts[1],
val: parts[2],
opts: &dbItemOpts{
ex: true,
exat: now.Add(dur),
},
})
}
} else {
db.insertIntoDatabase(&dbItem{key: parts[1], val: parts[2]})
}
} else if (parts[0][0] == 'd' || parts[0][0] == 'D') &&
(parts[0][1] == 'e' || parts[0][1] == 'E') &&
(parts[0][2] == 'l' || parts[0][2] == 'L') {
// DEL
if len(parts) != 2 {
return totalSize, ErrInvalid
}
db.deleteFromDatabase(&dbItem{key: parts[1]})
} else if (parts[0][0] == 'f' || parts[0][0] == 'F') &&
strings.ToLower(parts[0]) == "flushdb" {
db.keys = btreeNew(lessCtx(nil))
db.exps = btreeNew(lessCtx(&exctx{db}))
db.idxs = make(map[string]*index)
} else {
return totalSize, ErrInvalid
}
totalSize += cmdByteSize
}
}
// load reads entries from the append only database file and fills the database.
// The file format uses the Redis append only file format, which is and a series
// of RESP commands. For more information on RESP please read
// http://redis.io/topics/protocol. The only supported RESP commands are DEL and
// SET.
func (db *DB) load() error {
fi, err := db.file.Stat()
if err != nil {
return err
}
n, err := db.readLoad(db.file, fi.ModTime())
if err != nil {
if err == io.ErrUnexpectedEOF {
// The db file has ended mid-command, which is allowed but the
// data file should be truncated to the end of the last valid
// command
if err := db.file.Truncate(n); err != nil {
return err
}
} else {
return err
}
}
if _, err := db.file.Seek(n, 0); err != nil {
return err
}
var estaofsz int
db.keys.Walk(func(items []interface{}) {
for _, v := range items {
estaofsz += v.(*dbItem).estAOFSetSize()
}
})
db.lastaofsz += estaofsz
return nil
}
// managed calls a block of code that is fully contained in a transaction.
// This method is intended to be wrapped by Update and View
func (db *DB) managed(writable bool, fn func(tx *Tx) error) (err error) {
var tx *Tx
tx, err = db.Begin(writable)
if err != nil {
return
}
defer func() {
if err != nil {
// The caller returned an error. We must rollback.
_ = tx.Rollback()
return
}
if writable {
// Everything went well. Lets Commit()
err = tx.Commit()
} else {
// read-only transaction can only roll back.
err = tx.Rollback()
}
}()
tx.funcd = true
defer func() {
tx.funcd = false
}()
err = fn(tx)
return
}