Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADBDEV-6339 Implement files tracking for arenadata_toolkit #1079

Open
wants to merge 69 commits into
base: adb-6.x-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 44 commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
d349006
Initial solution
bimboterminator1 Oct 15, 2024
87b02dc
fix tests
bimboterminator1 Oct 15, 2024
3e87652
trying to fix tests
bimboterminator1 Oct 15, 2024
9e64dc8
attempt to fix tests
bimboterminator1 Oct 15, 2024
c5f0e11
Add shared memory check
bimboterminator1 Oct 17, 2024
855b5ae
Make bgworker run on segments
bimboterminator1 Oct 20, 2024
7c9a6c9
elog to ereport and minor changes
bimboterminator1 Oct 20, 2024
37cd981
Change order in PG_FINI
bimboterminator1 Oct 20, 2024
6023a63
GUC refactoring
bimboterminator1 Oct 20, 2024
d25e299
Restore dbsize file to original identation
bimboterminator1 Oct 20, 2024
6b01018
ternary operator
bimboterminator1 Oct 20, 2024
58406f3
fix assignment expression
bimboterminator1 Oct 20, 2024
f308583
Locks logic refactoring + light refactoring of bloom_set and bloom
bimboterminator1 Oct 21, 2024
d5fdfc4
Calculate factual timeout in bgworker loop
bimboterminator1 Oct 22, 2024
76849ef
add atomics
bimboterminator1 Oct 23, 2024
96ac880
Soft size calculation
bimboterminator1 Oct 24, 2024
1ab4187
remove &s
bimboterminator1 Oct 24, 2024
36cfd5a
formatting
bimboterminator1 Oct 24, 2024
b32426c
Rework worker
bimboterminator1 Oct 27, 2024
f62b609
More smart hashing strategy
bimboterminator1 Oct 27, 2024
d13acdc
use uint8 in bloom
bimboterminator1 Oct 27, 2024
7847bab
Rework drops track
bimboterminator1 Oct 27, 2024
ed9e300
Rework initialization approach
bimboterminator1 Oct 28, 2024
0cdd431
fix segfault
bimboterminator1 Oct 28, 2024
d10812b
Move to bits
bimboterminator1 Oct 28, 2024
f7c8c6d
A bunch of comments
bimboterminator1 Oct 28, 2024
e827cd6
Remove hard code
bimboterminator1 Oct 28, 2024
42e1b2d
Comments and pgindent
bimboterminator1 Oct 28, 2024
bea4ca9
Add Readme
bimboterminator1 Oct 28, 2024
d7793a5
Fix tests
bimboterminator1 Oct 28, 2024
b9e94f0
Remove size adjustment
bimboterminator1 Oct 29, 2024
72d6071
Set gucs
bimboterminator1 Oct 29, 2024
1305ba5
Remove shared_state_lock
bimboterminator1 Oct 29, 2024
919f7cd
Make bloom_set singleton
bimboterminator1 Oct 30, 2024
af8c73b
dbid inline function
bimboterminator1 Oct 30, 2024
f4e252c
Move init_locks under if
bimboterminator1 Nov 3, 2024
b694b96
Use versioning
bimboterminator1 Nov 7, 2024
0a8cf7e
Exclude explain mode
bimboterminator1 Nov 8, 2024
8f763a4
Smiplify
bimboterminator1 Nov 8, 2024
8fcb1e6
new header file
bimboterminator1 Nov 8, 2024
83a5c23
remove db reconnection in tests
bimboterminator1 Nov 12, 2024
55ecda2
Refactor drops track
bimboterminator1 Nov 12, 2024
c8139d4
Fix locks for set entries
bimboterminator1 Nov 12, 2024
9980cc9
Add isolation tests
bimboterminator1 Nov 12, 2024
627fdec
Improve tests
bimboterminator1 Nov 13, 2024
857c245
Remove error_flag
bimboterminator1 Nov 13, 2024
a694039
Fix explain case
bimboterminator1 Nov 13, 2024
d38a461
Take the error back in
bimboterminator1 Nov 13, 2024
8c39794
Typos
bimboterminator1 Nov 14, 2024
cab839b
return typo
bimboterminator1 Nov 14, 2024
9483c06
Remove unnecessary functions and change priveleges
bimboterminator1 Nov 17, 2024
f740832
Fix control version
bimboterminator1 Nov 18, 2024
d56241c
Change worker
bimboterminator1 Nov 18, 2024
d3b3e48
Refactor dbsize
bimboterminator1 Nov 18, 2024
4a6fc1b
Simplify gucs
bimboterminator1 Nov 18, 2024
2c87274
track_files.c changes
bimboterminator1 Nov 18, 2024
3f73780
remove if clause
bimboterminator1 Nov 19, 2024
2fb8ff4
Refactor
bimboterminator1 Nov 19, 2024
6dfaeca
readme
bimboterminator1 Nov 19, 2024
05a0a24
int64
bimboterminator1 Nov 20, 2024
bfd00a8
isolation2 tests in main installcheck
bimboterminator1 Nov 20, 2024
1d7a325
fix typos
bimboterminator1 Nov 20, 2024
881c81e
Minor changes
bimboterminator1 Nov 21, 2024
510d87e
Fix size calculation of AO tabe size.
bimboterminator1 Nov 25, 2024
f215113
Return false on any error
bimboterminator1 Nov 25, 2024
4eb6e14
Change default parameters handling
bimboterminator1 Nov 29, 2024
4767309
Improve validation and emit warning
bimboterminator1 Nov 29, 2024
64df2b5
Typo
bimboterminator1 Nov 29, 2024
b6fec63
Use default settings at track acquisition
bimboterminator1 Nov 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions gpcontrib/arenadata_toolkit/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
MODULES = arenadata_toolkit

EXTENSION = arenadata_toolkit
EXTENSION_VERSION = 1.6
EXTENSION_VERSION = 1.7
DATA = \
arenadata_toolkit--1.0.sql \
arenadata_toolkit--1.0--1.1.sql \
Expand All @@ -12,15 +12,29 @@ DATA = \
arenadata_toolkit--1.3--1.4.sql \
arenadata_toolkit--1.4--1.5.sql \
arenadata_toolkit--1.5--1.6.sql \
arenadata_toolkit--1.6--1.7.sql \

DATA_built = $(EXTENSION)--$(EXTENSION_VERSION).sql

$(DATA_built): $(DATA)
cat $(DATA) > $(DATA_built)
MODULE_big = arenadata_toolkit
OBJS = \
src/arenadata_toolkit_guc.o \
src/bloom.o \
src/bloom_set.o \
src/drops_track.o \
src/file_hook.o \
src/tf_shmem.o \
src/arenadata_toolkit.o \
src/arenadata_toolkit_worker.o \
src/track_files.o \
src/dbsize.o \

PG_CFLAGS = -I$(libpq_srcdir) -I$(CURDIR)/src/include

REGRESS = arenadata_toolkit_test arenadata_toolkit_skew_test adb_get_relfilenodes_test \
adb_collect_table_stats_test adb_vacuum_strategy_test adb_relation_storage_size_test \
tablespace_location upgrade_test adb_hba_file_rules_view_test
tablespace_location upgrade_test adb_hba_file_rules_view_test \
arenadata_toolkit_guc arenadata_toolkit_tracking
REGRESS_OPTS += --init-file=$(top_srcdir)/src/test/regress/init_file

ifdef USE_PGXS
Expand All @@ -33,3 +47,6 @@ top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

$(DATA_built): $(DATA)
cat $(DATA) > $(DATA_built)
59 changes: 59 additions & 0 deletions gpcontrib/arenadata_toolkit/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
## arenadata_toolkit - database objects tracking extension for GPDB

Starting from version 1.7, `arenadata_toolkit` is a GPDB extension that efficiently tracks file system changes (extend, truncate, create, unlink operations) using space-efficient Bloom filters stored in shared memory. This extension is particularly useful for monitoring and maintaining database file sizes across a distributed environment.

The main purpose of this code is to achieve fast database size calculation and to track file changes at the relation
level. The extension implements a probabilistic tracking system using Bloom filters to monitor file changes across Greenplum segments. It utilizes shared memory for state management and employs background workers to maintain consistency.

#### Configuring GPDB and extension usage
Since the extension uses shared memory, the configuration on all GPDB segments must be changed by setting
```shell script
gpconfig -c shared_preload_libraries -v 'arenadata_toolkit'
```
The extension can track a limited number of databases. The maximum number is defined by the GUC
| arenadata_toolkit.tracking_db_track_count | Need restart |Possible values [1, 1000]; Default 5|
|--|--|--|
A Bloom filter is allocated in shared memory for each tracked database. The size of each filter is controlled via
| arenadata_toolkit.tracking_bloom_size | Need restart |Possible values (bytes) [64, 128000000] Default 1048576|
|--|--|--|
A specific database can be bound to an unoccupied filter with the function
KnightMurloc marked this conversation as resolved.
Show resolved Hide resolved
```shell script
psql -d my_db -c "select arenadata_toolkit.tracking_register_db()"
# or
psql -c "select arenadata_toolkit.tracking_register_db(12345)"
```
After registration, each relation file change within the database is recorded in the Bloom filter.
Using Bloom filter allows us to calculate the sizes of only relations whose relfilenode is present in the filter.
The current size snapshot can be taken via view:
```
select * from arenadata_toolkit.tables_track;
```
To get a snapshot of all relations in a database, call the following in the database of interest:
```
select arenadata_toolkit.tracking_trigger_initial_snapshot();
```

#### Choosing optimal Bloom size

Choosing the optimal Bloom filter size is crucial for balancing memory usage and accuracy.
First of all, when choosing the filter size, you should take into account your system resources: Bloom filters are allocated in shared memory on each segment, and overly large structures (tracking_db_track_count * tracking_bloom_size) could decrease overall performance.

Next, choose the filter size satisfying your performance goals:
- Define the false positive tolerance, p. Since a Bloom filter is a probabilistic data structure, there is some probability that the size of a relation which has not been modified will be recalculated. The smaller the filter is, the more often this occurs.
- Memory constraints
- Query patterns: if queries are mostly reads, then large filter sizes are unnecessary.

If you can estimate the number of objects in your database, you can calculate the theoretical filter size:
$$m = -\frac{n \ln p}{(\ln 2)^2}$$
- n = estimated number of elements
- p = target false positive rate
- m = filter size in bits (divide by 8 to get the value in bytes expected by `arenadata_toolkit.tracking_bloom_size`)

Quick Reference Table

| Deployment Size | Files | Target FPR | Recommended Size, bytes|
|----------------|------------|------------|------------------------|
| Small | < 100K | 1% | 1048576 |
| Medium | 100K - 1M | 1% | 8388608 |
| Large | > 1M | 1% | 33554432 |
| Enterprise | > 10M | 1% | 134217728 |
127 changes: 127 additions & 0 deletions gpcontrib/arenadata_toolkit/arenadata_toolkit--1.6--1.7.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/* gpcontrib/arenadata_toolkit/arenadata_toolkit--1.6--1.7.sql */

-- tracking_register_db(dbid): C-language function bound to the
-- 'tracking_register_db' symbol of the arenadata_toolkit library and
-- dispatched on the master only.  dbid defaults to 0
-- (presumably "use the current database" -- TODO confirm in the C code).
CREATE FUNCTION arenadata_toolkit.tracking_register_db(dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_register_db'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_register_db(dbid OID)
    FROM PUBLIC;

-- tracking_register_db_main(reg, dbid): C-language function bound to the
-- 'tracking_register_db_main' symbol.  Declared without an EXECUTE ON clause,
-- so the default execution location applies.  dbid defaults to 0.
-- NOTE(review): the meaning of the "reg" flag is not visible here; presumably
-- it selects register vs. unregister -- confirm in the C implementation.
CREATE FUNCTION arenadata_toolkit.tracking_register_db_main(reg BOOL, dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_register_db_main'
    LANGUAGE C;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_register_db_main(reg BOOL, dbid OID)
    FROM PUBLIC;

-- tracking_unregister_db(dbid): C-language function bound to the
-- 'tracking_unregister_db' symbol; dispatched on the master only.
-- dbid defaults to 0.
CREATE FUNCTION arenadata_toolkit.tracking_unregister_db(dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_unregister_db'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_unregister_db(dbid OID)
    FROM PUBLIC;

-- tracking_register_schema(schemaname, dbid): C-language function bound to the
-- 'tracking_register_schema' symbol; dispatched on the master only.
-- dbid defaults to 0.
CREATE FUNCTION arenadata_toolkit.tracking_register_schema(schemaname NAME, dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_register_schema'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
-- The signature repeats the parameter name used in CREATE FUNCTION above
-- (argument names are ignored when the function is looked up; only the
-- argument types matter, so this is purely for readability).
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_register_schema(schemaname NAME, dbid OID)
    FROM PUBLIC;

-- tracking_unregister_schema(schema, dbid): C-language function bound to the
-- 'tracking_unregister_schema' symbol; dispatched on the master only.
-- dbid defaults to 0.
CREATE FUNCTION arenadata_toolkit.tracking_unregister_schema(schema NAME, dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_unregister_schema'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_unregister_schema(schema NAME, dbid OID)
    FROM PUBLIC;

-- tracking_set_relkinds(relkinds, dbid): C-language function bound to the
-- 'tracking_set_relkinds' symbol; dispatched on the master only.
-- dbid defaults to 0.
-- NOTE(review): the expected format of "relkinds" is not visible in this
-- script -- see the C implementation.
CREATE FUNCTION arenadata_toolkit.tracking_set_relkinds(relkinds NAME, dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_set_relkinds'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_set_relkinds(relkinds NAME, dbid OID)
    FROM PUBLIC;

-- tracking_set_relstorages(relstorages, dbid): C-language function bound to
-- the 'tracking_set_relstorages' symbol; dispatched on the master only.
-- dbid defaults to 0.
-- NOTE(review): the expected format of "relstorages" is not visible in this
-- script -- see the C implementation.
CREATE FUNCTION arenadata_toolkit.tracking_set_relstorages(relstorages NAME, dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_set_relstorages'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_set_relstorages(relstorages NAME, dbid OID)
    FROM PUBLIC;

-- tracking_set_snapshot_on_recovery(val, dbid): C-language function bound to
-- the 'tracking_set_snapshot_on_recovery' symbol; dispatched on the master
-- only.  dbid defaults to 0.
CREATE FUNCTION arenadata_toolkit.tracking_set_snapshot_on_recovery(val BOOL, dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_set_snapshot_on_recovery'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_set_snapshot_on_recovery(val BOOL, dbid OID)
    FROM PUBLIC;

-- tracking_trigger_initial_snapshot(dbid): C-language function bound to the
-- 'tracking_trigger_initial_snapshot' symbol.  dbid defaults to 0.
-- NOTE(review): unlike most functions in this script there is neither an
-- EXECUTE ON clause nor a REVOKE ... FROM PUBLIC for this one -- confirm
-- that this is intentional.
CREATE FUNCTION arenadata_toolkit.tracking_trigger_initial_snapshot(dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_trigger_initial_snapshot'
    LANGUAGE C;

-- tracking_is_initial_snapshot_triggered(dbid): C-language function bound to
-- the 'tracking_is_initial_snapshot_triggered' symbol, declared without an
-- EXECUTE ON clause.  dbid defaults to 0.
CREATE FUNCTION arenadata_toolkit.tracking_is_initial_snapshot_triggered(dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_is_initial_snapshot_triggered'
    LANGUAGE C;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_is_initial_snapshot_triggered(dbid OID)
    FROM PUBLIC;

-- Master-only wrapper: bound to the same C symbol
-- ('tracking_is_initial_snapshot_triggered') as the plain variant, but
-- declared EXECUTE ON MASTER.  dbid defaults to 0.
CREATE FUNCTION arenadata_toolkit.tracking_is_initial_snapshot_triggered_master(dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_is_initial_snapshot_triggered'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Segment-side wrapper: bound to the same C symbol
-- ('tracking_is_initial_snapshot_triggered') as the plain variant, but
-- declared EXECUTE ON ALL SEGMENTS.  dbid defaults to 0.
CREATE FUNCTION arenadata_toolkit.tracking_is_initial_snapshot_triggered_segments(dbid OID DEFAULT 0)
    RETURNS BOOL
    AS '$libdir/arenadata_toolkit', 'tracking_is_initial_snapshot_triggered'
    LANGUAGE C
    EXECUTE ON ALL SEGMENTS;

-- tracking_is_segment_initialized(): C-language set-returning function bound
-- to the 'tracking_is_segment_initialized' symbol.  Each row carries a
-- segment index and a boolean "is_initialized" flag.
CREATE FUNCTION arenadata_toolkit.tracking_is_segment_initialized()
    RETURNS TABLE(
        segindex       INT,
        is_initialized BOOL
    )
    AS '$libdir/arenadata_toolkit', 'tracking_is_segment_initialized'
    LANGUAGE C;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_is_segment_initialized()
    FROM PUBLIC;

-- tracking_track_version(): C-language function bound to the
-- 'tracking_track_version' symbol.  Declared STABLE and dispatched on the
-- master only; returns a BIGINT that the tracking_get_track_* functions
-- accept as their "version" argument (see the tables_track view).
CREATE FUNCTION arenadata_toolkit.tracking_track_version()
    RETURNS BIGINT
    AS '$libdir/arenadata_toolkit', 'tracking_track_version'
    LANGUAGE C
    STABLE
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_track_version()
    FROM PUBLIC;

-- tracking_get_track_master(version): master-only binding of the
-- 'tracking_get_track' C symbol.  Returns one row per reported relation with
-- the columns listed below.
CREATE FUNCTION arenadata_toolkit.tracking_get_track_master(version BIGINT)
    RETURNS TABLE(
        relid        OID,
        relname      NAME,
        relfilenode  OID,
        size         BIGINT,
        state        "char",
        segid        INT,
        relnamespace OID,
        relkind      "char",
        relstorage   "char"
    )
    AS '$libdir/arenadata_toolkit', 'tracking_get_track'
    LANGUAGE C
    EXECUTE ON MASTER;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_get_track_master(version BIGINT)
    FROM PUBLIC;

-- tracking_get_track_segments(version): binding of the 'tracking_get_track'
-- C symbol that is dispatched on all segments.  Same result columns as
-- tracking_get_track_master.
CREATE FUNCTION arenadata_toolkit.tracking_get_track_segments(version BIGINT)
    RETURNS TABLE(
        relid        OID,
        relname      NAME,
        relfilenode  OID,
        size         BIGINT,
        state        "char",
        segid        INT,
        relnamespace OID,
        relkind      "char",
        relstorage   "char"
    )
    AS '$libdir/arenadata_toolkit', 'tracking_get_track'
    LANGUAGE C
    EXECUTE ON ALL SEGMENTS;

-- Remove the privileges PUBLIC receives by default on a new function.
REVOKE ALL
    ON FUNCTION arenadata_toolkit.tracking_get_track_segments(version BIGINT)
    FROM PUBLIC;

-- tables_track: UNION ALL of the rows reported by the master
-- (tracking_get_track_master) and by the segments
-- (tracking_get_track_segments), both queried with the value returned by
-- tracking_track_version().
--
-- parent_relid is looked up for auxiliary relations, keyed on relkind:
--   't' -> pg_class.oid           (matched via reltoastrelid)
--   'i' -> pg_index.indrelid      (matched via indexrelid)
--   'M' -> pg_appendonly.relid    (matched via visimaprelid)
--   'b' -> pg_appendonly.relid    (matched via blkdirrelid)
--   'o' -> pg_appendonly.relid    (matched via segrelid)
-- It is NULL when none of the joins matches.
CREATE VIEW arenadata_toolkit.tables_track AS
    -- rows reported by the master
    SELECT t.*,
           COALESCE(c.oid, i.indrelid, vm.relid, blk.relid, seg.relid) AS parent_relid
      FROM arenadata_toolkit.tracking_get_track_master(
               arenadata_toolkit.tracking_track_version()) AS t
      LEFT JOIN pg_class AS c
             ON c.reltoastrelid = t.relid AND t.relkind = 't'
      LEFT JOIN pg_index AS i
             ON i.indexrelid = t.relid AND t.relkind = 'i'
      LEFT JOIN pg_catalog.pg_appendonly AS vm
             ON vm.visimaprelid = t.relid AND t.relkind = 'M'
      LEFT JOIN pg_catalog.pg_appendonly AS blk
             ON blk.blkdirrelid = t.relid AND t.relkind = 'b'
      LEFT JOIN pg_catalog.pg_appendonly AS seg
             ON seg.segrelid = t.relid AND t.relkind = 'o'
    UNION ALL
    -- rows reported by the segments
    SELECT t.*,
           COALESCE(c.oid, i.indrelid, vm.relid, blk.relid, seg.relid) AS parent_relid
      FROM arenadata_toolkit.tracking_get_track_segments(
               arenadata_toolkit.tracking_track_version()) AS t
      LEFT JOIN pg_class AS c
             ON c.reltoastrelid = t.relid AND t.relkind = 't'
      LEFT JOIN pg_index AS i
             ON i.indexrelid = t.relid AND t.relkind = 'i'
      LEFT JOIN pg_catalog.pg_appendonly AS vm
             ON vm.visimaprelid = t.relid AND t.relkind = 'M'
      LEFT JOIN pg_catalog.pg_appendonly AS blk
             ON blk.blkdirrelid = t.relid AND t.relkind = 'b'
      LEFT JOIN pg_catalog.pg_appendonly AS seg
             ON seg.segrelid = t.relid AND t.relkind = 'o';

-- is_initial_snapshot_triggered: single-row view whose is_triggered column is
-- 1 when every row returned by the segments variant is TRUE and the master
-- variant returns TRUE as well; otherwise it is NULL.
CREATE VIEW arenadata_toolkit.is_initial_snapshot_triggered AS
    SELECT CASE
               WHEN TRUE = ALL (SELECT arenadata_toolkit.tracking_is_initial_snapshot_triggered_segments())
                    AND arenadata_toolkit.tracking_is_initial_snapshot_triggered_master()
               THEN 1
               ELSE NULL
           END AS is_triggered;
2 changes: 1 addition & 1 deletion gpcontrib/arenadata_toolkit/arenadata_toolkit.control
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# arenadata_toolkit extension
comment = 'extension is used for manipulation of objects created by adb-bundle'
default_version = '1.6'
default_version = '1.7'
module_pathname = '$libdir/arenadata_toolkit'
relocatable = false
Loading