From 85456168443602cc07be8a1a993c8cf313b19ec2 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Sat, 3 Aug 2024 18:20:47 +0100 Subject: [PATCH 1/6] Minimize pg_tde changes This commit just removed old no longer needed comments, and fixes a few inconsistent naming issues so the diff between upstream and our code is minimal (after running the sed script to process upstream code) --- src/access/pg_tde_prune.c | 13 +++-- src/access/pg_tde_rewrite.c | 4 +- src/access/pg_tde_vacuumlazy.c | 10 ++-- src/access/pg_tdeam.c | 62 ++++++++--------------- src/access/pg_tdetoast.c | 3 +- src/include/access/pg_tde_visibilitymap.h | 6 +-- src/include/access/pg_tdeam.h | 4 +- src/include/access/pg_tdeam_xlog.h | 14 ++--- src/include/access/pg_tdetoast.h | 1 - 9 files changed, 45 insertions(+), 72 deletions(-) diff --git a/src/access/pg_tde_prune.c b/src/access/pg_tde_prune.c index ff8ed4cf..552151c5 100644 --- a/src/access/pg_tde_prune.c +++ b/src/access/pg_tde_prune.c @@ -127,7 +127,6 @@ tdeheap_page_prune_opt(Relation relation, Buffer buffer) if (RecoveryInProgress()) return; -#if PG_VERSION_NUM < 170000 /* * XXX: Magic to keep old_snapshot_threshold tests appear "working". They * currently are broken, and discussion of what to do about them is @@ -136,7 +135,7 @@ tdeheap_page_prune_opt(Relation relation, Buffer buffer) */ if (old_snapshot_threshold == 0) SnapshotTooOldMagicForTest(); -#endif + /* * First check whether there's any chance there's something to prune, * determining the appropriate horizon is a waste if there's no prune_xid @@ -167,14 +166,14 @@ tdeheap_page_prune_opt(Relation relation, Buffer buffer) if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) { -#if PG_VERSION_NUM < 170000 - if ( !OldSnapshotThresholdActive()) + if (!OldSnapshotThresholdActive()) return; + if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest), relation, &limited_xmin, &limited_ts)) return; -#endif + if (!TransactionIdPrecedes(prune_xid, limited_xmin)) return; } @@ -540,7 +539,6 @@ tdeheap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer */ if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after)) res = HEAPTUPLE_DEAD; -#if PG_VERSION_NUM < 170000 else if (OldSnapshotThresholdActive()) { /* haven't determined limited horizon yet, requests */ @@ -568,7 +566,7 @@ tdeheap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer res = HEAPTUPLE_DEAD; } } -#endif + return res; } @@ -1228,6 +1226,7 @@ tdeheap_get_root_tuples(Page page, OffsetNumber *root_offsets) } } +// TODO: move to own file so it can be autoupdated // FROM src/page/bufpage.c /* diff --git a/src/access/pg_tde_rewrite.c b/src/access/pg_tde_rewrite.c index 2ad8bcca..964082a0 100644 --- a/src/access/pg_tde_rewrite.c +++ b/src/access/pg_tde_rewrite.c @@ -676,7 +676,7 @@ raw_tdeheap_insert(RewriteState state, HeapTuple tup) { /* * Doesn't fit, so write out the existing page. It always - * contains a tuple. Hence, unlike RelationGetBufferForTuple(), + * contains a tuple. Hence, unlike tdeheap_RelationGetBufferForTuple(), * enforce saveFreeSpace unconditionally. 
*/ @@ -731,8 +731,6 @@ raw_tdeheap_insert(RewriteState state, HeapTuple tup) newitemid = PageGetItemId(page, newoff); onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid); - // TODO: decrypt/encrypt - onpage_tup->t_ctid = tup->t_self; } diff --git a/src/access/pg_tde_vacuumlazy.c b/src/access/pg_tde_vacuumlazy.c index 994a5e95..8a3f49ef 100644 --- a/src/access/pg_tde_vacuumlazy.c +++ b/src/access/pg_tde_vacuumlazy.c @@ -1421,7 +1421,7 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, * (which creates a number of empty pages at the tail end of the * relation), and then enters them into the FSM. * - * Note we do not enter the page into the tdeheap_visibilitymap. That has the + * Note we do not enter the page into the visibilitymap. That has the * downside that we repeatedly visit this page in subsequent vacuums, * but otherwise we'll never discover the space on a promoted standby. * The harm of repeated checking ought to normally not be too bad. The @@ -2014,7 +2014,6 @@ lazy_scan_noprune(LVRelState *vacrel, *hastup = true; /* page prevents rel truncation */ tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); - // TODO: decrypt if (tdeheap_tuple_should_freeze(tupleheader, &vacrel->cutoffs, &NoFreezePageRelfrozenXid, &NoFreezePageRelminMxid)) @@ -2822,11 +2821,8 @@ should_attempt_truncation(LVRelState *vacrel) { BlockNumber possibly_freeable; - if (!vacrel->do_rel_truncate || VacuumFailsafeActive -#if PG_VERSION_NUM < 170000 - || old_snapshot_threshold >= 0 -#endif - ) + if (!vacrel->do_rel_truncate || VacuumFailsafeActive || + old_snapshot_threshold >= 0) return false; possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; diff --git a/src/access/pg_tdeam.c b/src/access/pg_tdeam.c index 71ba5782..e4d1267a 100644 --- a/src/access/pg_tdeam.c +++ b/src/access/pg_tdeam.c @@ -122,6 +122,7 @@ static XLogRecPtr log_tdeheap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy); + /* * Each tuple lock mode has a corresponding heavyweight lock, and one or two * corresponding MultiXactStatuses (one to merely lock tuples, another one to @@ -431,9 +432,7 @@ tdeheapgetpage(TableScanDesc sscan, BlockNumber block) LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); -#if PG_VERSION_NUM < 170000 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, page); -#endif lines = PageGetMaxOffsetNumber(page); ntup = 0; @@ -572,9 +571,9 @@ tdeheapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, /* Caller is responsible for ensuring buffer is locked if needed */ page = BufferGetPage(scan->rs_cbuf); -#if PG_VERSION_NUM < 170000 + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); -#endif + *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1; if (ScanDirectionIsForward(dir)) @@ -605,9 +604,9 @@ tdeheapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft /* Caller is responsible for ensuring buffer is locked if needed */ page = BufferGetPage(scan->rs_cbuf); -#if PG_VERSION_NUM < 170000 + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); -#endif + if (ScanDirectionIsForward(dir)) { *lineoff = OffsetNumberNext(scan->rs_coffset); @@ -655,17 +654,6 @@ tdeheapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection if (block >= scan->rs_nblocks) block = 0; - /* we're done if we're back at where we started */ - if (block == scan->rs_startblock) - return 
InvalidBlockNumber; - - /* check if the limit imposed by tdeheap_setscanlimits() is met */ - if (scan->rs_numblocks != InvalidBlockNumber) - { - if (--scan->rs_numblocks == 0) - return InvalidBlockNumber; - } - /* * Report our new scan position for synchronization purposes. We * don't do that when moving backwards, however. That would just @@ -681,6 +669,17 @@ tdeheapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection if (scan->rs_base.rs_flags & SO_ALLOW_SYNC) ss_report_location(scan->rs_base.rs_rd, block); + /* we're done if we're back at where we started */ + if (block == scan->rs_startblock) + return InvalidBlockNumber; + + /* check if the limit imposed by tdeheap_setscanlimits() is met */ + if (scan->rs_numblocks != InvalidBlockNumber) + { + if (--scan->rs_numblocks == 0) + return InvalidBlockNumber; + } + return block; } else @@ -872,9 +871,8 @@ tdeheapgettup_pagemode(HeapScanDesc scan, /* continue from previously returned page/tuple */ block = scan->rs_cblock; /* current page */ page = BufferGetPage(scan->rs_cbuf); -#if PG_VERSION_NUM < 170000 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); -#endif + lineindex = scan->rs_cindex + dir; if (ScanDirectionIsForward(dir)) linesleft = scan->rs_ntuples - lineindex; @@ -893,9 +891,7 @@ tdeheapgettup_pagemode(HeapScanDesc scan, { tdeheapgetpage((TableScanDesc) scan, block); page = BufferGetPage(scan->rs_cbuf); -#if PG_VERSION_NUM < 170000 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); -#endif linesleft = scan->rs_ntuples; lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1; @@ -1385,9 +1381,8 @@ tdeheap_fetch(Relation relation, */ LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); -#if PG_VERSION_NUM < 170000 TestForOldSnapshot(snapshot, relation, page); -#endif + /* * We'd better check for out-of-range offnum in case of VACUUM since the * TID was obtained. @@ -1677,9 +1672,8 @@ tdeheap_get_latest_tid(TableScanDesc sscan, buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); -#if PG_VERSION_NUM < 170000 TestForOldSnapshot(snapshot, relation, page); -#endif + /* * Check for bogus item number. This is not treated as an error * condition because it can happen while following a t_ctid link. We @@ -1905,7 +1899,7 @@ tdeheap_insert(Relation relation, HeapTuple tup, CommandId cid, /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); - tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, + tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, (options & HEAP_INSERT_TDE_NO_ENCRYPT) == 0, (options & HEAP_INSERT_SPECULATIVE) != 0); @@ -2425,7 +2419,7 @@ tdeheap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (all_frozen_set) { Assert(PageIsAllVisible(page)); - Assert (tdeheap_visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); + Assert(tdeheap_visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); /* * It's fine to use InvalidTransactionId here - this is only used @@ -5998,7 +5992,6 @@ tdeheap_inplace_update(Relation relation, HeapTuple tuple) elog(ERROR, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); - // encryption / decryption here: HOW? 
oldlen = ItemIdGetLength(lp) - htup->t_hoff; newlen = tuple->t_len - tuple->t_data->t_hoff; @@ -6781,7 +6774,6 @@ tdeheap_freeze_execute_prepared(Relation rel, Buffer buffer, /* Deliberately avoid relying on tuple hint bits here */ if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED) { - // TODO: how to keep compiling both? TransactionId xmin = HeapTupleHeaderGetRawXmin(htup); Assert(!HeapTupleHeaderXminFrozen(htup)); @@ -6819,7 +6811,6 @@ tdeheap_freeze_execute_prepared(Relation rel, Buffer buffer, HeapTupleHeader htup; htup = (HeapTupleHeader) PageGetItem(page, itemid); - // TODO: Decryption/encryption here tdeheap_execute_freeze_tuple(htup, frz); } @@ -7696,7 +7687,6 @@ index_delete_check_htid(TM_IndexDeleteOp *delstate, Assert(ItemIdIsNormal(iid)); htup = (HeapTupleHeader) PageGetItem(page, iid); - // TODO: Decryption/encryption here if (unlikely(HeapTupleHeaderIsHeapOnly(htup))) ereport(ERROR, @@ -7984,7 +7974,6 @@ tdeheap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) break; htup = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here /* * Check the tuple XMIN against prior XMAX, if any @@ -9194,7 +9183,6 @@ tdeheap_xlog_freeze_page(XLogReaderState *record) lp = PageGetItemId(page, offset); tuple = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here tdeheap_execute_freeze_tuple(tuple, &frz); } } @@ -9276,7 +9264,6 @@ tdeheap_xlog_delete(XLogReaderState *record) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; @@ -9658,7 +9645,6 @@ tdeheap_xlog_update(XLogReaderState *record, bool hot_update) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here oldtup.t_data = htup; oldtup.t_len = ItemIdGetLength(lp); @@ -9870,7 +9856,6 @@ tdeheap_xlog_confirm(XLogReaderState *record) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here /* * Confirm tuple as actually inserted @@ -9928,7 +9913,6 @@ tdeheap_xlog_lock(XLogReaderState *record) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; @@ -10002,7 +9986,6 @@ tdeheap_xlog_lock_updated(XLogReaderState *record) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; @@ -10044,7 +10027,6 @@ tdeheap_xlog_inplace(XLogReaderState *record) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); - // TODO: Decryption/encryption here oldlen = ItemIdGetLength(lp) - htup->t_hoff; if (oldlen != newlen) @@ -10106,7 +10088,7 @@ tdeheap_redo(XLogReaderState *record) } void -pg_tde2_redo(XLogReaderState *record) +heapam2_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; diff --git a/src/access/pg_tdetoast.c b/src/access/pg_tdetoast.c index 171d6af8..6b4d45d5 100644 --- a/src/access/pg_tdetoast.c +++ b/src/access/pg_tdetoast.c @@ -706,7 +706,6 @@ tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, /* Prepare for scan */ init_toast_snapshot(&SnapshotToast); - toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], &SnapshotToast, 
nscankeys, toastkey); @@ -840,7 +839,7 @@ tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, systable_endscan_ordered(toastscan); toast_close_indexes(toastidxs, num_indexes, AccessShareLock); } - +// TODO: these should be in their own file so we can proplerly autoupdate them /* pg_tde extension */ static void tdeheap_toast_encrypt(Pointer dval, Oid valueid, RelKeyData *key) diff --git a/src/include/access/pg_tde_visibilitymap.h b/src/include/access/pg_tde_visibilitymap.h index 516ecb21..0b8213f0 100644 --- a/src/include/access/pg_tde_visibilitymap.h +++ b/src/include/access/pg_tde_visibilitymap.h @@ -20,11 +20,11 @@ #include "storage/buf.h" #include "utils/relcache.h" -/* Macros for tdeheap_visibilitymap test */ +/* Macros for visibilitymap test */ #define VM_ALL_VISIBLE(r, b, v) \ - ( (tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_VISIBLE) != 0) + ((tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_VISIBLE) != 0) #define VM_ALL_FROZEN(r, b, v) \ - ( (tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0) + ((tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0) extern bool tdeheap_visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags); diff --git a/src/include/access/pg_tdeam.h b/src/include/access/pg_tdeam.h index b85deffb..b982c8ff 100644 --- a/src/include/access/pg_tdeam.h +++ b/src/include/access/pg_tdeam.h @@ -302,7 +302,7 @@ struct VacuumParams; extern void tdeheap_vacuum_rel(Relation rel, struct VacuumParams *params, BufferAccessStrategy bstrategy); -/* in heap/heapam_visibility.c */ +/* in heap/pg_tdeam_visibility.c */ extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer); extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, @@ -319,7 +319,7 @@ extern bool HeapTupleIsSurelyDead(HeapTuple htup, /* * To avoid leaking too much knowledge about reorderbuffer implementation - * details this is implemented in reorderbuffer.c not heapam_visibility.c + * details this is implemented in reorderbuffer.c not pg_tdeam_visibility.c */ struct HTAB; extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, diff --git a/src/include/access/pg_tdeam_xlog.h b/src/include/access/pg_tdeam_xlog.h index 89e17dec..9f07212c 100644 --- a/src/include/access/pg_tdeam_xlog.h +++ b/src/include/access/pg_tdeam_xlog.h @@ -254,7 +254,7 @@ typedef struct xl_tdeheap_prune /* * The vacuum page record is similar to the prune record, but can only mark - * already LP_DEAD items LP_UNUSED (during VACUUM's second pg_tde pass) + * already LP_DEAD items LP_UNUSED (during VACUUM's second heap pass) * * Acquires an ordinary exclusive lock only. */ @@ -317,7 +317,7 @@ typedef struct xl_tdeheap_inplace /* * This struct represents a 'freeze plan', which describes how to freeze a - * group of one or more pg_tde tuples (appears in xl_tdeheap_freeze_page record) + * group of one or more heap tuples (appears in xl_tdeheap_freeze_page record) */ /* 0x01 was XLH_FREEZE_XMIN */ #define XLH_FREEZE_XVAC 0x02 @@ -340,7 +340,7 @@ typedef struct xl_tdeheap_freeze_plan * Backup block 0's data contains an array of xl_tdeheap_freeze_plan structs * (with nplans elements), followed by one or more page offset number arrays. * Each such page offset number array corresponds to a single freeze plan - * (REDO routine freezes corresponding pg_tde tuples using freeze plan). 
+ * (REDO routine freezes corresponding heap tuples using freeze plan). */ typedef struct xl_tdeheap_freeze_page { @@ -360,7 +360,7 @@ typedef struct xl_tdeheap_freeze_page * This is what we need to know about setting a visibility map bit * * Backup blk 0: visibility map buffer - * Backup blk 1: pg_tde buffer + * Backup blk 1: heap buffer */ typedef struct xl_tdeheap_visible { @@ -408,9 +408,9 @@ extern void tdeheap_redo(XLogReaderState *record); extern void tdeheap_desc(StringInfo buf, XLogReaderState *record); extern const char *tdeheap_identify(uint8 info); extern void tdeheap_mask(char *pagedata, BlockNumber blkno); -extern void pg_tde2_redo(XLogReaderState *record); -extern void pg_tde2_desc(StringInfo buf, XLogReaderState *record); -extern const char *heap2_identify(uint8 info); +extern void tdeheap2_redo(XLogReaderState *record); +extern void tdeheap2_desc(StringInfo buf, XLogReaderState *record); +extern const char *tdeheap2_identify(uint8 info); extern void tdeheap_xlog_logical_rewrite(XLogReaderState *r); extern XLogRecPtr log_tdeheap_visible(Relation rel, Buffer tdeheap_buffer, diff --git a/src/include/access/pg_tdetoast.h b/src/include/access/pg_tdetoast.h index 8761c168..c17a7816 100644 --- a/src/include/access/pg_tdetoast.h +++ b/src/include/access/pg_tdetoast.h @@ -13,7 +13,6 @@ #ifndef PG_TDE_TOAST_H #define PG_TDE_TOAST_H - #include "access/htup_details.h" #include "storage/lockdefs.h" #include "utils/relcache.h" From 26d1bf23a5eaf2c4576c4b40a9aede32670e1e70 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Sat, 3 Aug 2024 19:36:00 +0100 Subject: [PATCH 2/6] Added new src16 folder WITHOUT tde patches. --- src16/COMMIT | 1 + src16/access/pg_tde_io.c | 886 ++ src16/access/pg_tde_prune.c | 1213 +++ src16/access/pg_tde_rewrite.c | 1287 +++ src16/access/pg_tde_vacuumlazy.c | 3472 ++++++ src16/access/pg_tde_visibilitymap.c | 647 ++ src16/access/pg_tdeam.c | 10247 ++++++++++++++++++ src16/access/pg_tdeam_handler.c | 2615 +++++ src16/access/pg_tdeam_visibility.c | 1790 +++ src16/access/pg_tdetoast.c | 793 ++ src16/include/access/pg_tde_io.h | 62 + src16/include/access/pg_tde_rewrite.h | 57 + src16/include/access/pg_tde_visibilitymap.h | 42 + src16/include/access/pg_tdeam.h | 332 + src16/include/access/pg_tdeam_xlog.h | 421 + src16/include/access/pg_tdetoast.h | 149 + 16 files changed, 24014 insertions(+) create mode 100644 src16/COMMIT create mode 100644 src16/access/pg_tde_io.c create mode 100644 src16/access/pg_tde_prune.c create mode 100644 src16/access/pg_tde_rewrite.c create mode 100644 src16/access/pg_tde_vacuumlazy.c create mode 100644 src16/access/pg_tde_visibilitymap.c create mode 100644 src16/access/pg_tdeam.c create mode 100644 src16/access/pg_tdeam_handler.c create mode 100644 src16/access/pg_tdeam_visibility.c create mode 100644 src16/access/pg_tdetoast.c create mode 100644 src16/include/access/pg_tde_io.h create mode 100644 src16/include/access/pg_tde_rewrite.h create mode 100644 src16/include/access/pg_tde_visibilitymap.h create mode 100644 src16/include/access/pg_tdeam.h create mode 100644 src16/include/access/pg_tdeam_xlog.h create mode 100644 src16/include/access/pg_tdetoast.h diff --git a/src16/COMMIT b/src16/COMMIT new file mode 100644 index 00000000..090b64cf --- /dev/null +++ b/src16/COMMIT @@ -0,0 +1 @@ +f199436c12819d2c01b72eaa6429de0ca5838471 diff --git a/src16/access/pg_tde_io.c b/src16/access/pg_tde_io.c new file mode 100644 index 00000000..125804d9 --- /dev/null +++ b/src16/access/pg_tde_io.c @@ -0,0 +1,886 @@ 
+/*------------------------------------------------------------------------- + * + * hio.c + * POSTGRES heap access method input/output code. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/hio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/hio.h" +#include "access/htup_details.h" +#include "access/visibilitymap.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" + + +/* + * tdeheap_RelationPutHeapTuple - place tuple at specified page + * + * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! Must PANIC on failure!!! + * + * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer. + */ +void +tdeheap_RelationPutHeapTuple(Relation relation, + Buffer buffer, + HeapTuple tuple, + bool token) +{ + Page pageHeader; + OffsetNumber offnum; + + /* + * A tuple that's being inserted speculatively should already have its + * token set. + */ + Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data)); + + /* + * Do not allow tuples with invalid combinations of hint bits to be placed + * on a page. This combination is detected as corruption by the + * contrib/amcheck logic, so if you disable this assertion, make + * corresponding changes there. + */ + Assert(!((tuple->t_data->t_infomask & HEAP_XMAX_COMMITTED) && + (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI))); + + /* Add the tuple to the page */ + pageHeader = BufferGetPage(buffer); + + offnum = PageAddItem(pageHeader, (Item) tuple->t_data, + tuple->t_len, InvalidOffsetNumber, false, true); + + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple to page"); + + /* Update tuple->t_self to the actual position where it was stored */ + ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum); + + /* + * Insert the correct position into CTID of the stored tuple, too (unless + * this is a speculative insertion, in which case the token is held in + * CTID field instead) + */ + if (!token) + { + ItemId itemId = PageGetItemId(pageHeader, offnum); + HeapTupleHeader item = (HeapTupleHeader) PageGetItem(pageHeader, itemId); + + item->t_ctid = tuple->t_self; + } +} + +/* + * Read in a buffer in mode, using bulk-insert strategy if bistate isn't NULL. + */ +static Buffer +ReadBufferBI(Relation relation, BlockNumber targetBlock, + ReadBufferMode mode, BulkInsertState bistate) +{ + Buffer buffer; + + /* If not bulk-insert, exactly like ReadBuffer */ + if (!bistate) + return ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock, + mode, NULL); + + /* If we have the desired block already pinned, re-pin and return it */ + if (bistate->current_buf != InvalidBuffer) + { + if (BufferGetBlockNumber(bistate->current_buf) == targetBlock) + { + /* + * Currently the LOCK variants are only used for extending + * relation, which should never reach this branch. + */ + Assert(mode != RBM_ZERO_AND_LOCK && + mode != RBM_ZERO_AND_CLEANUP_LOCK); + + IncrBufferRefCount(bistate->current_buf); + return bistate->current_buf; + } + /* ... 
else drop the old buffer */ + ReleaseBuffer(bistate->current_buf); + bistate->current_buf = InvalidBuffer; + } + + /* Perform a read using the buffer strategy */ + buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock, + mode, bistate->strategy); + + /* Save the selected block as target for future inserts */ + IncrBufferRefCount(buffer); + bistate->current_buf = buffer; + + return buffer; +} + +/* + * For each heap page which is all-visible, acquire a pin on the appropriate + * visibility map page, if we haven't already got one. + * + * To avoid complexity in the callers, either buffer1 or buffer2 may be + * InvalidBuffer if only one buffer is involved. For the same reason, block2 + * may be smaller than block1. + * + * Returns whether buffer locks were temporarily released. + */ +static bool +GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2, + BlockNumber block1, BlockNumber block2, + Buffer *vmbuffer1, Buffer *vmbuffer2) +{ + bool need_to_pin_buffer1; + bool need_to_pin_buffer2; + bool released_locks = false; + + /* + * Swap buffers around to handle case of a single block/buffer, and to + * handle if lock ordering rules require to lock block2 first. + */ + if (!BufferIsValid(buffer1) || + (BufferIsValid(buffer2) && block1 > block2)) + { + Buffer tmpbuf = buffer1; + Buffer *tmpvmbuf = vmbuffer1; + BlockNumber tmpblock = block1; + + buffer1 = buffer2; + vmbuffer1 = vmbuffer2; + block1 = block2; + + buffer2 = tmpbuf; + vmbuffer2 = tmpvmbuf; + block2 = tmpblock; + } + + Assert(BufferIsValid(buffer1)); + Assert(buffer2 == InvalidBuffer || block1 <= block2); + + while (1) + { + /* Figure out which pins we need but don't have. */ + need_to_pin_buffer1 = PageIsAllVisible(BufferGetPage(buffer1)) + && !tdeheap_visibilitymap_pin_ok(block1, *vmbuffer1); + need_to_pin_buffer2 = buffer2 != InvalidBuffer + && PageIsAllVisible(BufferGetPage(buffer2)) + && !tdeheap_visibilitymap_pin_ok(block2, *vmbuffer2); + if (!need_to_pin_buffer1 && !need_to_pin_buffer2) + break; + + /* We must unlock both buffers before doing any I/O. */ + released_locks = true; + LockBuffer(buffer1, BUFFER_LOCK_UNLOCK); + if (buffer2 != InvalidBuffer && buffer2 != buffer1) + LockBuffer(buffer2, BUFFER_LOCK_UNLOCK); + + /* Get pins. */ + if (need_to_pin_buffer1) + tdeheap_visibilitymap_pin(relation, block1, vmbuffer1); + if (need_to_pin_buffer2) + tdeheap_visibilitymap_pin(relation, block2, vmbuffer2); + + /* Relock buffers. */ + LockBuffer(buffer1, BUFFER_LOCK_EXCLUSIVE); + if (buffer2 != InvalidBuffer && buffer2 != buffer1) + LockBuffer(buffer2, BUFFER_LOCK_EXCLUSIVE); + + /* + * If there are two buffers involved and we pinned just one of them, + * it's possible that the second one became all-visible while we were + * busy pinning the first one. If it looks like that's a possible + * scenario, we'll need to make a second pass through this loop. + */ + if (buffer2 == InvalidBuffer || buffer1 == buffer2 + || (need_to_pin_buffer1 && need_to_pin_buffer2)) + break; + } + + return released_locks; +} + +/* + * Extend the relation. By multiple pages, if beneficial. + * + * If the caller needs multiple pages (num_pages > 1), we always try to extend + * by at least that much. + * + * If there is contention on the extension lock, we don't just extend "for + * ourselves", but we try to help others. We can do so by adding empty pages + * into the FSM. Typically there is no contention when we can't use the FSM. 
+ * + * We do have to limit the number of pages to extend by to some value, as the + * buffers for all the extended pages need to, temporarily, be pinned. For now + * we define MAX_BUFFERS_TO_EXTEND_BY to be 64 buffers, it's hard to see + * benefits with higher numbers. This partially is because copyfrom.c's + * MAX_BUFFERED_TUPLES / MAX_BUFFERED_BYTES prevents larger multi_inserts. + * + * Returns a buffer for a newly extended block. If possible, the buffer is + * returned exclusively locked. *did_unlock is set to true if the lock had to + * be released, false otherwise. + * + * + * XXX: It would likely be beneficial for some workloads to extend more + * aggressively, e.g. using a heuristic based on the relation size. + */ +static Buffer +RelationAddBlocks(Relation relation, BulkInsertState bistate, + int num_pages, bool use_fsm, bool *did_unlock) +{ +#define MAX_BUFFERS_TO_EXTEND_BY 64 + Buffer victim_buffers[MAX_BUFFERS_TO_EXTEND_BY]; + BlockNumber first_block = InvalidBlockNumber; + BlockNumber last_block = InvalidBlockNumber; + uint32 extend_by_pages; + uint32 not_in_fsm_pages; + Buffer buffer; + Page page; + + /* + * Determine by how many pages to try to extend by. + */ + if (bistate == NULL && !use_fsm) + { + /* + * If we have neither bistate, nor can use the FSM, we can't bulk + * extend - there'd be no way to find the additional pages. + */ + extend_by_pages = 1; + } + else + { + uint32 waitcount; + + /* + * Try to extend at least by the number of pages the caller needs. We + * can remember the additional pages (either via FSM or bistate). + */ + extend_by_pages = num_pages; + + if (!RELATION_IS_LOCAL(relation)) + waitcount = RelationExtensionLockWaiterCount(relation); + else + waitcount = 0; + + /* + * Multiply the number of pages to extend by the number of waiters. Do + * this even if we're not using the FSM, as it still relieves + * contention, by deferring the next time this backend needs to + * extend. In that case the extended pages will be found via + * bistate->next_free. + */ + extend_by_pages += extend_by_pages * waitcount; + + /* --- + * If we previously extended using the same bistate, it's very likely + * we'll extend some more. Try to extend by as many pages as + * before. This can be important for performance for several reasons, + * including: + * + * - It prevents mdzeroextend() switching between extending the + * relation in different ways, which is inefficient for some + * filesystems. + * + * - Contention is often intermittent. Even if we currently don't see + * other waiters (see above), extending by larger amounts can + * prevent future contention. + * --- + */ + if (bistate) + extend_by_pages = Max(extend_by_pages, bistate->already_extended_by); + + /* + * Can't extend by more than MAX_BUFFERS_TO_EXTEND_BY, we need to pin + * them all concurrently. + */ + extend_by_pages = Min(extend_by_pages, MAX_BUFFERS_TO_EXTEND_BY); + } + + /* + * How many of the extended pages should be entered into the FSM? + * + * If we have a bistate, only enter pages that we don't need ourselves + * into the FSM. Otherwise every other backend will immediately try to + * use the pages this backend needs for itself, causing unnecessary + * contention. If we don't have a bistate, we can't avoid the FSM. + * + * Never enter the page returned into the FSM, we'll immediately use it. 
+ */ + if (num_pages > 1 && bistate == NULL) + not_in_fsm_pages = 1; + else + not_in_fsm_pages = num_pages; + + /* prepare to put another buffer into the bistate */ + if (bistate && bistate->current_buf != InvalidBuffer) + { + ReleaseBuffer(bistate->current_buf); + bistate->current_buf = InvalidBuffer; + } + + /* + * Extend the relation. We ask for the first returned page to be locked, + * so that we are sure that nobody has inserted into the page + * concurrently. + * + * With the current MAX_BUFFERS_TO_EXTEND_BY there's no danger of + * [auto]vacuum trying to truncate later pages as REL_TRUNCATE_MINIMUM is + * way larger. + */ + first_block = ExtendBufferedRelBy(BMR_REL(relation), MAIN_FORKNUM, + bistate ? bistate->strategy : NULL, + EB_LOCK_FIRST, + extend_by_pages, + victim_buffers, + &extend_by_pages); + buffer = victim_buffers[0]; /* the buffer the function will return */ + last_block = first_block + (extend_by_pages - 1); + Assert(first_block == BufferGetBlockNumber(buffer)); + + /* + * Relation is now extended. Initialize the page. We do this here, before + * potentially releasing the lock on the page, because it allows us to + * double check that the page contents are empty (this should never + * happen, but if it does we don't want to risk wiping out valid data). + */ + page = BufferGetPage(buffer); + if (!PageIsNew(page)) + elog(ERROR, "page %u of relation \"%s\" should be empty but is not", + first_block, + RelationGetRelationName(relation)); + + PageInit(page, BufferGetPageSize(buffer), 0); + MarkBufferDirty(buffer); + + /* + * If we decided to put pages into the FSM, release the buffer lock (but + * not pin), we don't want to do IO while holding a buffer lock. This will + * necessitate a bit more extensive checking in our caller. + */ + if (use_fsm && not_in_fsm_pages < extend_by_pages) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + *did_unlock = true; + } + else + *did_unlock = false; + + /* + * Relation is now extended. Release pins on all buffers, except for the + * first (which we'll return). If we decided to put pages into the FSM, + * we can do that as part of the same loop. + */ + for (uint32 i = 1; i < extend_by_pages; i++) + { + BlockNumber curBlock = first_block + i; + + Assert(curBlock == BufferGetBlockNumber(victim_buffers[i])); + Assert(BlockNumberIsValid(curBlock)); + + ReleaseBuffer(victim_buffers[i]); + + if (use_fsm && i >= not_in_fsm_pages) + { + Size freespace = BufferGetPageSize(victim_buffers[i]) - + SizeOfPageHeaderData; + + RecordPageWithFreeSpace(relation, curBlock, freespace); + } + } + + if (use_fsm && not_in_fsm_pages < extend_by_pages) + { + BlockNumber first_fsm_block = first_block + not_in_fsm_pages; + + FreeSpaceMapVacuumRange(relation, first_fsm_block, last_block); + } + + if (bistate) + { + /* + * Remember the additional pages we extended by, so we later can use + * them without looking into the FSM. + */ + if (extend_by_pages > 1) + { + bistate->next_free = first_block + 1; + bistate->last_free = last_block; + } + else + { + bistate->next_free = InvalidBlockNumber; + bistate->last_free = InvalidBlockNumber; + } + + /* maintain bistate->current_buf */ + IncrBufferRefCount(buffer); + bistate->current_buf = buffer; + bistate->already_extended_by += extend_by_pages; + } + + return buffer; +#undef MAX_BUFFERS_TO_EXTEND_BY +} + +/* + * tdeheap_RelationGetBufferForTuple + * + * Returns pinned and exclusive-locked buffer of a page in given relation + * with free space >= given len. 
+ * + * If num_pages is > 1, we will try to extend the relation by at least that + * many pages when we decide to extend the relation. This is more efficient + * for callers that know they will need multiple pages + * (e.g. tdeheap_multi_insert()). + * + * If otherBuffer is not InvalidBuffer, then it references a previously + * pinned buffer of another page in the same relation; on return, this + * buffer will also be exclusive-locked. (This case is used by tdeheap_update; + * the otherBuffer contains the tuple being updated.) + * + * The reason for passing otherBuffer is that if two backends are doing + * concurrent tdeheap_update operations, a deadlock could occur if they try + * to lock the same two buffers in opposite orders. To ensure that this + * can't happen, we impose the rule that buffers of a relation must be + * locked in increasing page number order. This is most conveniently done + * by having tdeheap_RelationGetBufferForTuple lock them both, with suitable care + * for ordering. + * + * NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the + * same buffer we select for insertion of the new tuple (this could only + * happen if space is freed in that page after tdeheap_update finds there's not + * enough there). In that case, the page will be pinned and locked only once. + * + * We also handle the possibility that the all-visible flag will need to be + * cleared on one or both pages. If so, pin on the associated visibility map + * page must be acquired before acquiring buffer lock(s), to avoid possibly + * doing I/O while holding buffer locks. The pins are passed back to the + * caller using the input-output arguments vmbuffer and vmbuffer_other. + * Note that in some cases the caller might have already acquired such pins, + * which is indicated by these arguments not being InvalidBuffer on entry. + * + * We normally use FSM to help us find free space. However, + * if HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to + * the end of the relation if the tuple won't fit on the current target page. + * This can save some cycles when we know the relation is new and doesn't + * contain useful amounts of free space. + * + * HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a + * relation, if the caller holds exclusive lock and is careful to invalidate + * relation's smgr_targblock before the first insertion --- that ensures that + * all insertions will occur into newly added pages and not be intermixed + * with tuples from other transactions. That way, a crash can't risk losing + * any committed data of other transactions. (See tdeheap_insert's comments + * for additional constraints needed for safe usage of this behavior.) + * + * The caller can also provide a BulkInsertState object to optimize many + * insertions into the same relation. This keeps a pin on the current + * insertion target page (to save pin/unpin cycles) and also passes a + * BULKWRITE buffer selection strategy object to the buffer manager. + * Passing NULL for bistate selects the default behavior. + * + * We don't fill existing pages further than the fillfactor, except for large + * tuples in nearly-empty pages. This is OK since this routine is not + * consulted when updating a tuple and keeping it on the same page, which is + * the scenario fillfactor is meant to reserve space for. + * + * ereport(ERROR) is allowed here, so this routine *must* be called + * before any (unlogged) changes are made in buffer pool. 
+ */ +Buffer +tdeheap_RelationGetBufferForTuple(Relation relation, Size len, + Buffer otherBuffer, int options, + BulkInsertState bistate, + Buffer *vmbuffer, Buffer *vmbuffer_other, + int num_pages) +{ + bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM); + Buffer buffer = InvalidBuffer; + Page page; + Size nearlyEmptyFreeSpace, + pageFreeSpace = 0, + saveFreeSpace = 0, + targetFreeSpace = 0; + BlockNumber targetBlock, + otherBlock; + bool unlockedTargetBuffer; + bool recheckVmPins; + + len = MAXALIGN(len); /* be conservative */ + + /* if the caller doesn't know by how many pages to extend, extend by 1 */ + if (num_pages <= 0) + num_pages = 1; + + /* Bulk insert is not supported for updates, only inserts. */ + Assert(otherBuffer == InvalidBuffer || !bistate); + + /* + * If we're gonna fail for oversize tuple, do it right away + */ + if (len > MaxHeapTupleSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("row is too big: size %zu, maximum size %zu", + len, MaxHeapTupleSize))); + + /* Compute desired extra freespace due to fillfactor option */ + saveFreeSpace = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + + /* + * Since pages without tuples can still have line pointers, we consider + * pages "empty" when the unavailable space is slight. This threshold is + * somewhat arbitrary, but it should prevent most unnecessary relation + * extensions while inserting large tuples into low-fillfactor tables. + */ + nearlyEmptyFreeSpace = MaxHeapTupleSize - + (MaxHeapTuplesPerPage / 8 * sizeof(ItemIdData)); + if (len + saveFreeSpace > nearlyEmptyFreeSpace) + targetFreeSpace = Max(len, nearlyEmptyFreeSpace); + else + targetFreeSpace = len + saveFreeSpace; + + if (otherBuffer != InvalidBuffer) + otherBlock = BufferGetBlockNumber(otherBuffer); + else + otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */ + + /* + * We first try to put the tuple on the same page we last inserted a tuple + * on, as cached in the BulkInsertState or relcache entry. If that + * doesn't work, we ask the Free Space Map to locate a suitable page. + * Since the FSM's info might be out of date, we have to be prepared to + * loop around and retry multiple times. (To ensure this isn't an infinite + * loop, we must update the FSM with the correct amount of free space on + * each page that proves not to be suitable.) If the FSM has no record of + * a page with enough free space, we give up and extend the relation. + * + * When use_fsm is false, we either put the tuple onto the existing target + * page or extend the relation. + */ + if (bistate && bistate->current_buf != InvalidBuffer) + targetBlock = BufferGetBlockNumber(bistate->current_buf); + else + targetBlock = RelationGetTargetBlock(relation); + + if (targetBlock == InvalidBlockNumber && use_fsm) + { + /* + * We have no cached target page, so ask the FSM for an initial + * target. + */ + targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace); + } + + /* + * If the FSM knows nothing of the rel, try the last page before we give + * up and extend. This avoids one-tuple-per-page syndrome during + * bootstrapping or in a recently-started system. 
+ */ + if (targetBlock == InvalidBlockNumber) + { + BlockNumber nblocks = RelationGetNumberOfBlocks(relation); + + if (nblocks > 0) + targetBlock = nblocks - 1; + } + +loop: + while (targetBlock != InvalidBlockNumber) + { + /* + * Read and exclusive-lock the target block, as well as the other + * block if one was given, taking suitable care with lock ordering and + * the possibility they are the same block. + * + * If the page-level all-visible flag is set, caller will need to + * clear both that and the corresponding visibility map bit. However, + * by the time we return, we'll have x-locked the buffer, and we don't + * want to do any I/O while in that state. So we check the bit here + * before taking the lock, and pin the page if it appears necessary. + * Checking without the lock creates a risk of getting the wrong + * answer, so we'll have to recheck after acquiring the lock. + */ + if (otherBuffer == InvalidBuffer) + { + /* easy case */ + buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate); + if (PageIsAllVisible(BufferGetPage(buffer))) + tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); + + /* + * If the page is empty, pin vmbuffer to set all_frozen bit later. + */ + if ((options & HEAP_INSERT_FROZEN) && + (PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0)) + tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + else if (otherBlock == targetBlock) + { + /* also easy case */ + buffer = otherBuffer; + if (PageIsAllVisible(BufferGetPage(buffer))) + tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + else if (otherBlock < targetBlock) + { + /* lock other buffer first */ + buffer = ReadBuffer(relation, targetBlock); + if (PageIsAllVisible(BufferGetPage(buffer))) + tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); + LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + else + { + /* lock target buffer first */ + buffer = ReadBuffer(relation, targetBlock); + if (PageIsAllVisible(BufferGetPage(buffer))) + tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * We now have the target page (and the other buffer, if any) pinned + * and locked. However, since our initial PageIsAllVisible checks + * were performed before acquiring the lock, the results might now be + * out of date, either for the selected victim buffer, or for the + * other buffer passed by the caller. In that case, we'll need to + * give up our locks, go get the pin(s) we failed to get earlier, and + * re-lock. That's pretty painful, but hopefully shouldn't happen + * often. + * + * Note that there's a small possibility that we didn't pin the page + * above but still have the correct page pinned anyway, either because + * we've already made a previous pass through this loop, or because + * caller passed us the right page anyway. + * + * Note also that it's possible that by the time we get the pin and + * retake the buffer locks, the visibility map bit will have been + * cleared by some other backend anyway. In that case, we'll have + * done a bit of extra work for no gain, but there's no real harm + * done. + */ + GetVisibilityMapPins(relation, buffer, otherBuffer, + targetBlock, otherBlock, vmbuffer, + vmbuffer_other); + + /* + * Now we can check to see if there's enough free space here. If so, + * we're done. 
+ */ + page = BufferGetPage(buffer); + + /* + * If necessary initialize page, it'll be used soon. We could avoid + * dirtying the buffer here, and rely on the caller to do so whenever + * it puts a tuple onto the page, but there seems not much benefit in + * doing so. + */ + if (PageIsNew(page)) + { + PageInit(page, BufferGetPageSize(buffer), 0); + MarkBufferDirty(buffer); + } + + pageFreeSpace = PageGetHeapFreeSpace(page); + if (targetFreeSpace <= pageFreeSpace) + { + /* use this page as future insert target, too */ + RelationSetTargetBlock(relation, targetBlock); + return buffer; + } + + /* + * Not enough space, so we must give up our page locks and pin (if + * any) and prepare to look elsewhere. We don't care which order we + * unlock the two buffers in, so this can be slightly simpler than the + * code above. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + if (otherBuffer == InvalidBuffer) + ReleaseBuffer(buffer); + else if (otherBlock != targetBlock) + { + LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + + /* Is there an ongoing bulk extension? */ + if (bistate && bistate->next_free != InvalidBlockNumber) + { + Assert(bistate->next_free <= bistate->last_free); + + /* + * We bulk extended the relation before, and there are still some + * unused pages from that extension, so we don't need to look in + * the FSM for a new page. But do record the free space from the + * last page, somebody might insert narrower tuples later. + */ + if (use_fsm) + RecordPageWithFreeSpace(relation, targetBlock, pageFreeSpace); + + targetBlock = bistate->next_free; + if (bistate->next_free >= bistate->last_free) + { + bistate->next_free = InvalidBlockNumber; + bistate->last_free = InvalidBlockNumber; + } + else + bistate->next_free++; + } + else if (!use_fsm) + { + /* Without FSM, always fall out of the loop and extend */ + break; + } + else + { + /* + * Update FSM as to condition of this page, and ask for another + * page to try. + */ + targetBlock = RecordAndGetPageWithFreeSpace(relation, + targetBlock, + pageFreeSpace, + targetFreeSpace); + } + } + + /* Have to extend the relation */ + buffer = RelationAddBlocks(relation, bistate, num_pages, use_fsm, + &unlockedTargetBuffer); + + targetBlock = BufferGetBlockNumber(buffer); + page = BufferGetPage(buffer); + + /* + * The page is empty, pin vmbuffer to set all_frozen bit. We don't want to + * do IO while the buffer is locked, so we unlock the page first if IO is + * needed (necessitating checks below). + */ + if (options & HEAP_INSERT_FROZEN) + { + Assert(PageGetMaxOffsetNumber(page) == 0); + + if (!tdeheap_visibilitymap_pin_ok(targetBlock, *vmbuffer)) + { + if (!unlockedTargetBuffer) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + unlockedTargetBuffer = true; + tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); + } + } + + /* + * Reacquire locks if necessary. + * + * If the target buffer was unlocked above, or is unlocked while + * reacquiring the lock on otherBuffer below, it's unlikely, but possible, + * that another backend used space on this page. We check for that below, + * and retry if necessary. + */ + recheckVmPins = false; + if (unlockedTargetBuffer) + { + /* released lock on target buffer above */ + if (otherBuffer != InvalidBuffer) + LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + recheckVmPins = true; + } + else if (otherBuffer != InvalidBuffer) + { + /* + * We did not release the target buffer, and otherBuffer is valid, + * need to lock the other buffer. 
It's guaranteed to be of a lower + * page number than the new page. To conform with the deadlock + * prevent rules, we ought to lock otherBuffer first, but that would + * give other backends a chance to put tuples on our page. To reduce + * the likelihood of that, attempt to lock the other buffer + * conditionally, that's very likely to work. + * + * Alternatively, we could acquire the lock on otherBuffer before + * extending the relation, but that'd require holding the lock while + * performing IO, which seems worse than an unlikely retry. + */ + Assert(otherBuffer != buffer); + Assert(targetBlock > otherBlock); + + if (unlikely(!ConditionalLockBuffer(otherBuffer))) + { + unlockedTargetBuffer = true; + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + recheckVmPins = true; + } + + /* + * If one of the buffers was unlocked (always the case if otherBuffer is + * valid), it's possible, although unlikely, that an all-visible flag + * became set. We can use GetVisibilityMapPins to deal with that. It's + * possible that GetVisibilityMapPins() might need to temporarily release + * buffer locks, in which case we'll need to check if there's still enough + * space on the page below. + */ + if (recheckVmPins) + { + if (GetVisibilityMapPins(relation, otherBuffer, buffer, + otherBlock, targetBlock, vmbuffer_other, + vmbuffer)) + unlockedTargetBuffer = true; + } + + /* + * If the target buffer was temporarily unlocked since the relation + * extension, it's possible, although unlikely, that all the space on the + * page was already used. If so, we just retry from the start. If we + * didn't unlock, something has gone wrong if there's not enough space - + * the test at the top should have prevented reaching this case. + */ + pageFreeSpace = PageGetHeapFreeSpace(page); + if (len > pageFreeSpace) + { + if (unlockedTargetBuffer) + { + if (otherBuffer != InvalidBuffer) + LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); + UnlockReleaseBuffer(buffer); + + goto loop; + } + elog(PANIC, "tuple is too big: size %zu", len); + } + + /* + * Remember the new page as our target for future insertions. + * + * XXX should we enter the new page into the free space map immediately, + * or just keep it for this backend's exclusive use in the short run + * (until VACUUM sees it)? Seems to depend on whether you expect the + * current backend to make more insertions or not, which is probably a + * good bet most of the time. So for now, don't add it to FSM yet. 
+ */ + RelationSetTargetBlock(relation, targetBlock); + + return buffer; +} diff --git a/src16/access/pg_tde_prune.c b/src16/access/pg_tde_prune.c new file mode 100644 index 00000000..ee3daa46 --- /dev/null +++ b/src16/access/pg_tde_prune.c @@ -0,0 +1,1213 @@ +/*------------------------------------------------------------------------- + * + * pruneheap.c + * heap page pruning and HOT-chain management code + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/pruneheap.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/pg_tdeam_xlog.h" +#include "access/htup_details.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/catalog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "utils/snapmgr.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +/* Working data for tdeheap_page_prune and subroutines */ +typedef struct +{ + Relation rel; + + /* tuple visibility test, initialized for the relation */ + GlobalVisState *vistest; + + /* + * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have + * been computed (done on demand, and only if + * OldSnapshotThresholdActive()). The first time a tuple is about to be + * removed based on the limited horizon, old_snap_used is set to true, and + * SetOldSnapshotThresholdTimestamp() is called. See + * tdeheap_prune_satisfies_vacuum(). + */ + TimestampTz old_snap_ts; + TransactionId old_snap_xmin; + bool old_snap_used; + + TransactionId new_prune_xid; /* new prune hint value for page */ + TransactionId snapshotConflictHorizon; /* latest xid removed */ + int nredirected; /* numbers of entries in arrays below */ + int ndead; + int nunused; + /* arrays that accumulate indexes of items to be changed */ + OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPage]; + OffsetNumber nowunused[MaxHeapTuplesPerPage]; + + /* + * marked[i] is true if item i is entered in one of the above arrays. + * + * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * 1. Otherwise every access would need to subtract 1. + */ + bool marked[MaxHeapTuplesPerPage + 1]; + + /* + * Tuple visibility is only computed once for each tuple, for correctness + * and efficiency reasons; see comment in tdeheap_page_prune() for details. + * This is of type int8[], instead of HTSV_Result[], so we can use -1 to + * indicate no visibility has been computed, e.g. for LP_DEAD items. + * + * Same indexing as ->marked. + */ + int8 htsv[MaxHeapTuplesPerPage + 1]; +} PruneState; + +/* Local functions */ +static HTSV_Result tdeheap_prune_satisfies_vacuum(PruneState *prstate, + HeapTuple tup, + Buffer buffer); +static int tdeheap_prune_chain(Buffer buffer, + OffsetNumber rootoffnum, + PruneState *prstate); +static void tdeheap_prune_record_prunable(PruneState *prstate, TransactionId xid); +static void tdeheap_prune_record_redirect(PruneState *prstate, + OffsetNumber offnum, OffsetNumber rdoffnum); +static void tdeheap_prune_record_dead(PruneState *prstate, OffsetNumber offnum); +static void tdeheap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); +static void page_verify_redirects(Page page); + + +/* + * Optionally prune and repair fragmentation in the specified page. 
+ * + * This is an opportunistic function. It will perform housekeeping + * only if the page heuristically looks like a candidate for pruning and we + * can acquire buffer cleanup lock without blocking. + * + * Note: this is called quite often. It's important that it fall out quickly + * if there's not any use in pruning. + * + * Caller must have pin on the buffer, and must *not* have a lock on it. + */ +void +tdeheap_page_prune_opt(Relation relation, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + TransactionId prune_xid; + GlobalVisState *vistest; + TransactionId limited_xmin = InvalidTransactionId; + TimestampTz limited_ts = 0; + Size minfree; + + /* + * We can't write WAL in recovery mode, so there's no point trying to + * clean the page. The primary will likely issue a cleaning WAL record + * soon anyway, so this is no particular loss. + */ + if (RecoveryInProgress()) + return; + + /* + * XXX: Magic to keep old_snapshot_threshold tests appear "working". They + * currently are broken, and discussion of what to do about them is + * ongoing. See + * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de + */ + if (old_snapshot_threshold == 0) + SnapshotTooOldMagicForTest(); + + /* + * First check whether there's any chance there's something to prune, + * determining the appropriate horizon is a waste if there's no prune_xid + * (i.e. no updates/deletes left potentially dead tuples around). + */ + prune_xid = ((PageHeader) page)->pd_prune_xid; + if (!TransactionIdIsValid(prune_xid)) + return; + + /* + * Check whether prune_xid indicates that there may be dead rows that can + * be cleaned up. + * + * It is OK to check the old snapshot limit before acquiring the cleanup + * lock because the worst that can happen is that we are not quite as + * aggressive about the cleanup (by however many transaction IDs are + * consumed between this point and acquiring the lock). This allows us to + * save significant overhead in the case where the page is found not to be + * prunable. + * + * Even if old_snapshot_threshold is set, we first check whether the page + * can be pruned without. Both because + * TransactionIdLimitedForOldSnapshots() is not cheap, and because not + * unnecessarily relying on old_snapshot_threshold avoids causing + * conflicts. + */ + vistest = GlobalVisTestFor(relation); + + if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) + { + if (!OldSnapshotThresholdActive()) + return; + + if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest), + relation, + &limited_xmin, &limited_ts)) + return; + + if (!TransactionIdPrecedes(prune_xid, limited_xmin)) + return; + } + + /* + * We prune when a previous UPDATE failed to find enough space on the page + * for a new tuple version, or when free space falls below the relation's + * fill-factor target (but not less than 10%). + * + * Checking free space here is questionable since we aren't holding any + * lock on the buffer; in the worst case we could get a bogus answer. It's + * unlikely to be *seriously* wrong, though, since reading either pd_lower + * or pd_upper is probably atomic. Avoiding taking a lock seems more + * important than sometimes getting a wrong answer in what is after all + * just a heuristic estimate. 
+ */ + minfree = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + minfree = Max(minfree, BLCKSZ / 10); + + if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) + { + /* OK, try to get exclusive buffer lock */ + if (!ConditionalLockBufferForCleanup(buffer)) + return; + + /* + * Now that we have buffer lock, get accurate information about the + * page's free space, and recheck the heuristic about whether to + * prune. + */ + if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) + { + int ndeleted, + nnewlpdead; + + ndeleted = tdeheap_page_prune(relation, buffer, vistest, limited_xmin, + limited_ts, &nnewlpdead, NULL); + + /* + * Report the number of tuples reclaimed to pgstats. This is + * ndeleted minus the number of newly-LP_DEAD-set items. + * + * We derive the number of dead tuples like this to avoid totally + * forgetting about items that were set to LP_DEAD, since they + * still need to be cleaned up by VACUUM. We only want to count + * heap-only tuples that just became LP_UNUSED in our report, + * which don't. + * + * VACUUM doesn't have to compensate in the same way when it + * tracks ndeleted, since it will set the same LP_DEAD items to + * LP_UNUSED separately. + */ + if (ndeleted > nnewlpdead) + pgstat_update_heap_dead_tuples(relation, + ndeleted - nnewlpdead); + } + + /* And release buffer lock */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * We avoid reuse of any free space created on the page by unrelated + * UPDATEs/INSERTs by opting to not update the FSM at this point. The + * free space should be reused by UPDATEs to *this* page. + */ + } +} + + +/* + * Prune and repair fragmentation in the specified page. + * + * Caller must have pin and buffer cleanup lock on the page. Note that we + * don't update the FSM information for page on caller's behalf. Caller might + * also need to account for a reduction in the length of the line pointer + * array following array truncation by us. + * + * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD + * (see tdeheap_prune_satisfies_vacuum and + * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to + * either have been set by TransactionIdLimitedForOldSnapshots, or + * InvalidTransactionId/0 respectively. + * + * Sets *nnewlpdead for caller, indicating the number of items that were + * newly set LP_DEAD during prune operation. + * + * off_loc is the offset location required by the caller to use in error + * callback. + * + * Returns the number of tuples deleted from the page during this call. + */ +int +tdeheap_page_prune(Relation relation, Buffer buffer, + GlobalVisState *vistest, + TransactionId old_snap_xmin, + TimestampTz old_snap_ts, + int *nnewlpdead, + OffsetNumber *off_loc) +{ + int ndeleted = 0; + Page page = BufferGetPage(buffer); + BlockNumber blockno = BufferGetBlockNumber(buffer); + OffsetNumber offnum, + maxoff; + PruneState prstate; + HeapTupleData tup; + + /* + * Our strategy is to scan the page and make lists of items to change, + * then apply the changes within a critical section. This keeps as much + * logic as possible out of the critical section, and also ensures that + * WAL replay will work the same as the normal case. + * + * First, initialize the new pd_prune_xid value to zero (indicating no + * prunable tuples). If we find any tuples which may soon become + * prunable, we will save the lowest relevant XID in new_prune_xid. Also + * initialize the rest of our working state. 
+ */ + prstate.new_prune_xid = InvalidTransactionId; + prstate.rel = relation; + prstate.vistest = vistest; + prstate.old_snap_xmin = old_snap_xmin; + prstate.old_snap_ts = old_snap_ts; + prstate.old_snap_used = false; + prstate.snapshotConflictHorizon = InvalidTransactionId; + prstate.nredirected = prstate.ndead = prstate.nunused = 0; + memset(prstate.marked, 0, sizeof(prstate.marked)); + + maxoff = PageGetMaxOffsetNumber(page); + tup.t_tableOid = RelationGetRelid(prstate.rel); + + /* + * Determine HTSV for all tuples. + * + * This is required for correctness to deal with cases where running HTSV + * twice could result in different results (e.g. RECENTLY_DEAD can turn to + * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid() + * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the + * inserting transaction aborts, ...). That in turn could cause + * tdeheap_prune_chain() to behave incorrectly if a tuple is reached twice, + * once directly via a tdeheap_prune_chain() and once following a HOT chain. + * + * It's also good for performance. Most commonly tuples within a page are + * stored at decreasing offsets (while the items are stored at increasing + * offsets). When processing all tuples on a page this leads to reading + * memory at decreasing offsets within a page, with a variable stride. + * That's hard for CPU prefetchers to deal with. Processing the items in + * reverse order (and thus the tuples in increasing order) increases + * prefetching efficiency significantly / decreases the number of cache + * misses. + */ + for (offnum = maxoff; + offnum >= FirstOffsetNumber; + offnum = OffsetNumberPrev(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + HeapTupleHeader htup; + + /* Nothing to do if slot doesn't contain a tuple */ + if (!ItemIdIsNormal(itemid)) + { + prstate.htsv[offnum] = -1; + continue; + } + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + tup.t_data = htup; + tup.t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tup.t_self), blockno, offnum); + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + if (off_loc) + *off_loc = offnum; + + prstate.htsv[offnum] = tdeheap_prune_satisfies_vacuum(&prstate, &tup, + buffer); + } + + /* Scan the page */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + /* Ignore items already processed as part of an earlier chain */ + if (prstate.marked[offnum]) + continue; + + /* see preceding loop */ + if (off_loc) + *off_loc = offnum; + + /* Nothing to do if slot is empty or already dead */ + itemid = PageGetItemId(page, offnum); + if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) + continue; + + /* Process this item or chain of items */ + ndeleted += tdeheap_prune_chain(buffer, offnum, &prstate); + } + + /* Clear the offset information once we have processed the given page. */ + if (off_loc) + *off_loc = InvalidOffsetNumber; + + /* Any error while applying the changes is critical */ + START_CRIT_SECTION(); + + /* Have we found any prunable items? */ + if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) + { + /* + * Apply the planned item changes, then repair page fragmentation, and + * update the page's hint bit about whether it has free line pointers. 
+ */ + tdeheap_page_prune_execute(buffer, + prstate.redirected, prstate.nredirected, + prstate.nowdead, prstate.ndead, + prstate.nowunused, prstate.nunused); + + /* + * Update the page's pd_prune_xid field to either zero, or the lowest + * XID of any soon-prunable tuple. + */ + ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; + + /* + * Also clear the "page is full" flag, since there's no point in + * repeating the prune/defrag process until something else happens to + * the page. + */ + PageClearFull(page); + + MarkBufferDirty(buffer); + + /* + * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did + */ + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_prune xlrec; + XLogRecPtr recptr; + + xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation); + xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon; + xlrec.nredirected = prstate.nredirected; + xlrec.ndead = prstate.ndead; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapPrune); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* + * The OffsetNumber arrays are not actually in the buffer, but we + * pretend that they are. When XLogInsert stores the whole + * buffer, the offset arrays need not be stored too. + */ + if (prstate.nredirected > 0) + XLogRegisterBufData(0, (char *) prstate.redirected, + prstate.nredirected * + sizeof(OffsetNumber) * 2); + + if (prstate.ndead > 0) + XLogRegisterBufData(0, (char *) prstate.nowdead, + prstate.ndead * sizeof(OffsetNumber)); + + if (prstate.nunused > 0) + XLogRegisterBufData(0, (char *) prstate.nowunused, + prstate.nunused * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE); + + PageSetLSN(BufferGetPage(buffer), recptr); + } + } + else + { + /* + * If we didn't prune anything, but have found a new value for the + * pd_prune_xid field, update it and mark the buffer dirty. This is + * treated as a non-WAL-logged hint. + * + * Also clear the "page is full" flag if it is set, since there's no + * point in repeating the prune/defrag process until something else + * happens to the page. + */ + if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || + PageIsFull(page)) + { + ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; + PageClearFull(page); + MarkBufferDirtyHint(buffer, true); + } + } + + END_CRIT_SECTION(); + + /* Record number of newly-set-LP_DEAD items for caller */ + *nnewlpdead = prstate.ndead; + + return ndeleted; +} + + +/* + * Perform visibility checks for heap pruning. + * + * This is more complicated than just using GlobalVisTestIsRemovableXid() + * because of old_snapshot_threshold. We only want to increase the threshold + * that triggers errors for old snapshots when we actually decide to remove a + * row based on the limited horizon. + * + * Due to its cost we also only want to call + * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have + * done so in tdeheap_page_prune_opt() if pd_prune_xid was old enough. But we + * still want to be able to remove rows that are too new to be removed + * according to prstate->vistest, but that can be removed based on + * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on + * demand in here, if appropriate. 
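+ *
+ * In short: a RECENTLY_DEAD result from HeapTupleSatisfiesVacuumHorizon()
+ * is upgraded to DEAD either because vistest already allows removal, or
+ * because dead_after precedes the limited xmin computed from
+ * old_snapshot_threshold; in the latter case
+ * SetOldSnapshotThresholdTimestamp() is called first so that sufficiently
+ * old snapshots report "snapshot too old".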
+ */ +static HTSV_Result +tdeheap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) +{ + HTSV_Result res; + TransactionId dead_after; + + res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after); + + if (res != HEAPTUPLE_RECENTLY_DEAD) + return res; + + /* + * If we are already relying on the limited xmin, there is no need to + * delay doing so anymore. + */ + if (prstate->old_snap_used) + { + Assert(TransactionIdIsValid(prstate->old_snap_xmin)); + + if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) + res = HEAPTUPLE_DEAD; + return res; + } + + /* + * First check if GlobalVisTestIsRemovableXid() is sufficient to find the + * row dead. If not, and old_snapshot_threshold is enabled, try to use the + * lowered horizon. + */ + if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after)) + res = HEAPTUPLE_DEAD; + else if (OldSnapshotThresholdActive()) + { + /* haven't determined limited horizon yet, requests */ + if (!TransactionIdIsValid(prstate->old_snap_xmin)) + { + TransactionId horizon = + GlobalVisTestNonRemovableHorizon(prstate->vistest); + + TransactionIdLimitedForOldSnapshots(horizon, prstate->rel, + &prstate->old_snap_xmin, + &prstate->old_snap_ts); + } + + if (TransactionIdIsValid(prstate->old_snap_xmin) && + TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) + { + /* + * About to remove row based on snapshot_too_old. Need to raise + * the threshold so problematic accesses would error. + */ + Assert(!prstate->old_snap_used); + SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts, + prstate->old_snap_xmin); + prstate->old_snap_used = true; + res = HEAPTUPLE_DEAD; + } + } + + return res; +} + + +/* + * Prune specified line pointer or a HOT chain originating at line pointer. + * + * If the item is an index-referenced tuple (i.e. not a heap-only tuple), + * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT + * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. + * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really + * DEAD, our visibility test is just too coarse to detect it. + * + * In general, pruning must never leave behind a DEAD tuple that still has + * tuple storage. VACUUM isn't prepared to deal with that case. That's why + * VACUUM prunes the same heap page a second time (without dropping its lock + * in the interim) when it sees a newly DEAD tuple that we initially saw as + * in-progress. Retrying pruning like this can only happen when an inserting + * transaction concurrently aborts. + * + * The root line pointer is redirected to the tuple immediately after the + * latest DEAD tuple. If all tuples in the chain are DEAD, the root line + * pointer is marked LP_DEAD. (This includes the case of a DEAD simple + * tuple, which we treat as a chain of length 1.) + * + * We don't actually change the page here. We just add entries to the arrays in + * prstate showing the changes to be made. Items to be redirected are added + * to the redirected[] array (two entries per redirection); items to be set to + * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED + * state are added to nowunused[]. + * + * Returns the number of tuples (to be) deleted from the page. 
+ */ +static int +tdeheap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) +{ + int ndeleted = 0; + Page dp = (Page) BufferGetPage(buffer); + TransactionId priorXmax = InvalidTransactionId; + ItemId rootlp; + HeapTupleHeader htup; + OffsetNumber latestdead = InvalidOffsetNumber, + maxoff = PageGetMaxOffsetNumber(dp), + offnum; + OffsetNumber chainitems[MaxHeapTuplesPerPage]; + int nchain = 0, + i; + + rootlp = PageGetItemId(dp, rootoffnum); + + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. + */ + if (ItemIdIsNormal(rootlp)) + { + Assert(prstate->htsv[rootoffnum] != -1); + htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + + if (HeapTupleHeaderIsHeapOnly(htup)) + { + /* + * If the tuple is DEAD and doesn't chain to anything else, mark + * it unused immediately. (If it does chain, we can only remove + * it as part of pruning its chain.) + * + * We need this primarily to handle aborted HOT updates, that is, + * XMIN_INVALID heap-only tuples. Those might not be linked to by + * any chain, since the parent tuple might be re-updated before + * any pruning occurs. So we have to be able to reap them + * separately from chain-pruning. (Note that + * HeapTupleHeaderIsHotUpdated will never return true for an + * XMIN_INVALID tuple, so this code will work even when there were + * sequential updates within the aborted transaction.) + * + * Note that we might first arrive at a dead heap-only tuple + * either here or while following a chain below. Whichever path + * gets there first will mark the tuple unused. + */ + if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD && + !HeapTupleHeaderIsHotUpdated(htup)) + { + tdeheap_prune_record_unused(prstate, rootoffnum); + HeapTupleHeaderAdvanceConflictHorizon(htup, + &prstate->snapshotConflictHorizon); + ndeleted++; + } + + /* Nothing more to do */ + return ndeleted; + } + } + + /* Start from the root tuple */ + offnum = rootoffnum; + + /* while not end of the chain */ + for (;;) + { + ItemId lp; + bool tupdead, + recent_dead; + + /* Sanity check (pure paranoia) */ + if (offnum < FirstOffsetNumber) + break; + + /* + * An offset past the end of page's line pointer array is possible + * when the array was truncated (original item must have been unused) + */ + if (offnum > maxoff) + break; + + /* If item is already processed, stop --- it must not be same chain */ + if (prstate->marked[offnum]) + break; + + lp = PageGetItemId(dp, offnum); + + /* Unused item obviously isn't part of the chain */ + if (!ItemIdIsUsed(lp)) + break; + + /* + * If we are looking at the redirected root line pointer, jump to the + * first normal tuple in the chain. If we find a redirect somewhere + * else, stop --- it must not be same chain. + */ + if (ItemIdIsRedirected(lp)) + { + if (nchain > 0) + break; /* not at start of chain */ + chainitems[nchain++] = offnum; + offnum = ItemIdGetRedirect(rootlp); + continue; + } + + /* + * Likewise, a dead line pointer can't be part of the chain. (We + * already eliminated the case of dead root tuple outside this + * function.) + */ + if (ItemIdIsDead(lp)) + break; + + Assert(ItemIdIsNormal(lp)); + Assert(prstate->htsv[offnum] != -1); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Check the tuple XMIN against prior XMAX, if any + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + break; + + /* + * OK, this tuple is indeed a member of the chain. + */ + chainitems[nchain++] = offnum; + + /* + * Check tuple's visibility status. 
+ */ + tupdead = recent_dead = false; + + switch ((HTSV_Result) prstate->htsv[offnum]) + { + case HEAPTUPLE_DEAD: + tupdead = true; + break; + + case HEAPTUPLE_RECENTLY_DEAD: + recent_dead = true; + + /* + * This tuple may soon become DEAD. Update the hint field so + * that the page is reconsidered for pruning in future. + */ + tdeheap_prune_record_prunable(prstate, + HeapTupleHeaderGetUpdateXid(htup)); + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * This tuple may soon become DEAD. Update the hint field so + * that the page is reconsidered for pruning in future. + */ + tdeheap_prune_record_prunable(prstate, + HeapTupleHeaderGetUpdateXid(htup)); + break; + + case HEAPTUPLE_LIVE: + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * If we wanted to optimize for aborts, we might consider + * marking the page prunable when we see INSERT_IN_PROGRESS. + * But we don't. See related decisions about when to mark the + * page prunable in heapam.c. + */ + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + /* + * Remember the last DEAD tuple seen. We will advance past + * RECENTLY_DEAD tuples just in case there's a DEAD one after them; + * but we can't advance past anything else. We have to make sure that + * we don't miss any DEAD tuples, since DEAD tuples that still have + * tuple storage after pruning will confuse VACUUM. + */ + if (tupdead) + { + latestdead = offnum; + HeapTupleHeaderAdvanceConflictHorizon(htup, + &prstate->snapshotConflictHorizon); + } + else if (!recent_dead) + break; + + /* + * If the tuple is not HOT-updated, then we are at the end of this + * HOT-update chain. + */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* HOT implies it can't have moved to different partition */ + Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); + + /* + * Advance to next chain member. + */ + Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == + BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + + /* + * If we found a DEAD tuple in the chain, adjust the HOT chain so that all + * the DEAD tuples at the start of the chain are removed and the root line + * pointer is appropriately redirected. + */ + if (OffsetNumberIsValid(latestdead)) + { + /* + * Mark as unused each intermediate item that we are able to remove + * from the chain. + * + * When the previous item is the last dead tuple seen, we are at the + * right candidate for redirection. + */ + for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) + { + tdeheap_prune_record_unused(prstate, chainitems[i]); + ndeleted++; + } + + /* + * If the root entry had been a normal tuple, we are deleting it, so + * count it in the result. But changing a redirect (even to DEAD + * state) doesn't count. + */ + if (ItemIdIsNormal(rootlp)) + ndeleted++; + + /* + * If the DEAD tuple is at the end of the chain, the entire chain is + * dead and the root line pointer can be marked dead. Otherwise just + * redirect the root to the correct chain member. + */ + if (i >= nchain) + tdeheap_prune_record_dead(prstate, rootoffnum); + else + tdeheap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); + } + else if (nchain < 2 && ItemIdIsRedirected(rootlp)) + { + /* + * We found a redirect item that doesn't point to a valid follow-on + * item. This can happen if the loop in tdeheap_page_prune caused us to + * visit the dead successor of a redirect item before visiting the + * redirect item. 
We can clean up by setting the redirect item to + * DEAD state. + */ + tdeheap_prune_record_dead(prstate, rootoffnum); + } + + return ndeleted; +} + +/* Record lowest soon-prunable XID */ +static void +tdeheap_prune_record_prunable(PruneState *prstate, TransactionId xid) +{ + /* + * This should exactly match the PageSetPrunable macro. We can't store + * directly into the page header yet, so we update working state. + */ + Assert(TransactionIdIsNormal(xid)); + if (!TransactionIdIsValid(prstate->new_prune_xid) || + TransactionIdPrecedes(xid, prstate->new_prune_xid)) + prstate->new_prune_xid = xid; +} + +/* Record line pointer to be redirected */ +static void +tdeheap_prune_record_redirect(PruneState *prstate, + OffsetNumber offnum, OffsetNumber rdoffnum) +{ + Assert(prstate->nredirected < MaxHeapTuplesPerPage); + prstate->redirected[prstate->nredirected * 2] = offnum; + prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; + prstate->nredirected++; + Assert(!prstate->marked[offnum]); + prstate->marked[offnum] = true; + Assert(!prstate->marked[rdoffnum]); + prstate->marked[rdoffnum] = true; +} + +/* Record line pointer to be marked dead */ +static void +tdeheap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) +{ + Assert(prstate->ndead < MaxHeapTuplesPerPage); + prstate->nowdead[prstate->ndead] = offnum; + prstate->ndead++; + Assert(!prstate->marked[offnum]); + prstate->marked[offnum] = true; +} + +/* Record line pointer to be marked unused */ +static void +tdeheap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) +{ + Assert(prstate->nunused < MaxHeapTuplesPerPage); + prstate->nowunused[prstate->nunused] = offnum; + prstate->nunused++; + Assert(!prstate->marked[offnum]); + prstate->marked[offnum] = true; +} + + +/* + * Perform the actual page changes needed by tdeheap_page_prune. + * It is expected that the caller has a full cleanup lock on the + * buffer. + */ +void +tdeheap_page_prune_execute(Buffer buffer, + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused) +{ + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *offnum; + HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY; + + /* Shouldn't be called unless there's something to do */ + Assert(nredirected > 0 || ndead > 0 || nunused > 0); + + /* Update all redirected line pointers */ + offnum = redirected; + for (int i = 0; i < nredirected; i++) + { + OffsetNumber fromoff = *offnum++; + OffsetNumber tooff = *offnum++; + ItemId fromlp = PageGetItemId(page, fromoff); + ItemId tolp PG_USED_FOR_ASSERTS_ONLY; + +#ifdef USE_ASSERT_CHECKING + + /* + * Any existing item that we set as an LP_REDIRECT (any 'from' item) + * must be the first item from a HOT chain. If the item has tuple + * storage then it can't be a heap-only tuple. Otherwise we are just + * maintaining an existing LP_REDIRECT from an existing HOT chain that + * has been pruned at least once before now. + */ + if (!ItemIdIsRedirected(fromlp)) + { + Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp)); + + htup = (HeapTupleHeader) PageGetItem(page, fromlp); + Assert(!HeapTupleHeaderIsHeapOnly(htup)); + } + else + { + /* We shouldn't need to redundantly set the redirect */ + Assert(ItemIdGetRedirect(fromlp) != tooff); + } + + /* + * The item that we're about to set as an LP_REDIRECT (the 'from' + * item) will point to an existing item (the 'to' item) that is + * already a heap-only tuple. There can be at most one LP_REDIRECT + * item per HOT chain. 
+ * + * We need to keep around an LP_REDIRECT item (after original + * non-heap-only root tuple gets pruned away) so that it's always + * possible for VACUUM to easily figure out what TID to delete from + * indexes when an entire HOT chain becomes dead. A heap-only tuple + * can never become LP_DEAD; an LP_REDIRECT item or a regular heap + * tuple can. + * + * This check may miss problems, e.g. the target of a redirect could + * be marked as unused subsequently. The page_verify_redirects() check + * below will catch such problems. + */ + tolp = PageGetItemId(page, tooff); + Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp)); + htup = (HeapTupleHeader) PageGetItem(page, tolp); + Assert(HeapTupleHeaderIsHeapOnly(htup)); +#endif + + ItemIdSetRedirect(fromlp, tooff); + } + + /* Update all now-dead line pointers */ + offnum = nowdead; + for (int i = 0; i < ndead; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + +#ifdef USE_ASSERT_CHECKING + + /* + * An LP_DEAD line pointer must be left behind when the original item + * (which is dead to everybody) could still be referenced by a TID in + * an index. This should never be necessary with any individual + * heap-only tuple item, though. (It's not clear how much of a problem + * that would be, but there is no reason to allow it.) + */ + if (ItemIdHasStorage(lp)) + { + Assert(ItemIdIsNormal(lp)); + htup = (HeapTupleHeader) PageGetItem(page, lp); + Assert(!HeapTupleHeaderIsHeapOnly(htup)); + } + else + { + /* Whole HOT chain becomes dead */ + Assert(ItemIdIsRedirected(lp)); + } +#endif + + ItemIdSetDead(lp); + } + + /* Update all now-unused line pointers */ + offnum = nowunused; + for (int i = 0; i < nunused; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + +#ifdef USE_ASSERT_CHECKING + + /* + * Only heap-only tuples can become LP_UNUSED during pruning. They + * don't need to be left in place as LP_DEAD items until VACUUM gets + * around to doing index vacuuming. + */ + Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp)); + htup = (HeapTupleHeader) PageGetItem(page, lp); + Assert(HeapTupleHeaderIsHeapOnly(htup)); +#endif + + ItemIdSetUnused(lp); + } + + /* + * Finally, repair any fragmentation, and update the page's hint bit about + * whether it has free pointers. + */ + PageRepairFragmentation(page); + + /* + * Now that the page has been modified, assert that redirect items still + * point to valid targets. + */ + page_verify_redirects(page); +} + + +/* + * If built with assertions, verify that all LP_REDIRECT items point to a + * valid item. + * + * One way that bugs related to HOT pruning show is redirect items pointing to + * removed tuples. It's not trivial to reliably check that marking an item + * unused will not orphan a redirect item during tdeheap_prune_chain() / + * tdeheap_page_prune_execute(), so we additionally check the whole page after + * pruning. Without this check such bugs would typically only cause asserts + * later, potentially well after the corruption has been introduced. + * + * Also check comments in tdeheap_page_prune_execute()'s redirection loop. 
+ */ +static void +page_verify_redirects(Page page) +{ +#ifdef USE_ASSERT_CHECKING + OffsetNumber offnum; + OffsetNumber maxoff; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + OffsetNumber targoff; + ItemId targitem; + HeapTupleHeader htup; + + if (!ItemIdIsRedirected(itemid)) + continue; + + targoff = ItemIdGetRedirect(itemid); + targitem = PageGetItemId(page, targoff); + + Assert(ItemIdIsUsed(targitem)); + Assert(ItemIdIsNormal(targitem)); + Assert(ItemIdHasStorage(targitem)); + htup = (HeapTupleHeader) PageGetItem(page, targitem); + Assert(HeapTupleHeaderIsHeapOnly(htup)); + } +#endif +} + + +/* + * For all items in this page, find their respective root line pointers. + * If item k is part of a HOT-chain with root at item j, then we set + * root_offsets[k - 1] = j. + * + * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. + * Unused entries are filled with InvalidOffsetNumber (zero). + * + * The function must be called with at least share lock on the buffer, to + * prevent concurrent prune operations. + * + * Note: The information collected here is valid only as long as the caller + * holds a pin on the buffer. Once pin is released, a tuple might be pruned + * and reused by a completely unrelated tuple. + */ +void +tdeheap_get_root_tuples(Page page, OffsetNumber *root_offsets) +{ + OffsetNumber offnum, + maxoff; + + MemSet(root_offsets, InvalidOffsetNumber, + MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId lp = PageGetItemId(page, offnum); + HeapTupleHeader htup; + OffsetNumber nextoffnum; + TransactionId priorXmax; + + /* skip unused and dead items */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + continue; + + if (ItemIdIsNormal(lp)) + { + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Check if this tuple is part of a HOT-chain rooted at some other + * tuple. If so, skip it for now; we'll process it when we find + * its root. + */ + if (HeapTupleHeaderIsHeapOnly(htup)) + continue; + + /* + * This is either a plain tuple or the root of a HOT-chain. + * Remember it in the mapping. + */ + root_offsets[offnum - 1] = offnum; + + /* If it's not the start of a HOT-chain, we're done with it */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + continue; + + /* Set up to scan the HOT-chain */ + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + else + { + /* Must be a redirect item. We do not set its root_offsets entry */ + Assert(ItemIdIsRedirected(lp)); + /* Set up to scan the HOT-chain */ + nextoffnum = ItemIdGetRedirect(lp); + priorXmax = InvalidTransactionId; + } + + /* + * Now follow the HOT-chain and collect other tuples in the chain. + * + * Note: Even though this is a nested loop, the complexity of the + * function is O(N) because a tuple in the page should be visited not + * more than twice, once in the outer loop and once in HOT-chain + * chases. 
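+	 *
+	 * For example, for a chain rooted at offset 2 that continues
+	 * 2 -> 5 -> 7, the code above already set root_offsets[1] = 2, and
+	 * this loop sets root_offsets[4] and root_offsets[6] to 2 as well.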
+ */ + for (;;) + { + /* Sanity check (pure paranoia) */ + if (offnum < FirstOffsetNumber) + break; + + /* + * An offset past the end of page's line pointer array is possible + * when the array was truncated + */ + if (offnum > maxoff) + break; + + lp = PageGetItemId(page, nextoffnum); + + /* Check for broken chains */ + if (!ItemIdIsNormal(lp)) + break; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) + break; + + /* Remember the root line pointer for this item */ + root_offsets[nextoffnum - 1] = offnum; + + /* Advance to next chain member, if any */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* HOT implies it can't have moved to different partition */ + Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); + + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + } +} diff --git a/src16/access/pg_tde_rewrite.c b/src16/access/pg_tde_rewrite.c new file mode 100644 index 00000000..7744cb84 --- /dev/null +++ b/src16/access/pg_tde_rewrite.c @@ -0,0 +1,1287 @@ +/*------------------------------------------------------------------------- + * + * rewriteheap.c + * Support functions to rewrite tables. + * + * These functions provide a facility to completely rewrite a heap, while + * preserving visibility information and update chains. + * + * INTERFACE + * + * The caller is responsible for creating the new heap, all catalog + * changes, supplying the tuples to be written to the new heap, and + * rebuilding indexes. The caller must hold AccessExclusiveLock on the + * target table, because we assume no one else is writing into it. + * + * To use the facility: + * + * begin_tdeheap_rewrite + * while (fetch next tuple) + * { + * if (tuple is dead) + * rewrite_tdeheap_dead_tuple + * else + * { + * // do any transformations here if required + * rewrite_tdeheap_tuple + * } + * } + * end_tdeheap_rewrite + * + * The contents of the new relation shouldn't be relied on until after + * end_tdeheap_rewrite is called. + * + * + * IMPLEMENTATION + * + * This would be a fairly trivial affair, except that we need to maintain + * the ctid chains that link versions of an updated tuple together. + * Since the newly stored tuples will have tids different from the original + * ones, if we just copied t_ctid fields to the new table the links would + * be wrong. When we are required to copy a (presumably recently-dead or + * delete-in-progress) tuple whose ctid doesn't point to itself, we have + * to substitute the correct ctid instead. + * + * For each ctid reference from A -> B, we might encounter either A first + * or B first. (Note that a tuple in the middle of a chain is both A and B + * of different pairs.) + * + * If we encounter A first, we'll store the tuple in the unresolved_tups + * hash table. When we later encounter B, we remove A from the hash table, + * fix the ctid to point to the new location of B, and insert both A and B + * to the new heap. + * + * If we encounter B first, we can insert B to the new heap right away. + * We then add an entry to the old_new_tid_map hash table showing B's + * original tid (in the old heap) and new tid (in the new heap). + * When we later encounter A, we get the new location of B from the table, + * and can write A immediately with the correct ctid. + * + * Entries in the hash tables can be removed as soon as the later tuple + * is encountered. That helps to keep the memory usage down. 
At the end, + * both tables are usually empty; we should have encountered both A and B + * of each pair. However, it's possible for A to be RECENTLY_DEAD and B + * entirely DEAD according to HeapTupleSatisfiesVacuum, because the test + * for deadness using OldestXmin is not exact. In such a case we might + * encounter B first, and skip it, and find A later. Then A would be added + * to unresolved_tups, and stay there until end of the rewrite. Since + * this case is very unusual, we don't worry about the memory usage. + * + * Using in-memory hash tables means that we use some memory for each live + * update chain in the table, from the time we find one end of the + * reference until we find the other end. That shouldn't be a problem in + * practice, but if you do something like an UPDATE without a where-clause + * on a large table, and then run CLUSTER in the same transaction, you + * could run out of memory. It doesn't seem worthwhile to add support for + * spill-to-disk, as there shouldn't be that many RECENTLY_DEAD tuples in a + * table under normal circumstances. Furthermore, in the typical scenario + * of CLUSTERing on an unchanging key column, we'll see all the versions + * of a given tuple together anyway, and so the peak memory usage is only + * proportional to the number of RECENTLY_DEAD versions of a single row, not + * in the whole table. Note that if we do fail halfway through a CLUSTER, + * the old table is still valid, so failure is not catastrophic. + * + * We can't use the normal tdeheap_insert function to insert into the new + * heap, because tdeheap_insert overwrites the visibility information. + * We use a special-purpose raw_tdeheap_insert function instead, which + * is optimized for bulk inserting a lot of tuples, knowing that we have + * exclusive access to the heap. raw_tdeheap_insert builds new pages in + * local storage. When a page is full, or at the end of the process, + * we insert it to WAL as a single record and then write it to disk + * directly through smgr. Note, however, that any data sent to the new + * heap's TOAST table will go through the normal bufmgr. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994-5, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/heap/rewriteheap.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/heapam.h" +#include "access/pg_tdeam_xlog.h" +#include "access/heaptoast.h" +#include "access/rewriteheap.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "catalog/catalog.h" +#include "common/file_utils.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/slot.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * State associated with a rewrite operation. This is opaque to the user + * of the rewrite facility. 
+ */ +typedef struct RewriteStateData +{ + Relation rs_old_rel; /* source heap */ + Relation rs_new_rel; /* destination heap */ + Page rs_buffer; /* page currently being built */ + BlockNumber rs_blockno; /* block where page will go */ + bool rs_buffer_valid; /* T if any tuples in buffer */ + bool rs_logical_rewrite; /* do we need to do logical rewriting */ + TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine + * tuple visibility */ + TransactionId rs_freeze_xid; /* Xid that will be used as freeze cutoff + * point */ + TransactionId rs_logical_xmin; /* Xid that will be used as cutoff point + * for logical rewrites */ + MultiXactId rs_cutoff_multi; /* MultiXactId that will be used as cutoff + * point for multixacts */ + MemoryContext rs_cxt; /* for hash tables and entries and tuples in + * them */ + XLogRecPtr rs_begin_lsn; /* XLogInsertLsn when starting the rewrite */ + HTAB *rs_unresolved_tups; /* unmatched A tuples */ + HTAB *rs_old_new_tid_map; /* unmatched B tuples */ + HTAB *rs_logical_mappings; /* logical remapping files */ + uint32 rs_num_rewrite_mappings; /* # in memory mappings */ +} RewriteStateData; + +/* + * The lookup keys for the hash tables are tuple TID and xmin (we must check + * both to avoid false matches from dead tuples). Beware that there is + * probably some padding space in this struct; it must be zeroed out for + * correct hashtable operation. + */ +typedef struct +{ + TransactionId xmin; /* tuple xmin */ + ItemPointerData tid; /* tuple location in old heap */ +} TidHashKey; + +/* + * Entry structures for the hash tables + */ +typedef struct +{ + TidHashKey key; /* expected xmin/old location of B tuple */ + ItemPointerData old_tid; /* A's location in the old heap */ + HeapTuple tuple; /* A's tuple contents */ +} UnresolvedTupData; + +typedef UnresolvedTupData *UnresolvedTup; + +typedef struct +{ + TidHashKey key; /* actual xmin/old location of B tuple */ + ItemPointerData new_tid; /* where we put it in the new heap */ +} OldToNewMappingData; + +typedef OldToNewMappingData *OldToNewMapping; + +/* + * In-Memory data for an xid that might need logical remapping entries + * to be logged. + */ +typedef struct RewriteMappingFile +{ + TransactionId xid; /* xid that might need to see the row */ + int vfd; /* fd of mappings file */ + off_t off; /* how far have we written yet */ + dclist_head mappings; /* list of in-memory mappings */ + char path[MAXPGPATH]; /* path, for error messages */ +} RewriteMappingFile; + +/* + * A single In-Memory logical rewrite mapping, hanging off + * RewriteMappingFile->mappings. 
+ */ +typedef struct RewriteMappingDataEntry +{ + LogicalRewriteMappingData map; /* map between old and new location of the + * tuple */ + dlist_node node; +} RewriteMappingDataEntry; + + +/* prototypes for internal functions */ +static void raw_tdeheap_insert(RewriteState state, HeapTuple tup); + +/* internal logical remapping prototypes */ +static void logical_begin_tdeheap_rewrite(RewriteState state); +static void logical_rewrite_tdeheap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple); +static void logical_end_tdeheap_rewrite(RewriteState state); + + +/* + * Begin a rewrite of a table + * + * old_heap old, locked heap relation tuples will be read from + * new_heap new, locked heap relation to insert tuples to + * oldest_xmin xid used by the caller to determine which tuples are dead + * freeze_xid xid before which tuples will be frozen + * cutoff_multi multixact before which multis will be removed + * + * Returns an opaque RewriteState, allocated in current memory context, + * to be used in subsequent calls to the other functions. + */ +RewriteState +begin_tdeheap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, + TransactionId freeze_xid, MultiXactId cutoff_multi) +{ + RewriteState state; + MemoryContext rw_cxt; + MemoryContext old_cxt; + HASHCTL hash_ctl; + + /* + * To ease cleanup, make a separate context that will contain the + * RewriteState struct itself plus all subsidiary data. + */ + rw_cxt = AllocSetContextCreate(CurrentMemoryContext, + "Table rewrite", + ALLOCSET_DEFAULT_SIZES); + old_cxt = MemoryContextSwitchTo(rw_cxt); + + /* Create and fill in the state struct */ + state = palloc0(sizeof(RewriteStateData)); + + state->rs_old_rel = old_heap; + state->rs_new_rel = new_heap; + state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + /* new_heap needn't be empty, just locked */ + state->rs_blockno = RelationGetNumberOfBlocks(new_heap); + state->rs_buffer_valid = false; + state->rs_oldest_xmin = oldest_xmin; + state->rs_freeze_xid = freeze_xid; + state->rs_cutoff_multi = cutoff_multi; + state->rs_cxt = rw_cxt; + + /* Initialize hash tables used to track update chains */ + hash_ctl.keysize = sizeof(TidHashKey); + hash_ctl.entrysize = sizeof(UnresolvedTupData); + hash_ctl.hcxt = state->rs_cxt; + + state->rs_unresolved_tups = + hash_create("Rewrite / Unresolved ctids", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + hash_ctl.entrysize = sizeof(OldToNewMappingData); + + state->rs_old_new_tid_map = + hash_create("Rewrite / Old to new tid map", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + MemoryContextSwitchTo(old_cxt); + + logical_begin_tdeheap_rewrite(state); + + return state; +} + +/* + * End a rewrite. + * + * state and any other resources are freed. + */ +void +end_tdeheap_rewrite(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + UnresolvedTup unresolved; + + /* + * Write any remaining tuples in the UnresolvedTups table. If we have any + * left, they should in fact be dead, but let's err on the safe side. 
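+	 *
+	 * As the file header notes, this can legitimately happen when the A
+	 * tuple of an update pair was RECENTLY_DEAD but its successor B was
+	 * already DEAD and therefore skipped, leaving A unresolved until now.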
+ */ + hash_seq_init(&seq_status, state->rs_unresolved_tups); + + while ((unresolved = hash_seq_search(&seq_status)) != NULL) + { + ItemPointerSetInvalid(&unresolved->tuple->t_data->t_ctid); + raw_tdeheap_insert(state, unresolved->tuple); + } + + /* Write the last page, if any */ + if (state->rs_buffer_valid) + { + if (RelationNeedsWAL(state->rs_new_rel)) + log_newpage(&state->rs_new_rel->rd_locator, + MAIN_FORKNUM, + state->rs_blockno, + state->rs_buffer, + true); + + PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); + + smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, + state->rs_blockno, state->rs_buffer, true); + } + + /* + * When we WAL-logged rel pages, we must nonetheless fsync them. The + * reason is the same as in storage.c's RelationCopyStorage(): we're + * writing data that's not in shared buffers, and so a CHECKPOINT + * occurring during the rewriteheap operation won't have fsync'd data we + * wrote before the checkpoint. + */ + if (RelationNeedsWAL(state->rs_new_rel)) + smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM); + + logical_end_tdeheap_rewrite(state); + + /* Deleting the context frees everything */ + MemoryContextDelete(state->rs_cxt); +} + +/* + * Add a tuple to the new heap. + * + * Visibility information is copied from the original tuple, except that + * we "freeze" very-old tuples. Note that since we scribble on new_tuple, + * it had better be temp storage not a pointer to the original tuple. + * + * state opaque state as returned by begin_tdeheap_rewrite + * old_tuple original tuple in the old heap + * new_tuple new, rewritten tuple to be inserted to new heap + */ +void +rewrite_tdeheap_tuple(RewriteState state, + HeapTuple old_tuple, HeapTuple new_tuple) +{ + MemoryContext old_cxt; + ItemPointerData old_tid; + TidHashKey hashkey; + bool found; + bool free_new; + + old_cxt = MemoryContextSwitchTo(state->rs_cxt); + + /* + * Copy the original tuple's visibility information into new_tuple. + * + * XXX we might later need to copy some t_infomask2 bits, too? Right now, + * we intentionally clear the HOT status bits. + */ + memcpy(&new_tuple->t_data->t_choice.t_heap, + &old_tuple->t_data->t_choice.t_heap, + sizeof(HeapTupleFields)); + + new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; + new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; + new_tuple->t_data->t_infomask |= + old_tuple->t_data->t_infomask & HEAP_XACT_MASK; + + /* + * While we have our hands on the tuple, we may as well freeze any + * eligible xmin or xmax, so that future VACUUM effort can be saved. + */ + tdeheap_freeze_tuple(new_tuple->t_data, + state->rs_old_rel->rd_rel->relfrozenxid, + state->rs_old_rel->rd_rel->relminmxid, + state->rs_freeze_xid, + state->rs_cutoff_multi); + + /* + * Invalid ctid means that ctid should point to the tuple itself. We'll + * override it later if the tuple is part of an update chain. + */ + ItemPointerSetInvalid(&new_tuple->t_data->t_ctid); + + /* + * If the tuple has been updated, check the old-to-new mapping hash table. 
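+	 *
+	 * In the terminology of the file header, the test below asks whether
+	 * this is the "A" tuple of an update pair A -> B: xmax is valid and
+	 * not merely a locker, the tuple has not moved to another partition,
+	 * and t_ctid points somewhere other than the tuple itself.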
+ */ + if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && + !HeapTupleHeaderIndicatesMovedPartitions(old_tuple->t_data) && + !(ItemPointerEquals(&(old_tuple->t_self), + &(old_tuple->t_data->t_ctid)))) + { + OldToNewMapping mapping; + + memset(&hashkey, 0, sizeof(hashkey)); + hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); + hashkey.tid = old_tuple->t_data->t_ctid; + + mapping = (OldToNewMapping) + hash_search(state->rs_old_new_tid_map, &hashkey, + HASH_FIND, NULL); + + if (mapping != NULL) + { + /* + * We've already copied the tuple that t_ctid points to, so we can + * set the ctid of this tuple to point to the new location, and + * insert it right away. + */ + new_tuple->t_data->t_ctid = mapping->new_tid; + + /* We don't need the mapping entry anymore */ + hash_search(state->rs_old_new_tid_map, &hashkey, + HASH_REMOVE, &found); + Assert(found); + } + else + { + /* + * We haven't seen the tuple t_ctid points to yet. Stash this + * tuple into unresolved_tups to be written later. + */ + UnresolvedTup unresolved; + + unresolved = hash_search(state->rs_unresolved_tups, &hashkey, + HASH_ENTER, &found); + Assert(!found); + + unresolved->old_tid = old_tuple->t_self; + unresolved->tuple = tdeheap_copytuple(new_tuple); + + /* + * We can't do anything more now, since we don't know where the + * tuple will be written. + */ + MemoryContextSwitchTo(old_cxt); + return; + } + } + + /* + * Now we will write the tuple, and then check to see if it is the B tuple + * in any new or known pair. When we resolve a known pair, we will be + * able to write that pair's A tuple, and then we have to check if it + * resolves some other pair. Hence, we need a loop here. + */ + old_tid = old_tuple->t_self; + free_new = false; + + for (;;) + { + ItemPointerData new_tid; + + /* Insert the tuple and find out where it's put in new_heap */ + raw_tdeheap_insert(state, new_tuple); + new_tid = new_tuple->t_self; + + logical_rewrite_tdeheap_tuple(state, old_tid, new_tuple); + + /* + * If the tuple is the updated version of a row, and the prior version + * wouldn't be DEAD yet, then we need to either resolve the prior + * version (if it's waiting in rs_unresolved_tups), or make an entry + * in rs_old_new_tid_map (so we can resolve it when we do see it). The + * previous tuple's xmax would equal this one's xmin, so it's + * RECENTLY_DEAD if and only if the xmin is not before OldestXmin. + */ + if ((new_tuple->t_data->t_infomask & HEAP_UPDATED) && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(new_tuple->t_data), + state->rs_oldest_xmin)) + { + /* + * Okay, this is B in an update pair. See if we've seen A. + */ + UnresolvedTup unresolved; + + memset(&hashkey, 0, sizeof(hashkey)); + hashkey.xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + hashkey.tid = old_tid; + + unresolved = hash_search(state->rs_unresolved_tups, &hashkey, + HASH_FIND, NULL); + + if (unresolved != NULL) + { + /* + * We have seen and memorized the previous tuple already. Now + * that we know where we inserted the tuple its t_ctid points + * to, fix its t_ctid and insert it to the new heap. + */ + if (free_new) + tdeheap_freetuple(new_tuple); + new_tuple = unresolved->tuple; + free_new = true; + old_tid = unresolved->old_tid; + new_tuple->t_data->t_ctid = new_tid; + + /* + * We don't need the hash entry anymore, but don't free its + * tuple just yet. 
+ */ + hash_search(state->rs_unresolved_tups, &hashkey, + HASH_REMOVE, &found); + Assert(found); + + /* loop back to insert the previous tuple in the chain */ + continue; + } + else + { + /* + * Remember the new tid of this tuple. We'll use it to set the + * ctid when we find the previous tuple in the chain. + */ + OldToNewMapping mapping; + + mapping = hash_search(state->rs_old_new_tid_map, &hashkey, + HASH_ENTER, &found); + Assert(!found); + + mapping->new_tid = new_tid; + } + } + + /* Done with this (chain of) tuples, for now */ + if (free_new) + tdeheap_freetuple(new_tuple); + break; + } + + MemoryContextSwitchTo(old_cxt); +} + +/* + * Register a dead tuple with an ongoing rewrite. Dead tuples are not + * copied to the new table, but we still make note of them so that we + * can release some resources earlier. + * + * Returns true if a tuple was removed from the unresolved_tups table. + * This indicates that that tuple, previously thought to be "recently dead", + * is now known really dead and won't be written to the output. + */ +bool +rewrite_tdeheap_dead_tuple(RewriteState state, HeapTuple old_tuple) +{ + /* + * If we have already seen an earlier tuple in the update chain that + * points to this tuple, let's forget about that earlier tuple. It's in + * fact dead as well, our simple xmax < OldestXmin test in + * HeapTupleSatisfiesVacuum just wasn't enough to detect it. It happens + * when xmin of a tuple is greater than xmax, which sounds + * counter-intuitive but is perfectly valid. + * + * We don't bother to try to detect the situation the other way round, + * when we encounter the dead tuple first and then the recently dead one + * that points to it. If that happens, we'll have some unmatched entries + * in the UnresolvedTups hash table at the end. That can happen anyway, + * because a vacuum might have removed the dead tuple in the chain before + * us. + */ + UnresolvedTup unresolved; + TidHashKey hashkey; + bool found; + + memset(&hashkey, 0, sizeof(hashkey)); + hashkey.xmin = HeapTupleHeaderGetXmin(old_tuple->t_data); + hashkey.tid = old_tuple->t_self; + + unresolved = hash_search(state->rs_unresolved_tups, &hashkey, + HASH_FIND, NULL); + + if (unresolved != NULL) + { + /* Need to free the contained tuple as well as the hashtable entry */ + tdeheap_freetuple(unresolved->tuple); + hash_search(state->rs_unresolved_tups, &hashkey, + HASH_REMOVE, &found); + Assert(found); + return true; + } + + return false; +} + +/* + * Insert a tuple to the new relation. This has to track tdeheap_insert + * and its subsidiary functions! + * + * t_self of the tuple is set to the new TID of the tuple. If t_ctid of the + * tuple is invalid on entry, it's replaced with the new TID as well (in + * the inserted data only, not in the caller's copy). + */ +static void +raw_tdeheap_insert(RewriteState state, HeapTuple tup) +{ + Page page = state->rs_buffer; + Size pageFreeSpace, + saveFreeSpace; + Size len; + OffsetNumber newoff; + HeapTuple heaptup; + + /* + * If the new tuple is too big for storage or contains already toasted + * out-of-line attributes from some other relation, invoke the toaster. + * + * Note: below this point, heaptup is the data we actually intend to store + * into the relation; tup is the caller's original untoasted data. 
+ */ + if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(tup)); + heaptup = tup; + } + else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) + { + int options = HEAP_INSERT_SKIP_FSM; + + /* + * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data + * for the TOAST table are not logically decoded. The main heap is + * WAL-logged as XLOG FPI records, which are not logically decoded. + */ + options |= HEAP_INSERT_NO_LOGICAL; + + heaptup = tdeheap_toast_insert_or_update(state->rs_new_rel, tup, NULL, + options); + } + else + heaptup = tup; + + len = MAXALIGN(heaptup->t_len); /* be conservative */ + + /* + * If we're gonna fail for oversize tuple, do it right away + */ + if (len > MaxHeapTupleSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("row is too big: size %zu, maximum size %zu", + len, MaxHeapTupleSize))); + + /* Compute desired extra freespace due to fillfactor option */ + saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel, + HEAP_DEFAULT_FILLFACTOR); + + /* Now we can check to see if there's enough free space already. */ + if (state->rs_buffer_valid) + { + pageFreeSpace = PageGetHeapFreeSpace(page); + + if (len + saveFreeSpace > pageFreeSpace) + { + /* + * Doesn't fit, so write out the existing page. It always + * contains a tuple. Hence, unlike tdeheap_RelationGetBufferForTuple(), + * enforce saveFreeSpace unconditionally. + */ + + /* XLOG stuff */ + if (RelationNeedsWAL(state->rs_new_rel)) + log_newpage(&state->rs_new_rel->rd_locator, + MAIN_FORKNUM, + state->rs_blockno, + page, + true); + + /* + * Now write the page. We say skipFsync = true because there's no + * need for smgr to schedule an fsync for this write; we'll do it + * ourselves in end_tdeheap_rewrite. + */ + PageSetChecksumInplace(page, state->rs_blockno); + + smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, + state->rs_blockno, page, true); + + state->rs_blockno++; + state->rs_buffer_valid = false; + } + } + + if (!state->rs_buffer_valid) + { + /* Initialize a new empty page */ + PageInit(page, BLCKSZ, 0); + state->rs_buffer_valid = true; + } + + /* And now we can insert the tuple into the page */ + newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, + InvalidOffsetNumber, false, true); + if (newoff == InvalidOffsetNumber) + elog(ERROR, "failed to add tuple"); + + /* Update caller's t_self to the actual position where it was stored */ + ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff); + + /* + * Insert the correct position into CTID of the stored tuple, too, if the + * caller didn't supply a valid CTID. + */ + if (!ItemPointerIsValid(&tup->t_data->t_ctid)) + { + ItemId newitemid; + HeapTupleHeader onpage_tup; + + newitemid = PageGetItemId(page, newoff); + onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid); + + onpage_tup->t_ctid = tup->t_self; + } + + /* If heaptup is a private copy, release it. */ + if (heaptup != tup) + tdeheap_freetuple(heaptup); +} + +/* ------------------------------------------------------------------------ + * Logical rewrite support + * + * When doing logical decoding - which relies on using cmin/cmax of catalog + * tuples, via xl_tdeheap_new_cid records - heap rewrites have to log enough + * information to allow the decoding backend to update its internal mapping + * of (relfilelocator,ctid) => (cmin, cmax) to be correct for the rewritten heap. 
+ * + * For that, every time we find a tuple that's been modified in a catalog + * relation within the xmin horizon of any decoding slot, we log a mapping + * from the old to the new location. + * + * To deal with rewrites that abort the filename of a mapping file contains + * the xid of the transaction performing the rewrite, which then can be + * checked before being read in. + * + * For efficiency we don't immediately spill every single map mapping for a + * row to disk but only do so in batches when we've collected several of them + * in memory or when end_tdeheap_rewrite() has been called. + * + * Crash-Safety: This module diverts from the usual patterns of doing WAL + * since it cannot rely on checkpoint flushing out all buffers and thus + * waiting for exclusive locks on buffers. Usually the XLogInsert() covering + * buffer modifications is performed while the buffer(s) that are being + * modified are exclusively locked guaranteeing that both the WAL record and + * the modified heap are on either side of the checkpoint. But since the + * mapping files we log aren't in shared_buffers that interlock doesn't work. + * + * Instead we simply write the mapping files out to disk, *before* the + * XLogInsert() is performed. That guarantees that either the XLogInsert() is + * inserted after the checkpoint's redo pointer or that the checkpoint (via + * CheckPointLogicalRewriteHeap()) has flushed the (partial) mapping file to + * disk. That leaves the tail end that has not yet been flushed open to + * corruption, which is solved by including the current offset in the + * xl_tdeheap_rewrite_mapping records and truncating the mapping file to it + * during replay. Every time a rewrite is finished all generated mapping files + * are synced to disk. + * + * Note that if we were only concerned about crash safety we wouldn't have to + * deal with WAL logging at all - an fsync() at the end of a rewrite would be + * sufficient for crash safety. Any mapping that hasn't been safely flushed to + * disk has to be by an aborted (explicitly or via a crash) transaction and is + * ignored by virtue of the xid in its name being subject to a + * TransactionDidCommit() check. But we want to support having standbys via + * physical replication, both for availability and to do logical decoding + * there. + * ------------------------------------------------------------------------ + */ + +/* + * Do preparations for logging logical mappings during a rewrite if + * necessary. If we detect that we don't need to log anything we'll prevent + * any further action by the various logical rewrite functions. + */ +static void +logical_begin_tdeheap_rewrite(RewriteState state) +{ + HASHCTL hash_ctl; + TransactionId logical_xmin; + + /* + * We only need to persist these mappings if the rewritten table can be + * accessed during logical decoding, if not, we can skip doing any + * additional work. + */ + state->rs_logical_rewrite = + RelationIsAccessibleInLogicalDecoding(state->rs_old_rel); + + if (!state->rs_logical_rewrite) + return; + + ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin); + + /* + * If there are no logical slots in progress we don't need to do anything, + * there cannot be any remappings for relevant rows yet. The relation's + * lock protects us against races. 
+ */ + if (logical_xmin == InvalidTransactionId) + { + state->rs_logical_rewrite = false; + return; + } + + state->rs_logical_xmin = logical_xmin; + state->rs_begin_lsn = GetXLogInsertRecPtr(); + state->rs_num_rewrite_mappings = 0; + + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(RewriteMappingFile); + hash_ctl.hcxt = state->rs_cxt; + + state->rs_logical_mappings = + hash_create("Logical rewrite mapping", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); +} + +/* + * Flush all logical in-memory mappings to disk, but don't fsync them yet. + */ +static void +logical_tdeheap_rewrite_flush_mappings(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + dlist_mutable_iter iter; + + Assert(state->rs_logical_rewrite); + + /* no logical rewrite in progress, no need to iterate over mappings */ + if (state->rs_num_rewrite_mappings == 0) + return; + + elog(DEBUG1, "flushing %u logical rewrite mapping entries", + state->rs_num_rewrite_mappings); + + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + char *waldata; + char *waldata_start; + xl_tdeheap_rewrite_mapping xlrec; + Oid dboid; + uint32 len; + int written; + uint32 num_mappings = dclist_count(&src->mappings); + + /* this file hasn't got any new mappings */ + if (num_mappings == 0) + continue; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + xlrec.num_mappings = num_mappings; + xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel); + xlrec.mapped_xid = src->xid; + xlrec.mapped_db = dboid; + xlrec.offset = src->off; + xlrec.start_lsn = state->rs_begin_lsn; + + /* write all mappings consecutively */ + len = num_mappings * sizeof(LogicalRewriteMappingData); + waldata_start = waldata = palloc(len); + + /* + * collect data we need to write out, but don't modify ondisk data yet + */ + dclist_foreach_modify(iter, &src->mappings) + { + RewriteMappingDataEntry *pmap; + + pmap = dclist_container(RewriteMappingDataEntry, node, iter.cur); + + memcpy(waldata, &pmap->map, sizeof(pmap->map)); + waldata += sizeof(pmap->map); + + /* remove from the list and free */ + dclist_delete_from(&src->mappings, &pmap->node); + pfree(pmap); + + /* update bookkeeping */ + state->rs_num_rewrite_mappings--; + } + + Assert(dclist_count(&src->mappings) == 0); + Assert(waldata == waldata_start + len); + + /* + * Note that we deviate from the usual WAL coding practices here, + * check the above "Logical rewrite support" comment for reasoning. + */ + written = FileWrite(src->vfd, waldata_start, len, src->off, + WAIT_EVENT_LOGICAL_REWRITE_WRITE); + if (written != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, + written, len))); + src->off += len; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + XLogRegisterData(waldata_start, len); + + /* write xlog record */ + XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE); + + pfree(waldata_start); + } + Assert(state->rs_num_rewrite_mappings == 0); +} + +/* + * Logical remapping part of end_tdeheap_rewrite(). 
+ */ +static void +logical_end_tdeheap_rewrite(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + + /* done, no logical rewrite in progress */ + if (!state->rs_logical_rewrite) + return; + + /* writeout remaining in-memory entries */ + if (state->rs_num_rewrite_mappings > 0) + logical_tdeheap_rewrite_flush_mappings(state); + + /* Iterate over all mappings we have written and fsync the files. */ + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", src->path))); + FileClose(src->vfd); + } + /* memory context cleanup will deal with the rest */ +} + +/* + * Log a single (old->new) mapping for 'xid'. + */ +static void +logical_rewrite_log_mapping(RewriteState state, TransactionId xid, + LogicalRewriteMappingData *map) +{ + RewriteMappingFile *src; + RewriteMappingDataEntry *pmap; + Oid relid; + bool found; + + relid = RelationGetRelid(state->rs_old_rel); + + /* look for existing mappings for this 'mapped' xid */ + src = hash_search(state->rs_logical_mappings, &xid, + HASH_ENTER, &found); + + /* + * We haven't yet had the need to map anything for this xid, create + * per-xid data structures. + */ + if (!found) + { + char path[MAXPGPATH]; + Oid dboid; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + snprintf(path, MAXPGPATH, + "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, + dboid, relid, + LSN_FORMAT_ARGS(state->rs_begin_lsn), + xid, GetCurrentTransactionId()); + + dclist_init(&src->mappings); + src->off = 0; + memcpy(src->path, path, sizeof(path)); + src->vfd = PathNameOpenFile(path, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (src->vfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + } + + pmap = MemoryContextAlloc(state->rs_cxt, + sizeof(RewriteMappingDataEntry)); + memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData)); + dclist_push_tail(&src->mappings, &pmap->node); + state->rs_num_rewrite_mappings++; + + /* + * Write out buffer every time we've too many in-memory entries across all + * mapping files. + */ + if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */ ) + logical_tdeheap_rewrite_flush_mappings(state); +} + +/* + * Perform logical remapping for a tuple that's mapped from old_tid to + * new_tuple->t_self by rewrite_tdeheap_tuple() if necessary for the tuple. + */ +static void +logical_rewrite_tdeheap_tuple(RewriteState state, ItemPointerData old_tid, + HeapTuple new_tuple) +{ + ItemPointerData new_tid = new_tuple->t_self; + TransactionId cutoff = state->rs_logical_xmin; + TransactionId xmin; + TransactionId xmax; + bool do_log_xmin = false; + bool do_log_xmax = false; + LogicalRewriteMappingData map; + + /* no logical rewrite in progress, we don't need to log anything */ + if (!state->rs_logical_rewrite) + return; + + xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + /* use *GetUpdateXid to correctly deal with multixacts */ + xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data); + + /* + * Log the mapping iff the tuple has been created recently. 
+ */ + if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff)) + do_log_xmin = true; + + if (!TransactionIdIsNormal(xmax)) + { + /* + * no xmax is set, can't have any permanent ones, so this check is + * sufficient + */ + } + else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask)) + { + /* only locked, we don't care */ + } + else if (!TransactionIdPrecedes(xmax, cutoff)) + { + /* tuple has been deleted recently, log */ + do_log_xmax = true; + } + + /* if neither needs to be logged, we're done */ + if (!do_log_xmin && !do_log_xmax) + return; + + /* fill out mapping information */ + map.old_locator = state->rs_old_rel->rd_locator; + map.old_tid = old_tid; + map.new_locator = state->rs_new_rel->rd_locator; + map.new_tid = new_tid; + + /* --- + * Now persist the mapping for the individual xids that are affected. We + * need to log for both xmin and xmax if they aren't the same transaction + * since the mapping files are per "affected" xid. + * We don't muster all that much effort detecting whether xmin and xmax + * are actually the same transaction, we just check whether the xid is the + * same disregarding subtransactions. Logging too much is relatively + * harmless and we could never do the check fully since subtransaction + * data is thrown away during restarts. + * --- + */ + if (do_log_xmin) + logical_rewrite_log_mapping(state, xmin, &map); + /* separately log mapping for xmax unless it'd be redundant */ + if (do_log_xmax && !TransactionIdEquals(xmin, xmax)) + logical_rewrite_log_mapping(state, xmax, &map); +} + +/* + * Replay XLOG_HEAP2_REWRITE records + */ +void +tdeheap_xlog_logical_rewrite(XLogReaderState *r) +{ + char path[MAXPGPATH]; + int fd; + xl_tdeheap_rewrite_mapping *xlrec; + uint32 len; + char *data; + + xlrec = (xl_tdeheap_rewrite_mapping *) XLogRecGetData(r); + + snprintf(path, MAXPGPATH, + "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, + xlrec->mapped_db, xlrec->mapped_rel, + LSN_FORMAT_ARGS(xlrec->start_lsn), + xlrec->mapped_xid, XLogRecGetXid(r)); + + fd = OpenTransientFile(path, + O_CREAT | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + + /* + * Truncate all data that's not guaranteed to have been safely fsynced (by + * previous record or by the last checkpoint). + */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE); + if (ftruncate(fd, xlrec->offset) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\" to %u: %m", + path, (uint32) xlrec->offset))); + pgstat_report_wait_end(); + + data = XLogRecGetData(r) + sizeof(*xlrec); + + len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData); + + /* write out tail end of mapping file (again) */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE); + if (pg_pwrite(fd, data, len, xlrec->offset) != len) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + } + pgstat_report_wait_end(); + + /* + * Now fsync all previously written data. We could improve things and only + * do this for the last write to a file, but the required bookkeeping + * doesn't seem worth the trouble. 
+ */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + +/* --- + * Perform a checkpoint for logical rewrite mappings + * + * This serves two tasks: + * 1) Remove all mappings not needed anymore based on the logical restart LSN + * 2) Flush all remaining mappings to disk, so that replay after a checkpoint + * only has to deal with the parts of a mapping that have been written out + * after the checkpoint started. + * --- + */ +void +CheckPointLogicalRewriteHeap(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *mappings_dir; + struct dirent *mapping_de; + char path[MAXPGPATH + 20]; + + /* + * We start of with a minimum of the last redo pointer. No new decoding + * slot will start before that, so that's a safe upper bound for removal. + */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (cutoff != InvalidXLogRecPtr && redo < cutoff) + cutoff = redo; + + mappings_dir = AllocateDir("pg_logical/mappings"); + while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL) + { + Oid dboid; + Oid relid; + XLogRecPtr lsn; + TransactionId rewrite_xid; + TransactionId create_xid; + uint32 hi, + lo; + PGFileType de_type; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name); + de_type = get_dirent_type(path, mapping_de, false, DEBUG1); + + if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_REG) + continue; + + /* Skip over files that cannot be ours. */ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6) + elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); + + lsn = ((uint64) hi) << 32 | lo; + + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing logical rewrite file \"%s\"", path); + if (unlink(path) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } + else + { + /* on some operating systems fsyncing a file requires O_RDWR */ + int fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + + /* + * The file cannot vanish due to concurrency since this function + * is the only one removing logical mappings and only one + * checkpoint can be in progress at a time. + */ + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* + * We could try to avoid fsyncing files that either haven't + * changed or have only been created since the checkpoint's start, + * but it's currently not deemed worth the effort. 
+ */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + } + } + FreeDir(mappings_dir); + + /* persist directory entries to disk */ + fsync_fname("pg_logical/mappings", true); +} diff --git a/src16/access/pg_tde_vacuumlazy.c b/src16/access/pg_tde_vacuumlazy.c new file mode 100644 index 00000000..ed318621 --- /dev/null +++ b/src16/access/pg_tde_vacuumlazy.c @@ -0,0 +1,3472 @@ +/*------------------------------------------------------------------------- + * + * vacuumlazy.c + * Concurrent ("lazy") vacuuming. + * + * The major space usage for vacuuming is storage for the array of dead TIDs + * that are to be removed from indexes. We want to ensure we can vacuum even + * the very largest relations with finite memory space usage. To do that, we + * set upper bounds on the number of TIDs we can keep track of at once. + * + * We are willing to use at most maintenance_work_mem (or perhaps + * autovacuum_work_mem) memory space to keep track of dead TIDs. We initially + * allocate an array of TIDs of that size, with an upper limit that depends on + * table size (this limit ensures we don't allocate a huge area uselessly for + * vacuuming small tables). If the array threatens to overflow, we must call + * lazy_vacuum to vacuum indexes (and to vacuum the pages that we've pruned). + * This frees up the memory space dedicated to storing dead TIDs. + * + * In practice VACUUM will often complete its initial pass over the target + * heap relation without ever running out of space to store TIDs. This means + * that there only needs to be one call to lazy_vacuum, after the initial pass + * completes. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/vacuumlazy.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/amapi.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/pg_tdeam_xlog.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/transam.h" +#include "access/visibilitymap.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "catalog/storage.h" +#include "commands/dbcommands.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "optimizer/paths.h" +#include "pgstat.h" +#include "portability/instr_time.h" +#include "postmaster/autovacuum.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "tcop/tcopprot.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/timestamp.h" + + +/* + * Space/time tradeoff parameters: do these need to be user-tunable? + * + * To consider truncating the relation, we want there to be at least + * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever + * is less) potentially-freeable pages. 
+ */ +#define REL_TRUNCATE_MINIMUM 1000 +#define REL_TRUNCATE_FRACTION 16 + +/* + * Timing parameters for truncate locking heuristics. + * + * These were not exposed as user tunable GUC values because it didn't seem + * that the potential for improvement was great enough to merit the cost of + * supporting them. + */ +#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */ +#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */ +#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ + +/* + * Threshold that controls whether we bypass index vacuuming and heap + * vacuuming as an optimization + */ +#define BYPASS_THRESHOLD_PAGES 0.02 /* i.e. 2% of rel_pages */ + +/* + * Perform a failsafe check each time we scan another 4GB of pages. + * (Note that this is deliberately kept to a power-of-two, usually 2^19.) + */ +#define FAILSAFE_EVERY_PAGES \ + ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ)) + +/* + * When a table has no indexes, vacuum the FSM after every 8GB, approximately + * (it won't be exact because we only vacuum FSM after processing a heap page + * that has some removable tuples). When there are indexes, this is ignored, + * and we vacuum FSM after each index/heap cleaning pass. + */ +#define VACUUM_FSM_EVERY_PAGES \ + ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) + +/* + * Before we consider skipping a page that's marked as clean in + * visibility map, we must've seen at least this many clean pages. + */ +#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) + +/* + * Size of the prefetch window for lazy vacuum backwards truncation scan. + * Needs to be a power of 2. + */ +#define PREFETCH_SIZE ((BlockNumber) 32) + +/* + * Macro to check if we are in a parallel vacuum. If true, we are in the + * parallel mode and the DSM segment is initialized. + */ +#define ParallelVacuumIsActive(vacrel) ((vacrel)->pvs != NULL) + +/* Phases of vacuum during which we report error context. */ +typedef enum +{ + VACUUM_ERRCB_PHASE_UNKNOWN, + VACUUM_ERRCB_PHASE_SCAN_HEAP, + VACUUM_ERRCB_PHASE_VACUUM_INDEX, + VACUUM_ERRCB_PHASE_VACUUM_HEAP, + VACUUM_ERRCB_PHASE_INDEX_CLEANUP, + VACUUM_ERRCB_PHASE_TRUNCATE +} VacErrPhase; + +typedef struct LVRelState +{ + /* Target heap relation and its indexes */ + Relation rel; + Relation *indrels; + int nindexes; + + /* Buffer access strategy and parallel vacuum state */ + BufferAccessStrategy bstrategy; + ParallelVacuumState *pvs; + + /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */ + bool aggressive; + /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */ + bool skipwithvm; + /* Consider index vacuuming bypass optimization? */ + bool consider_bypass_optimization; + + /* Doing index vacuuming, index cleanup, rel truncation? */ + bool do_index_vacuuming; + bool do_index_cleanup; + bool do_rel_truncate; + + /* VACUUM operation's cutoffs for freezing and pruning */ + struct VacuumCutoffs cutoffs; + GlobalVisState *vistest; + /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */ + TransactionId NewRelfrozenXid; + MultiXactId NewRelminMxid; + bool skippedallvis; + + /* Error reporting state */ + char *dbname; + char *relnamespace; + char *relname; + char *indname; /* Current index name */ + BlockNumber blkno; /* used only for heap operations */ + OffsetNumber offnum; /* used only for heap operations */ + VacErrPhase phase; + bool verbose; /* VACUUM VERBOSE? */ + + /* + * dead_items stores TIDs whose index tuples are deleted by index + * vacuuming. 
Each TID points to an LP_DEAD line pointer from a heap page + * that has been processed by lazy_scan_prune. Also needed by + * lazy_vacuum_tdeheap_rel, which marks the same LP_DEAD line pointers as + * LP_UNUSED during second heap pass. + */ + VacDeadItems *dead_items; /* TIDs whose index tuples we'll delete */ + BlockNumber rel_pages; /* total number of pages */ + BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ + BlockNumber removed_pages; /* # pages removed by relation truncation */ + BlockNumber frozen_pages; /* # pages with newly frozen tuples */ + BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */ + BlockNumber missed_dead_pages; /* # pages with missed dead tuples */ + BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + + /* Statistics output by us, for table */ + double new_rel_tuples; /* new estimated total # of tuples */ + double new_live_tuples; /* new estimated total # of live tuples */ + /* Statistics output by index AMs */ + IndexBulkDeleteResult **indstats; + + /* Instrumentation counters */ + int num_index_scans; + /* Counters that follow are only for scanned_pages */ + int64 tuples_deleted; /* # deleted from table */ + int64 tuples_frozen; /* # newly frozen */ + int64 lpdead_items; /* # deleted from indexes */ + int64 live_tuples; /* # live tuples remaining */ + int64 recently_dead_tuples; /* # dead, but not yet removable */ + int64 missed_dead_tuples; /* # removable, but not removed */ +} LVRelState; + +/* + * State returned by lazy_scan_prune() + */ +typedef struct LVPagePruneState +{ + bool hastup; /* Page prevents rel truncation? */ + bool has_lpdead_items; /* includes existing LP_DEAD items */ + + /* + * State describes the proper VM bit states to set for the page following + * pruning and freezing. all_visible implies !has_lpdead_items, but don't + * trust all_frozen result unless all_visible is also set to true. + */ + bool all_visible; /* Every item visible to all? */ + bool all_frozen; /* provided all_visible is also true */ + TransactionId visibility_cutoff_xid; /* For recovery conflicts */ +} LVPagePruneState; + +/* Struct for saving and restoring vacuum error information. 
*/ +typedef struct LVSavedErrInfo +{ + BlockNumber blkno; + OffsetNumber offnum; + VacErrPhase phase; +} LVSavedErrInfo; + + +/* non-export function prototypes */ +static void lazy_scan_heap(LVRelState *vacrel); +static BlockNumber lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, + BlockNumber next_block, + bool *next_unskippable_allvis, + bool *skipping_current_range); +static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, + BlockNumber blkno, Page page, + bool sharelock, Buffer vmbuffer); +static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, + BlockNumber blkno, Page page, + LVPagePruneState *prunestate); +static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf, + BlockNumber blkno, Page page, + bool *hastup, bool *recordfreespace); +static void lazy_vacuum(LVRelState *vacrel); +static bool lazy_vacuum_all_indexes(LVRelState *vacrel); +static void lazy_vacuum_tdeheap_rel(LVRelState *vacrel); +static int lazy_vacuum_tdeheap_page(LVRelState *vacrel, BlockNumber blkno, + Buffer buffer, int index, Buffer vmbuffer); +static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); +static void lazy_cleanup_all_indexes(LVRelState *vacrel); +static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, + IndexBulkDeleteResult *istat, + double reltuples, + LVRelState *vacrel); +static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel, + IndexBulkDeleteResult *istat, + double reltuples, + bool estimated_count, + LVRelState *vacrel); +static bool should_attempt_truncation(LVRelState *vacrel); +static void lazy_truncate_heap(LVRelState *vacrel); +static BlockNumber count_nondeletable_pages(LVRelState *vacrel, + bool *lock_waiter_detected); +static void dead_items_alloc(LVRelState *vacrel, int nworkers); +static void dead_items_cleanup(LVRelState *vacrel); +static bool tdeheap_page_is_all_visible(LVRelState *vacrel, Buffer buf, + TransactionId *visibility_cutoff_xid, bool *all_frozen); +static void update_relstats_all_indexes(LVRelState *vacrel); +static void vacuum_error_callback(void *arg); +static void update_vacuum_error_info(LVRelState *vacrel, + LVSavedErrInfo *saved_vacrel, + int phase, BlockNumber blkno, + OffsetNumber offnum); +static void restore_vacuum_error_info(LVRelState *vacrel, + const LVSavedErrInfo *saved_vacrel); + + +/* + * tdeheap_vacuum_rel() -- perform VACUUM for one heap relation + * + * This routine sets things up for and then calls lazy_scan_heap, where + * almost all work actually takes place. Finalizes everything after call + * returns by managing relation truncation and updating rel's pg_class + * entry. (Also updates pg_class entries for any indexes that need it.) + * + * At entry, we have already established a transaction and opened + * and locked the relation. 
+ */ +void +tdeheap_vacuum_rel(Relation rel, VacuumParams *params, + BufferAccessStrategy bstrategy) +{ + LVRelState *vacrel; + bool verbose, + instrument, + skipwithvm, + frozenxid_updated, + minmulti_updated; + BlockNumber orig_rel_pages, + new_rel_pages, + new_rel_allvisible; + PGRUsage ru0; + TimestampTz starttime = 0; + PgStat_Counter startreadtime = 0, + startwritetime = 0; + WalUsage startwalusage = pgWalUsage; + BufferUsage startbufferusage = pgBufferUsage; + ErrorContextCallback errcallback; + char **indnames = NULL; + + verbose = (params->options & VACOPT_VERBOSE) != 0; + instrument = (verbose || (IsAutoVacuumWorkerProcess() && + params->log_min_duration >= 0)); + if (instrument) + { + pg_rusage_init(&ru0); + starttime = GetCurrentTimestamp(); + if (track_io_timing) + { + startreadtime = pgStatBlockReadTime; + startwritetime = pgStatBlockWriteTime; + } + } + + pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, + RelationGetRelid(rel)); + + /* + * Setup error traceback support for ereport() first. The idea is to set + * up an error context callback to display additional information on any + * error during a vacuum. During different phases of vacuum, we update + * the state so that the error context callback always display current + * information. + * + * Copy the names of heap rel into local memory for error reporting + * purposes, too. It isn't always safe to assume that we can get the name + * of each rel. It's convenient for code in lazy_scan_heap to always use + * these temp copies. + */ + vacrel = (LVRelState *) palloc0(sizeof(LVRelState)); + vacrel->dbname = get_database_name(MyDatabaseId); + vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel)); + vacrel->relname = pstrdup(RelationGetRelationName(rel)); + vacrel->indname = NULL; + vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN; + vacrel->verbose = verbose; + errcallback.callback = vacuum_error_callback; + errcallback.arg = vacrel; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* Set up high level stuff about rel and its indexes */ + vacrel->rel = rel; + vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, + &vacrel->indrels); + vacrel->bstrategy = bstrategy; + if (instrument && vacrel->nindexes > 0) + { + /* Copy index names used by instrumentation (not error reporting) */ + indnames = palloc(sizeof(char *) * vacrel->nindexes); + for (int i = 0; i < vacrel->nindexes; i++) + indnames[i] = pstrdup(RelationGetRelationName(vacrel->indrels[i])); + } + + /* + * The index_cleanup param either disables index vacuuming and cleanup or + * forces it to go ahead when we would otherwise apply the index bypass + * optimization. The default is 'auto', which leaves the final decision + * up to lazy_vacuum(). + * + * The truncate param allows user to avoid attempting relation truncation, + * though it can't force truncation to happen. + */ + Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED); + Assert(params->truncate != VACOPTVALUE_UNSPECIFIED && + params->truncate != VACOPTVALUE_AUTO); + + /* + * While VacuumFailSafeActive is reset to false before calling this, we + * still need to reset it here due to recursive calls. 
+ */ + VacuumFailsafeActive = false; + vacrel->consider_bypass_optimization = true; + vacrel->do_index_vacuuming = true; + vacrel->do_index_cleanup = true; + vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED); + if (params->index_cleanup == VACOPTVALUE_DISABLED) + { + /* Force disable index vacuuming up-front */ + vacrel->do_index_vacuuming = false; + vacrel->do_index_cleanup = false; + } + else if (params->index_cleanup == VACOPTVALUE_ENABLED) + { + /* Force index vacuuming. Note that failsafe can still bypass. */ + vacrel->consider_bypass_optimization = false; + } + else + { + /* Default/auto, make all decisions dynamically */ + Assert(params->index_cleanup == VACOPTVALUE_AUTO); + } + + /* Initialize page counters explicitly (be tidy) */ + vacrel->scanned_pages = 0; + vacrel->removed_pages = 0; + vacrel->frozen_pages = 0; + vacrel->lpdead_item_pages = 0; + vacrel->missed_dead_pages = 0; + vacrel->nonempty_pages = 0; + /* dead_items_alloc allocates vacrel->dead_items later on */ + + /* Allocate/initialize output statistics state */ + vacrel->new_rel_tuples = 0; + vacrel->new_live_tuples = 0; + vacrel->indstats = (IndexBulkDeleteResult **) + palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *)); + + /* Initialize remaining counters (be tidy) */ + vacrel->num_index_scans = 0; + vacrel->tuples_deleted = 0; + vacrel->tuples_frozen = 0; + vacrel->lpdead_items = 0; + vacrel->live_tuples = 0; + vacrel->recently_dead_tuples = 0; + vacrel->missed_dead_tuples = 0; + + /* + * Get cutoffs that determine which deleted tuples are considered DEAD, + * not just RECENTLY_DEAD, and which XIDs/MXIDs to freeze. Then determine + * the extent of the blocks that we'll scan in lazy_scan_heap. It has to + * happen in this order to ensure that the OldestXmin cutoff field works + * as an upper bound on the XIDs stored in the pages we'll actually scan + * (NewRelfrozenXid tracking must never be allowed to miss unfrozen XIDs). + * + * Next acquire vistest, a related cutoff that's used in tdeheap_page_prune. + * We expect vistest will always make tdeheap_page_prune remove any deleted + * tuple whose xmax is < OldestXmin. lazy_scan_prune must never become + * confused about whether a tuple should be frozen or removed. (In the + * future we might want to teach lazy_scan_prune to recompute vistest from + * time to time, to increase the number of dead tuples it can prune away.) + */ + vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs); + vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); + vacrel->vistest = GlobalVisTestFor(rel); + /* Initialize state used to track oldest extant XID/MXID */ + vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; + vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact; + vacrel->skippedallvis = false; + skipwithvm = true; + if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) + { + /* + * Force aggressive mode, and disable skipping blocks using the + * visibility map (even those set all-frozen) + */ + vacrel->aggressive = true; + skipwithvm = false; + } + + vacrel->skipwithvm = skipwithvm; + + if (verbose) + { + if (vacrel->aggressive) + ereport(INFO, + (errmsg("aggressively vacuuming \"%s.%s.%s\"", + vacrel->dbname, vacrel->relnamespace, + vacrel->relname))); + else + ereport(INFO, + (errmsg("vacuuming \"%s.%s.%s\"", + vacrel->dbname, vacrel->relnamespace, + vacrel->relname))); + } + + /* + * Allocate dead_items array memory using dead_items_alloc. 
This handles + * parallel VACUUM initialization as part of allocating shared memory + * space used for dead_items. (But do a failsafe precheck first, to + * ensure that parallel VACUUM won't be attempted at all when relfrozenxid + * is already dangerously old.) + */ + lazy_check_wraparound_failsafe(vacrel); + dead_items_alloc(vacrel, params->nworkers); + + /* + * Call lazy_scan_heap to perform all required heap pruning, index + * vacuuming, and heap vacuuming (plus related processing) + */ + lazy_scan_heap(vacrel); + + /* + * Free resources managed by dead_items_alloc. This ends parallel mode in + * passing when necessary. + */ + dead_items_cleanup(vacrel); + Assert(!IsInParallelMode()); + + /* + * Update pg_class entries for each of rel's indexes where appropriate. + * + * Unlike the later update to rel's pg_class entry, this is not critical. + * Maintains relpages/reltuples statistics used by the planner only. + */ + if (vacrel->do_index_cleanup) + update_relstats_all_indexes(vacrel); + + /* Done with rel's indexes */ + vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock); + + /* Optionally truncate rel */ + if (should_attempt_truncation(vacrel)) + lazy_truncate_heap(vacrel); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + /* Report that we are now doing final cleanup */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); + + /* + * Prepare to update rel's pg_class entry. + * + * Aggressive VACUUMs must always be able to advance relfrozenxid to a + * value >= FreezeLimit, and relminmxid to a value >= MultiXactCutoff. + * Non-aggressive VACUUMs may advance them by any amount, or not at all. + */ + Assert(vacrel->NewRelfrozenXid == vacrel->cutoffs.OldestXmin || + TransactionIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.FreezeLimit : + vacrel->cutoffs.relfrozenxid, + vacrel->NewRelfrozenXid)); + Assert(vacrel->NewRelminMxid == vacrel->cutoffs.OldestMxact || + MultiXactIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.MultiXactCutoff : + vacrel->cutoffs.relminmxid, + vacrel->NewRelminMxid)); + if (vacrel->skippedallvis) + { + /* + * Must keep original relfrozenxid in a non-aggressive VACUUM that + * chose to skip an all-visible page range. The state that tracks new + * values will have missed unfrozen XIDs from the pages we skipped. + */ + Assert(!vacrel->aggressive); + vacrel->NewRelfrozenXid = InvalidTransactionId; + vacrel->NewRelminMxid = InvalidMultiXactId; + } + + /* + * For safety, clamp relallvisible to be not more than what we're setting + * pg_class.relpages to + */ + new_rel_pages = vacrel->rel_pages; /* After possible rel truncation */ + tdeheap_visibilitymap_count(rel, &new_rel_allvisible, NULL); + if (new_rel_allvisible > new_rel_pages) + new_rel_allvisible = new_rel_pages; + + /* + * Now actually update rel's pg_class entry. + * + * In principle new_live_tuples could be -1 indicating that we (still) + * don't know the tuple count. In practice that can't happen, since we + * scan every page that isn't skipped using the visibility map. + */ + vac_update_relstats(rel, new_rel_pages, vacrel->new_live_tuples, + new_rel_allvisible, vacrel->nindexes > 0, + vacrel->NewRelfrozenXid, vacrel->NewRelminMxid, + &frozenxid_updated, &minmulti_updated, false); + + /* + * Report results to the cumulative stats system, too. + * + * Deliberately avoid telling the stats system about LP_DEAD items that + * remain in the table due to VACUUM bypassing index and heap vacuuming. 
+ * ANALYZE will consider the remaining LP_DEAD items to be dead "tuples". + * It seems like a good idea to err on the side of not vacuuming again too + * soon in cases where the failsafe prevented significant amounts of heap + * vacuuming. + */ + pgstat_report_vacuum(RelationGetRelid(rel), + rel->rd_rel->relisshared, + Max(vacrel->new_live_tuples, 0), + vacrel->recently_dead_tuples + + vacrel->missed_dead_tuples); + pgstat_progress_end_command(); + + if (instrument) + { + TimestampTz endtime = GetCurrentTimestamp(); + + if (verbose || params->log_min_duration == 0 || + TimestampDifferenceExceeds(starttime, endtime, + params->log_min_duration)) + { + long secs_dur; + int usecs_dur; + WalUsage walusage; + BufferUsage bufferusage; + StringInfoData buf; + char *msgfmt; + int32 diff; + double read_rate = 0, + write_rate = 0; + + TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); + memset(&walusage, 0, sizeof(WalUsage)); + WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); + memset(&bufferusage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); + + initStringInfo(&buf); + if (verbose) + { + /* + * Aggressiveness already reported earlier, in dedicated + * VACUUM VERBOSE ereport + */ + Assert(!params->is_wraparound); + msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n"); + } + else if (params->is_wraparound) + { + /* + * While it's possible for a VACUUM to be both is_wraparound + * and !aggressive, that's just a corner-case -- is_wraparound + * implies aggressive. Produce distinct output for the corner + * case all the same, just in case. + */ + if (vacrel->aggressive) + msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); + else + msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); + } + else + { + if (vacrel->aggressive) + msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n"); + else + msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"); + } + appendStringInfo(&buf, msgfmt, + vacrel->dbname, + vacrel->relnamespace, + vacrel->relname, + vacrel->num_index_scans); + appendStringInfo(&buf, _("pages: %u removed, %u remain, %u scanned (%.2f%% of total)\n"), + vacrel->removed_pages, + new_rel_pages, + vacrel->scanned_pages, + orig_rel_pages == 0 ? 
100.0 : + 100.0 * vacrel->scanned_pages / orig_rel_pages); + appendStringInfo(&buf, + _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable\n"), + (long long) vacrel->tuples_deleted, + (long long) vacrel->new_rel_tuples, + (long long) vacrel->recently_dead_tuples); + if (vacrel->missed_dead_tuples > 0) + appendStringInfo(&buf, + _("tuples missed: %lld dead from %u pages not removed due to cleanup lock contention\n"), + (long long) vacrel->missed_dead_tuples, + vacrel->missed_dead_pages); + diff = (int32) (ReadNextTransactionId() - + vacrel->cutoffs.OldestXmin); + appendStringInfo(&buf, + _("removable cutoff: %u, which was %d XIDs old when operation ended\n"), + vacrel->cutoffs.OldestXmin, diff); + if (frozenxid_updated) + { + diff = (int32) (vacrel->NewRelfrozenXid - + vacrel->cutoffs.relfrozenxid); + appendStringInfo(&buf, + _("new relfrozenxid: %u, which is %d XIDs ahead of previous value\n"), + vacrel->NewRelfrozenXid, diff); + } + if (minmulti_updated) + { + diff = (int32) (vacrel->NewRelminMxid - + vacrel->cutoffs.relminmxid); + appendStringInfo(&buf, + _("new relminmxid: %u, which is %d MXIDs ahead of previous value\n"), + vacrel->NewRelminMxid, diff); + } + appendStringInfo(&buf, _("frozen: %u pages from table (%.2f%% of total) had %lld tuples frozen\n"), + vacrel->frozen_pages, + orig_rel_pages == 0 ? 100.0 : + 100.0 * vacrel->frozen_pages / orig_rel_pages, + (long long) vacrel->tuples_frozen); + if (vacrel->do_index_vacuuming) + { + if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0) + appendStringInfoString(&buf, _("index scan not needed: ")); + else + appendStringInfoString(&buf, _("index scan needed: ")); + + msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n"); + } + else + { + if (!VacuumFailsafeActive) + appendStringInfoString(&buf, _("index scan bypassed: ")); + else + appendStringInfoString(&buf, _("index scan bypassed by failsafe: ")); + + msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n"); + } + appendStringInfo(&buf, msgfmt, + vacrel->lpdead_item_pages, + orig_rel_pages == 0 ? 
100.0 : + 100.0 * vacrel->lpdead_item_pages / orig_rel_pages, + (long long) vacrel->lpdead_items); + for (int i = 0; i < vacrel->nindexes; i++) + { + IndexBulkDeleteResult *istat = vacrel->indstats[i]; + + if (!istat) + continue; + + appendStringInfo(&buf, + _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"), + indnames[i], + istat->num_pages, + istat->pages_newly_deleted, + istat->pages_deleted, + istat->pages_free); + } + if (track_io_timing) + { + double read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000; + double write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000; + + appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"), + read_ms, write_ms); + } + if (secs_dur > 0 || usecs_dur > 0) + { + read_rate = (double) BLCKSZ * (bufferusage.shared_blks_read + bufferusage.local_blks_read) / + (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0); + write_rate = (double) BLCKSZ * (bufferusage.shared_blks_dirtied + bufferusage.local_blks_dirtied) / + (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0); + } + appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), + read_rate, write_rate); + appendStringInfo(&buf, + _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"), + (long long) (bufferusage.shared_blks_hit + bufferusage.local_blks_hit), + (long long) (bufferusage.shared_blks_read + bufferusage.local_blks_read), + (long long) (bufferusage.shared_blks_dirtied + bufferusage.local_blks_dirtied)); + appendStringInfo(&buf, + _("WAL usage: %lld records, %lld full page images, %llu bytes\n"), + (long long) walusage.wal_records, + (long long) walusage.wal_fpi, + (unsigned long long) walusage.wal_bytes); + appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); + + ereport(verbose ? INFO : LOG, + (errmsg_internal("%s", buf.data))); + pfree(buf.data); + } + } + + /* Cleanup index statistics and index names */ + for (int i = 0; i < vacrel->nindexes; i++) + { + if (vacrel->indstats[i]) + pfree(vacrel->indstats[i]); + + if (instrument) + pfree(indnames[i]); + } +} + +/* + * lazy_scan_heap() -- workhorse function for VACUUM + * + * This routine prunes each page in the heap, and considers the need to + * freeze remaining tuples with storage (not including pages that can be + * skipped using the visibility map). Also performs related maintenance + * of the FSM and visibility map. These steps all take place during an + * initial pass over the target heap relation. + * + * Also invokes lazy_vacuum_all_indexes to vacuum indexes, which largely + * consists of deleting index tuples that point to LP_DEAD items left in + * heap pages following pruning. Earlier initial pass over the heap will + * have collected the TIDs whose index tuples need to be removed. + * + * Finally, invokes lazy_vacuum_tdeheap_rel to vacuum heap pages, which + * largely consists of marking LP_DEAD items (from collected TID array) + * as LP_UNUSED. This has to happen in a second, final pass over the + * heap, to preserve a basic invariant that all index AMs rely on: no + * extant index tuple can ever be allowed to contain a TID that points to + * an LP_UNUSED line pointer in the heap. We must disallow premature + * recycling of line pointers to avoid index scans that get confused + * about which TID points to which tuple immediately after recycling. 
+ * (Actually, this isn't a concern when target heap relation happens to + * have no indexes, which allows us to safely apply the one-pass strategy + * as an optimization). + * + * In practice we often have enough space to fit all TIDs, and so won't + * need to call lazy_vacuum more than once, after our initial pass over + * the heap has totally finished. Otherwise things are slightly more + * complicated: our "initial pass" over the heap applies only to those + * pages that were pruned before we needed to call lazy_vacuum, and our + * "final pass" over the heap only vacuums these same heap pages. + * However, we process indexes in full every time lazy_vacuum is called, + * which makes index processing very inefficient when memory is in short + * supply. + */ +static void +lazy_scan_heap(LVRelState *vacrel) +{ + BlockNumber rel_pages = vacrel->rel_pages, + blkno, + next_unskippable_block, + next_fsm_block_to_vacuum = 0; + VacDeadItems *dead_items = vacrel->dead_items; + Buffer vmbuffer = InvalidBuffer; + bool next_unskippable_allvis, + skipping_current_range; + const int initprog_index[] = { + PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_TOTAL_HEAP_BLKS, + PROGRESS_VACUUM_MAX_DEAD_TUPLES + }; + int64 initprog_val[3]; + + /* Report that we're scanning the heap, advertising total # of blocks */ + initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; + initprog_val[1] = rel_pages; + initprog_val[2] = dead_items->max_items; + pgstat_progress_update_multi_param(3, initprog_index, initprog_val); + + /* Set up an initial range of skippable blocks using the visibility map */ + next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, + &next_unskippable_allvis, + &skipping_current_range); + for (blkno = 0; blkno < rel_pages; blkno++) + { + Buffer buf; + Page page; + bool all_visible_according_to_vm; + LVPagePruneState prunestate; + + if (blkno == next_unskippable_block) + { + /* + * Can't skip this page safely. Must scan the page. But + * determine the next skippable range after the page first. + */ + all_visible_according_to_vm = next_unskippable_allvis; + next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, + blkno + 1, + &next_unskippable_allvis, + &skipping_current_range); + + Assert(next_unskippable_block >= blkno + 1); + } + else + { + /* Last page always scanned (may need to set nonempty_pages) */ + Assert(blkno < rel_pages - 1); + + if (skipping_current_range) + continue; + + /* Current range is too small to skip -- just scan the page */ + all_visible_according_to_vm = true; + } + + vacrel->scanned_pages++; + + /* Report as block scanned, update error traceback information */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP, + blkno, InvalidOffsetNumber); + + vacuum_delay_point(); + + /* + * Regularly check if wraparound failsafe should trigger. + * + * There is a similar check inside lazy_vacuum_all_indexes(), but + * relfrozenxid might start to look dangerously old before we reach + * that point. This check also provides failsafe coverage for the + * one-pass strategy, and the two-pass strategy with the index_cleanup + * param set to 'off'. + */ + if (vacrel->scanned_pages % FAILSAFE_EVERY_PAGES == 0) + lazy_check_wraparound_failsafe(vacrel); + + /* + * Consider if we definitely have enough space to process TIDs on page + * already. If we are close to overrunning the available space for + * dead_items TIDs, pause and do a cycle of vacuuming before we tackle + * this page. 
+ */ + Assert(dead_items->max_items >= MaxHeapTuplesPerPage); + if (dead_items->max_items - dead_items->num_items < MaxHeapTuplesPerPage) + { + /* + * Before beginning index vacuuming, we release any pin we may + * hold on the visibility map page. This isn't necessary for + * correctness, but we do it anyway to avoid holding the pin + * across a lengthy, unrelated operation. + */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + + /* Perform a round of index and heap vacuuming */ + vacrel->consider_bypass_optimization = false; + lazy_vacuum(vacrel); + + /* + * Vacuum the Free Space Map to make newly-freed space visible on + * upper-level FSM pages. Note we have not yet processed blkno. + */ + FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, + blkno); + next_fsm_block_to_vacuum = blkno; + + /* Report that we are once again scanning the heap */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_SCAN_HEAP); + } + + /* + * Pin the visibility map page in case we need to mark the page + * all-visible. In most cases this will be very cheap, because we'll + * already have the correct page pinned anyway. + */ + tdeheap_visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + + /* + * We need a buffer cleanup lock to prune HOT chains and defragment + * the page in lazy_scan_prune. But when it's not possible to acquire + * a cleanup lock right away, we may be able to settle for reduced + * processing using lazy_scan_noprune. + */ + buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + vacrel->bstrategy); + page = BufferGetPage(buf); + if (!ConditionalLockBufferForCleanup(buf)) + { + bool hastup, + recordfreespace; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + /* Check for new or empty pages before lazy_scan_noprune call */ + if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, true, + vmbuffer)) + { + /* Processed as new/empty page (lock and pin released) */ + continue; + } + + /* Collect LP_DEAD items in dead_items array, count tuples */ + if (lazy_scan_noprune(vacrel, buf, blkno, page, &hastup, + &recordfreespace)) + { + Size freespace = 0; + + /* + * Processed page successfully (without cleanup lock) -- just + * need to perform rel truncation and FSM steps, much like the + * lazy_scan_prune case. Don't bother trying to match its + * visibility map setting steps, though. + */ + if (hastup) + vacrel->nonempty_pages = blkno + 1; + if (recordfreespace) + freespace = PageGetHeapFreeSpace(page); + UnlockReleaseBuffer(buf); + if (recordfreespace) + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + continue; + } + + /* + * lazy_scan_noprune could not do all required processing. Wait + * for a cleanup lock, and call lazy_scan_prune in the usual way. + */ + Assert(vacrel->aggressive); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBufferForCleanup(buf); + } + + /* Check for new or empty pages before lazy_scan_prune call */ + if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, false, vmbuffer)) + { + /* Processed as new/empty page (lock and pin released) */ + continue; + } + + /* + * Prune, freeze, and count tuples. + * + * Accumulates details of remaining LP_DEAD line pointers on page in + * dead_items array. This includes LP_DEAD line pointers that we + * pruned ourselves, as well as existing LP_DEAD line pointers that + * were pruned some time earlier. Also considers freezing XIDs in the + * tuple headers of remaining items with storage. 
+ */ + lazy_scan_prune(vacrel, buf, blkno, page, &prunestate); + + Assert(!prunestate.all_visible || !prunestate.has_lpdead_items); + + /* Remember the location of the last page with nonremovable tuples */ + if (prunestate.hastup) + vacrel->nonempty_pages = blkno + 1; + + if (vacrel->nindexes == 0) + { + /* + * Consider the need to do page-at-a-time heap vacuuming when + * using the one-pass strategy now. + * + * The one-pass strategy will never call lazy_vacuum(). The steps + * performed here can be thought of as the one-pass equivalent of + * a call to lazy_vacuum(). + */ + if (prunestate.has_lpdead_items) + { + Size freespace; + + lazy_vacuum_tdeheap_page(vacrel, blkno, buf, 0, vmbuffer); + + /* Forget the LP_DEAD items that we just vacuumed */ + dead_items->num_items = 0; + + /* + * Periodically perform FSM vacuuming to make newly-freed + * space visible on upper FSM pages. Note we have not yet + * performed FSM processing for blkno. + */ + if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) + { + FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, + blkno); + next_fsm_block_to_vacuum = blkno; + } + + /* + * Now perform FSM processing for blkno, and move on to next + * page. + * + * Our call to lazy_vacuum_tdeheap_page() will have considered if + * it's possible to set all_visible/all_frozen independently + * of lazy_scan_prune(). Note that prunestate was invalidated + * by lazy_vacuum_tdeheap_page() call. + */ + freespace = PageGetHeapFreeSpace(page); + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + continue; + } + + /* + * There was no call to lazy_vacuum_tdeheap_page() because pruning + * didn't encounter/create any LP_DEAD items that needed to be + * vacuumed. Prune state has not been invalidated, so proceed + * with prunestate-driven visibility map and FSM steps (just like + * the two-pass strategy). + */ + Assert(dead_items->num_items == 0); + } + + /* + * Handle setting visibility map bit based on information from the VM + * (as of last lazy_scan_skip() call), and from prunestate + */ + if (!all_visible_according_to_vm && prunestate.all_visible) + { + uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + + if (prunestate.all_frozen) + { + Assert(!TransactionIdIsValid(prunestate.visibility_cutoff_xid)); + flags |= VISIBILITYMAP_ALL_FROZEN; + } + + /* + * It should never be the case that the visibility map page is set + * while the page-level bit is clear, but the reverse is allowed + * (if checksums are not enabled). Regardless, set both bits so + * that we get back in sync. + * + * NB: If the heap page is all-visible but the VM bit is not set, + * we don't need to dirty the heap page. However, if checksums + * are enabled, we do need to make sure that the heap page is + * dirtied before passing it to tdeheap_visibilitymap_set(), because it + * may be logged. Given that this situation should only happen in + * rare cases after a crash, it is not worth optimizing. + */ + PageSetAllVisible(page); + MarkBufferDirty(buf); + tdeheap_visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, prunestate.visibility_cutoff_xid, + flags); + } + + /* + * As of PostgreSQL 9.2, the visibility map bit should never be set if + * the page-level bit is clear. However, it's possible that the bit + * got cleared after lazy_scan_skip() was called, so we must recheck + * with buffer lock before concluding that the VM is corrupt. 
+ */ + else if (all_visible_according_to_vm && !PageIsAllVisible(page) && + tdeheap_visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) + { + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + vacrel->relname, blkno); + tdeheap_visibilitymap_clear(vacrel->rel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + /* + * It's possible for the value returned by + * GetOldestNonRemovableTransactionId() to move backwards, so it's not + * wrong for us to see tuples that appear to not be visible to + * everyone yet, while PD_ALL_VISIBLE is already set. The real safe + * xmin value never moves backwards, but + * GetOldestNonRemovableTransactionId() is conservative and sometimes + * returns a value that's unnecessarily small, so if we see that + * contradiction it just means that the tuples that we think are not + * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag + * is correct. + * + * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE + * set, however. + */ + else if (prunestate.has_lpdead_items && PageIsAllVisible(page)) + { + elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", + vacrel->relname, blkno); + PageClearAllVisible(page); + MarkBufferDirty(buf); + tdeheap_visibilitymap_clear(vacrel->rel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + /* + * If the all-visible page is all-frozen but not marked as such yet, + * mark it as all-frozen. Note that all_frozen is only valid if + * all_visible is true, so we must check both prunestate fields. + */ + else if (all_visible_according_to_vm && prunestate.all_visible && + prunestate.all_frozen && + !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) + { + /* + * Avoid relying on all_visible_according_to_vm as a proxy for the + * page-level PD_ALL_VISIBLE bit being set, since it might have + * become stale -- even when all_visible is set in prunestate + */ + if (!PageIsAllVisible(page)) + { + PageSetAllVisible(page); + MarkBufferDirty(buf); + } + + /* + * Set the page all-frozen (and all-visible) in the VM. + * + * We can pass InvalidTransactionId as our visibility_cutoff_xid, + * since a snapshotConflictHorizon sufficient to make everything + * safe for REDO was logged when the page's tuples were frozen. + */ + Assert(!TransactionIdIsValid(prunestate.visibility_cutoff_xid)); + tdeheap_visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN); + } + + /* + * Final steps for block: drop cleanup lock, record free space in the + * FSM + */ + if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming) + { + /* + * Wait until lazy_vacuum_tdeheap_rel() to save free space. This + * doesn't just save us some cycles; it also allows us to record + * any additional free space that lazy_vacuum_tdeheap_page() will + * make available in cases where it's possible to truncate the + * page's line pointer array. + * + * Note: It's not in fact 100% certain that we really will call + * lazy_vacuum_tdeheap_rel() -- lazy_vacuum() might yet opt to skip + * index vacuuming (and so must skip heap vacuuming). This is + * deemed okay because it only happens in emergencies, or when + * there is very little free space anyway. (Besides, we start + * recording free space in the FSM once index vacuuming has been + * abandoned.) 
+ * + * Note: The one-pass (no indexes) case is only supposed to make + * it this far when there were no LP_DEAD items during pruning. + */ + Assert(vacrel->nindexes > 0); + UnlockReleaseBuffer(buf); + } + else + { + Size freespace = PageGetHeapFreeSpace(page); + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + } + } + + vacrel->blkno = InvalidBlockNumber; + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + + /* report that everything is now scanned */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + + /* now we can compute the new value for pg_class.reltuples */ + vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages, + vacrel->scanned_pages, + vacrel->live_tuples); + + /* + * Also compute the total number of surviving heap entries. In the + * (unlikely) scenario that new_live_tuples is -1, take it as zero. + */ + vacrel->new_rel_tuples = + Max(vacrel->new_live_tuples, 0) + vacrel->recently_dead_tuples + + vacrel->missed_dead_tuples; + + /* + * Do index vacuuming (call each index's ambulkdelete routine), then do + * related heap vacuuming + */ + if (dead_items->num_items > 0) + lazy_vacuum(vacrel); + + /* + * Vacuum the remainder of the Free Space Map. We must do this whether or + * not there were indexes, and whether or not we bypassed index vacuuming. + */ + if (blkno > next_fsm_block_to_vacuum) + FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno); + + /* report all blocks vacuumed */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + + /* Do final index cleanup (call each index's amvacuumcleanup routine) */ + if (vacrel->nindexes > 0 && vacrel->do_index_cleanup) + lazy_cleanup_all_indexes(vacrel); +} + +/* + * lazy_scan_skip() -- set up range of skippable blocks using visibility map. + * + * lazy_scan_heap() calls here every time it needs to set up a new range of + * blocks to skip via the visibility map. Caller passes the next block in + * line. We return a next_unskippable_block for this range. When there are + * no skippable blocks we just return caller's next_block. The all-visible + * status of the returned block is set in *next_unskippable_allvis for caller, + * too. Block usually won't be all-visible (since it's unskippable), but it + * can be during aggressive VACUUMs (as well as in certain edge cases). + * + * Sets *skipping_current_range to indicate if caller should skip this range. + * Costs and benefits drive our decision. Very small ranges won't be skipped. + * + * Note: our opinion of which blocks can be skipped can go stale immediately. + * It's okay if caller "misses" a page whose all-visible or all-frozen marking + * was concurrently cleared, though. All that matters is that caller scan all + * pages whose tuples might contain XIDs < OldestXmin, or MXIDs < OldestMxact. + * (Actually, non-aggressive VACUUMs can choose to skip all-visible pages with + * older XIDs/MXIDs. The vacrel->skippedallvis flag will be set here when the + * choice to skip such a range is actually made, making everything safe.) 
+ */ +static BlockNumber +lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block, + bool *next_unskippable_allvis, bool *skipping_current_range) +{ + BlockNumber rel_pages = vacrel->rel_pages, + next_unskippable_block = next_block, + nskippable_blocks = 0; + bool skipsallvis = false; + + *next_unskippable_allvis = true; + while (next_unskippable_block < rel_pages) + { + uint8 mapbits = tdeheap_visibilitymap_get_status(vacrel->rel, + next_unskippable_block, + vmbuffer); + + if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0) + { + Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0); + *next_unskippable_allvis = false; + break; + } + + /* + * Caller must scan the last page to determine whether it has tuples + * (caller must have the opportunity to set vacrel->nonempty_pages). + * This rule avoids having lazy_truncate_heap() take access-exclusive + * lock on rel to attempt a truncation that fails anyway, just because + * there are tuples on the last page (it is likely that there will be + * tuples on other nearby pages as well, but those can be skipped). + * + * Implement this by always treating the last block as unsafe to skip. + */ + if (next_unskippable_block == rel_pages - 1) + break; + + /* DISABLE_PAGE_SKIPPING makes all skipping unsafe */ + if (!vacrel->skipwithvm) + break; + + /* + * Aggressive VACUUM caller can't skip pages just because they are + * all-visible. They may still skip all-frozen pages, which can't + * contain XIDs < OldestXmin (XIDs that aren't already frozen by now). + */ + if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0) + { + if (vacrel->aggressive) + break; + + /* + * All-visible block is safe to skip in non-aggressive case. But + * remember that the final range contains such a block for later. + */ + skipsallvis = true; + } + + vacuum_delay_point(); + next_unskippable_block++; + nskippable_blocks++; + } + + /* + * We only skip a range with at least SKIP_PAGES_THRESHOLD consecutive + * pages. Since we're reading sequentially, the OS should be doing + * readahead for us, so there's no gain in skipping a page now and then. + * Skipping such a range might even discourage sequential detection. + * + * This test also enables more frequent relfrozenxid advancement during + * non-aggressive VACUUMs. If the range has any all-visible pages then + * skipping makes updating relfrozenxid unsafe, which is a real downside. + */ + if (nskippable_blocks < SKIP_PAGES_THRESHOLD) + *skipping_current_range = false; + else + { + *skipping_current_range = true; + if (skipsallvis) + vacrel->skippedallvis = true; + } + + return next_unskippable_block; +} + +/* + * lazy_scan_new_or_empty() -- lazy_scan_heap() new/empty page handling. + * + * Must call here to handle both new and empty pages before calling + * lazy_scan_prune or lazy_scan_noprune, since they're not prepared to deal + * with new or empty pages. + * + * It's necessary to consider new pages as a special case, since the rules for + * maintaining the visibility map and FSM with empty pages are a little + * different (though new pages can be truncated away during rel truncation). + * + * Empty pages are not really a special case -- they're just heap pages that + * have no allocated tuples (including even LP_UNUSED items). You might + * wonder why we need to handle them here all the same. It's only necessary + * because of a corner-case involving a hard crash during heap relation + * extension. 
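The same skip-range policy, reduced to a toy all-visible bitmap: scan forward to the first not-all-visible page (always stopping at the last page), then skip only if the run is long enough. THRESHOLD is a stand-in for SKIP_PAGES_THRESHOLD; the aggressive and all-frozen refinements are deliberately left out of this sketch.

#include <stdbool.h>
#include <stdio.h>

#define THRESHOLD 4             /* stand-in for SKIP_PAGES_THRESHOLD */

/* Return the next block that must be scanned, starting at next_block. */
static unsigned
next_unskippable(const bool *all_visible, unsigned rel_pages,
                 unsigned next_block, bool *skip_range)
{
    unsigned blk = next_block;

    while (blk < rel_pages)
    {
        if (!all_visible[blk])
            break;
        if (blk == rel_pages - 1)   /* always scan the last page */
            break;
        blk++;
    }

    /* only worth skipping when the all-visible run is long enough */
    *skip_range = (blk - next_block) >= THRESHOLD;
    return blk;
}

int main(void)
{
    bool vm[10] = {true, true, true, true, true, false, true, true, true, true};
    bool skip;
    unsigned next = next_unskippable(vm, 10, 0, &skip);

    printf("next unskippable block %u, skip range: %s\n",
           next, skip ? "yes" : "no");
    return 0;
}
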
If we ever make relation-extension crash safe, then it should + * no longer be necessary to deal with empty pages here (or new pages, for + * that matter). + * + * Caller must hold at least a shared lock. We might need to escalate the + * lock in that case, so the type of lock caller holds needs to be specified + * using 'sharelock' argument. + * + * Returns false in common case where caller should go on to call + * lazy_scan_prune (or lazy_scan_noprune). Otherwise returns true, indicating + * that lazy_scan_heap is done processing the page, releasing lock on caller's + * behalf. + */ +static bool +lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, + Page page, bool sharelock, Buffer vmbuffer) +{ + Size freespace; + + if (PageIsNew(page)) + { + /* + * All-zeroes pages can be left over if either a backend extends the + * relation by a single page, but crashes before the newly initialized + * page has been written out, or when bulk-extending the relation + * (which creates a number of empty pages at the tail end of the + * relation), and then enters them into the FSM. + * + * Note we do not enter the page into the visibilitymap. That has the + * downside that we repeatedly visit this page in subsequent vacuums, + * but otherwise we'll never discover the space on a promoted standby. + * The harm of repeated checking ought to normally not be too bad. The + * space usually should be used at some point, otherwise there + * wouldn't be any regular vacuums. + * + * Make sure these pages are in the FSM, to ensure they can be reused. + * Do that by testing if there's any space recorded for the page. If + * not, enter it. We do so after releasing the lock on the heap page, + * the FSM is approximate, after all. + */ + UnlockReleaseBuffer(buf); + + if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0) + { + freespace = BLCKSZ - SizeOfPageHeaderData; + + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + } + + return true; + } + + if (PageIsEmpty(page)) + { + /* + * It seems likely that caller will always be able to get a cleanup + * lock on an empty page. But don't take any chances -- escalate to + * an exclusive lock (still don't need a cleanup lock, though). + */ + if (sharelock) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (!PageIsEmpty(page)) + { + /* page isn't new or empty -- keep lock and pin for now */ + return false; + } + } + else + { + /* Already have a full cleanup lock (which is more than enough) */ + } + + /* + * Unlike new pages, empty pages are always set all-visible and + * all-frozen. + */ + if (!PageIsAllVisible(page)) + { + START_CRIT_SECTION(); + + /* mark buffer dirty before writing a WAL record */ + MarkBufferDirty(buf); + + /* + * It's possible that another backend has extended the heap, + * initialized the page, and then failed to WAL-log the page due + * to an ERROR. Since heap extension is not WAL-logged, recovery + * might try to replay our record setting the page all-visible and + * find that the page isn't initialized, which will cause a PANIC. + * To prevent that, check whether the page has been previously + * WAL-logged, and if not, do that now. 
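For a brand-new page, the free space recorded above is just the block minus its header. A one-liner using assumed sizes (an 8192-byte block and a 24-byte page header are typical defaults, not values taken from this patch):

#include <stdio.h>

#define BLCKSZ              8192    /* assumed block size */
#define SIZE_OF_PAGE_HEADER 24      /* assumed PageHeaderData size */

int main(void)
{
    /* what gets entered into the FSM for an all-zeroes page */
    printf("empty-page free space: %d bytes\n",
           BLCKSZ - SIZE_OF_PAGE_HEADER);
    return 0;
}
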
+ */ + if (RelationNeedsWAL(vacrel->rel) && + PageGetLSN(page) == InvalidXLogRecPtr) + log_newpage_buffer(buf, true); + + PageSetAllVisible(page); + tdeheap_visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + END_CRIT_SECTION(); + } + + freespace = PageGetHeapFreeSpace(page); + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + return true; + } + + /* page isn't new or empty -- keep lock and pin */ + return false; +} + +/* + * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing. + * + * Caller must hold pin and buffer cleanup lock on the buffer. + * + * Prior to PostgreSQL 14 there were very rare cases where tdeheap_page_prune() + * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about + * whether or not a tuple should be considered DEAD. This happened when an + * inserting transaction concurrently aborted (after our tdeheap_page_prune() + * call, before our HeapTupleSatisfiesVacuum() call). There was rather a lot + * of complexity just so we could deal with tuples that were DEAD to VACUUM, + * but nevertheless were left with storage after pruning. + * + * The approach we take now is to restart pruning when the race condition is + * detected. This allows tdeheap_page_prune() to prune the tuples inserted by + * the now-aborted transaction. This is a little crude, but it guarantees + * that any items that make it into the dead_items array are simple LP_DEAD + * line pointers, and that every remaining item with tuple storage is + * considered as a candidate for freezing. + */ +static void +lazy_scan_prune(LVRelState *vacrel, + Buffer buf, + BlockNumber blkno, + Page page, + LVPagePruneState *prunestate) +{ + Relation rel = vacrel->rel; + OffsetNumber offnum, + maxoff; + ItemId itemid; + HeapTupleData tuple; + HTSV_Result res; + int tuples_deleted, + tuples_frozen, + lpdead_items, + live_tuples, + recently_dead_tuples; + int nnewlpdead; + HeapPageFreeze pagefrz; + int64 fpi_before = pgWalUsage.wal_fpi; + OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; + HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; + + Assert(BufferGetBlockNumber(buf) == blkno); + + /* + * maxoff might be reduced following line pointer array truncation in + * tdeheap_page_prune. That's safe for us to ignore, since the reclaimed + * space will continue to look like LP_UNUSED items below. + */ + maxoff = PageGetMaxOffsetNumber(page); + +retry: + + /* Initialize (or reset) page-level state */ + pagefrz.freeze_required = false; + pagefrz.FreezePageRelfrozenXid = vacrel->NewRelfrozenXid; + pagefrz.FreezePageRelminMxid = vacrel->NewRelminMxid; + pagefrz.NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid; + pagefrz.NoFreezePageRelminMxid = vacrel->NewRelminMxid; + tuples_deleted = 0; + tuples_frozen = 0; + lpdead_items = 0; + live_tuples = 0; + recently_dead_tuples = 0; + + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as tuples_deleted. Its + * final value can be thought of as the number of tuples that have been + * deleted from the table. It should not be confused with lpdead_items; + * lpdead_items's final value can be thought of as the number of tuples + * that were deleted from indexes. 
+ */ + tuples_deleted = tdeheap_page_prune(rel, buf, vacrel->vistest, + InvalidTransactionId, 0, &nnewlpdead, + &vacrel->offnum); + + /* + * Now scan the page to collect LP_DEAD items and check for tuples + * requiring freezing among remaining tuples with storage + */ + prunestate->hastup = false; + prunestate->has_lpdead_items = false; + prunestate->all_visible = true; + prunestate->all_frozen = true; + prunestate->visibility_cutoff_xid = InvalidTransactionId; + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + bool totally_frozen; + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + vacrel->offnum = offnum; + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(itemid)) + continue; + + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + /* page makes rel truncation unsafe */ + prunestate->hastup = true; + continue; + } + + if (ItemIdIsDead(itemid)) + { + /* + * Deliberately don't set hastup for LP_DEAD items. We make the + * soft assumption that any LP_DEAD items encountered here will + * become LP_UNUSED later on, before count_nondeletable_pages is + * reached. If we don't make this assumption then rel truncation + * will only happen every other VACUUM, at most. Besides, VACUUM + * must treat hastup/nonempty_pages as provisional no matter how + * LP_DEAD items are handled (handled here, or handled later on). + * + * Also deliberately delay unsetting all_visible until just before + * we return to lazy_scan_heap caller, as explained in full below. + * (This is another case where it's useful to anticipate that any + * LP_DEAD items will become LP_UNUSED during the ongoing VACUUM.) + */ + deadoffsets[lpdead_items++] = offnum; + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + /* + * DEAD tuples are almost always pruned into LP_DEAD line pointers by + * tdeheap_page_prune(), but it's possible that the tuple state changed + * since tdeheap_page_prune() looked. Handle that here by restarting. + * (See comments at the top of function for a full explanation.) + */ + res = HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, + buf); + + if (unlikely(res == HEAPTUPLE_DEAD)) + goto retry; + + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM + * and ANALYZE may produce wildly different reltuples values, e.g. + * when there are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as + * VACUUM can't run inside a transaction block, which makes some cases + * impossible (e.g. in-progress insert from the same transaction). + * + * We treat LP_DEAD items (which are the closest thing to DEAD tuples + * that might be seen here) differently, too: we assume that they'll + * become LP_UNUSED before VACUUM finishes. This difference is only + * superficial. VACUUM effectively agrees with ANALYZE about DEAD + * items, in the end. VACUUM won't remember LP_DEAD items, but only + * because they're not supposed to be left behind when it is done. + * (Cases where we bypass index vacuuming will violate this optimistic + * assumption, but the overall impact of that should be negligible.) 
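The goto retry above is a plain "restart the page scan when a race is detected" loop. A stripped-down illustration of that control flow with a simulated race; none of the names below are pg_tde or PostgreSQL symbols:

#include <stdbool.h>
#include <stdio.h>

/* Simulated page: one item only becomes prunable after a second prune pass. */
static int prune_calls = 0;

static void prune_page(void) { prune_calls++; }

static bool item_is_dead(int item)
{
    return item == 2 && prune_calls < 2;
}

int main(void)
{
    int live;

retry:
    prune_page();
    live = 0;
    for (int item = 0; item < 4; item++)
    {
        if (item_is_dead(item))
            goto retry;         /* race detected: reprune and rescan the page */
        live++;
    }
    printf("scanned after %d prune pass(es), %d live items\n",
           prune_calls, live);
    return 0;
}
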
+ */ + switch (res) + { + case HEAPTUPLE_LIVE: + + /* + * Count it as live. Not only is this natural, but it's also + * what acquire_sample_rows() does. + */ + live_tuples++; + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed + * asynchronously. See SetHintBits for more info. Check that + * the tuple is hinted xmin-committed because of that. + */ + if (prunestate->all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + prunestate->all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old enough + * that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, + vacrel->cutoffs.OldestXmin)) + { + prunestate->all_visible = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid) && + TransactionIdIsNormal(xmin)) + prunestate->visibility_cutoff_xid = xmin; + } + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently dead then we must not remove it from + * the relation. (We only remove items that are LP_DEAD from + * pruning.) + */ + recently_dead_tuples++; + prunestate->all_visible = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * We do not count these rows as live, because we expect the + * inserting transaction to update the counters at commit, and + * we assume that will happen only after we report our + * results. This assumption is a bit shaky, but it is what + * acquire_sample_rows() does, so be consistent. + */ + prunestate->all_visible = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + prunestate->all_visible = false; + + /* + * Count such rows as live. As above, we assume the deleting + * transaction will commit and update the counters after we + * report. + */ + live_tuples++; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + prunestate->hastup = true; /* page makes rel truncation unsafe */ + + /* Tuple with storage -- consider need to freeze */ + if (tdeheap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs, &pagefrz, + &frozen[tuples_frozen], &totally_frozen)) + { + /* Save prepared freeze plan for later */ + frozen[tuples_frozen++].offset = offnum; + } + + /* + * If any tuple isn't either totally frozen already or eligible to + * become totally frozen (according to its freeze plan), then the page + * definitely cannot be set all-frozen in the visibility map later on + */ + if (!totally_frozen) + prunestate->all_frozen = false; + } + + /* + * We have now divided every item on the page into either an LP_DEAD item + * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple + * that remains and needs to be considered for freezing now (LP_UNUSED and + * LP_REDIRECT items also remain, but are of no further interest to us). + */ + vacrel->offnum = InvalidOffsetNumber; + + /* + * Freeze the page when tdeheap_prepare_freeze_tuple indicates that at least + * one XID/MXID from before FreezeLimit/MultiXactCutoff is present. Also + * freeze when pruning generated an FPI, if doing so means that we set the + * page all-frozen afterwards (might not happen until final heap pass). 
+ */ + if (pagefrz.freeze_required || tuples_frozen == 0 || + (prunestate->all_visible && prunestate->all_frozen && + fpi_before != pgWalUsage.wal_fpi)) + { + /* + * We're freezing the page. Our final NewRelfrozenXid doesn't need to + * be affected by the XIDs that are just about to be frozen anyway. + */ + vacrel->NewRelfrozenXid = pagefrz.FreezePageRelfrozenXid; + vacrel->NewRelminMxid = pagefrz.FreezePageRelminMxid; + + if (tuples_frozen == 0) + { + /* + * We have no freeze plans to execute, so there's no added cost + * from following the freeze path. That's why it was chosen. This + * is important in the case where the page only contains totally + * frozen tuples at this point (perhaps only following pruning). + * Such pages can be marked all-frozen in the VM by our caller, + * even though none of its tuples were newly frozen here (note + * that the "no freeze" path never sets pages all-frozen). + * + * We never increment the frozen_pages instrumentation counter + * here, since it only counts pages with newly frozen tuples + * (don't confuse that with pages newly set all-frozen in VM). + */ + } + else + { + TransactionId snapshotConflictHorizon; + + vacrel->frozen_pages++; + + /* + * We can use visibility_cutoff_xid as our cutoff for conflicts + * when the whole page is eligible to become all-frozen in the VM + * once we're done with it. Otherwise we generate a conservative + * cutoff by stepping back from OldestXmin. + */ + if (prunestate->all_visible && prunestate->all_frozen) + { + /* Using same cutoff when setting VM is now unnecessary */ + snapshotConflictHorizon = prunestate->visibility_cutoff_xid; + prunestate->visibility_cutoff_xid = InvalidTransactionId; + } + else + { + /* Avoids false conflicts when hot_standby_feedback in use */ + snapshotConflictHorizon = vacrel->cutoffs.OldestXmin; + TransactionIdRetreat(snapshotConflictHorizon); + } + + /* Execute all freeze plans for page as a single atomic action */ + tdeheap_freeze_execute_prepared(vacrel->rel, buf, + snapshotConflictHorizon, + frozen, tuples_frozen); + } + } + else + { + /* + * Page requires "no freeze" processing. It might be set all-visible + * in the visibility map, but it can never be set all-frozen. + */ + vacrel->NewRelfrozenXid = pagefrz.NoFreezePageRelfrozenXid; + vacrel->NewRelminMxid = pagefrz.NoFreezePageRelminMxid; + prunestate->all_frozen = false; + tuples_frozen = 0; /* avoid miscounts in instrumentation */ + } + + /* + * VACUUM will call tdeheap_page_is_all_visible() during the second pass over + * the heap to determine all_visible and all_frozen for the page -- this + * is a specialized version of the logic from this function. Now that + * we've finished pruning and freezing, make sure that we're in total + * agreement with tdeheap_page_is_all_visible() using an assertion. 
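Condensed, the freeze-versus-no-freeze choice above is a three-way OR. A throwaway predicate capturing it, with local stand-in parameters rather than the pagefrz/prunestate fields:

#include <stdbool.h>
#include <stdio.h>

/* Freeze if required, if there are no freeze plans anyway, or if pruning
 * already emitted a full-page image and the page would end up all-frozen. */
static bool
should_freeze_page(bool freeze_required, int tuples_frozen,
                   bool all_visible, bool all_frozen, bool pruning_emitted_fpi)
{
    return freeze_required ||
           tuples_frozen == 0 ||
           (all_visible && all_frozen && pruning_emitted_fpi);
}

int main(void)
{
    printf("%d\n", should_freeze_page(false, 3, true, true, true));   /* 1 */
    printf("%d\n", should_freeze_page(false, 3, true, false, true));  /* 0 */
    return 0;
}
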
+ */ +#ifdef USE_ASSERT_CHECKING + /* Note that all_frozen value does not matter when !all_visible */ + if (prunestate->all_visible && lpdead_items == 0) + { + TransactionId cutoff; + bool all_frozen; + + if (!tdeheap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen)) + Assert(false); + + Assert(!TransactionIdIsValid(cutoff) || + cutoff == prunestate->visibility_cutoff_xid); + } +#endif + + /* + * Now save details of the LP_DEAD items from the page in vacrel + */ + if (lpdead_items > 0) + { + VacDeadItems *dead_items = vacrel->dead_items; + ItemPointerData tmp; + + vacrel->lpdead_item_pages++; + prunestate->has_lpdead_items = true; + + ItemPointerSetBlockNumber(&tmp, blkno); + + for (int i = 0; i < lpdead_items; i++) + { + ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); + dead_items->items[dead_items->num_items++] = tmp; + } + + Assert(dead_items->num_items <= dead_items->max_items); + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, + dead_items->num_items); + + /* + * It was convenient to ignore LP_DEAD items in all_visible earlier on + * to make the choice of whether or not to freeze the page unaffected + * by the short-term presence of LP_DEAD items. These LP_DEAD items + * were effectively assumed to be LP_UNUSED items in the making. It + * doesn't matter which heap pass (initial pass or final pass) ends up + * setting the page all-frozen, as long as the ongoing VACUUM does it. + * + * Now that freezing has been finalized, unset all_visible. It needs + * to reflect the present state of things, as expected by our caller. + */ + prunestate->all_visible = false; + } + + /* Finally, add page-local counts to whole-VACUUM counts */ + vacrel->tuples_deleted += tuples_deleted; + vacrel->tuples_frozen += tuples_frozen; + vacrel->lpdead_items += lpdead_items; + vacrel->live_tuples += live_tuples; + vacrel->recently_dead_tuples += recently_dead_tuples; +} + +/* + * lazy_scan_noprune() -- lazy_scan_prune() without pruning or freezing + * + * Caller need only hold a pin and share lock on the buffer, unlike + * lazy_scan_prune, which requires a full cleanup lock. While pruning isn't + * performed here, it's quite possible that an earlier opportunistic pruning + * operation left LP_DEAD items behind. We'll at least collect any such items + * in the dead_items array for removal from indexes. + * + * For aggressive VACUUM callers, we may return false to indicate that a full + * cleanup lock is required for processing by lazy_scan_prune. This is only + * necessary when the aggressive VACUUM needs to freeze some tuple XIDs from + * one or more tuples on the page. We always return true for non-aggressive + * callers. + * + * See lazy_scan_prune for an explanation of hastup return flag. + * recordfreespace flag instructs caller on whether or not it should do + * generic FSM processing for page. 
+ */ +static bool +lazy_scan_noprune(LVRelState *vacrel, + Buffer buf, + BlockNumber blkno, + Page page, + bool *hastup, + bool *recordfreespace) +{ + OffsetNumber offnum, + maxoff; + int lpdead_items, + live_tuples, + recently_dead_tuples, + missed_dead_tuples; + HeapTupleHeader tupleheader; + TransactionId NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid; + MultiXactId NoFreezePageRelminMxid = vacrel->NewRelminMxid; + OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; + + Assert(BufferGetBlockNumber(buf) == blkno); + + *hastup = false; /* for now */ + *recordfreespace = false; /* for now */ + + lpdead_items = 0; + live_tuples = 0; + recently_dead_tuples = 0; + missed_dead_tuples = 0; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData tuple; + + vacrel->offnum = offnum; + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(itemid)) + continue; + + if (ItemIdIsRedirected(itemid)) + { + *hastup = true; + continue; + } + + if (ItemIdIsDead(itemid)) + { + /* + * Deliberately don't set hastup=true here. See same point in + * lazy_scan_prune for an explanation. + */ + deadoffsets[lpdead_items++] = offnum; + continue; + } + + *hastup = true; /* page prevents rel truncation */ + tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); + if (tdeheap_tuple_should_freeze(tupleheader, &vacrel->cutoffs, + &NoFreezePageRelfrozenXid, + &NoFreezePageRelminMxid)) + { + /* Tuple with XID < FreezeLimit (or MXID < MultiXactCutoff) */ + if (vacrel->aggressive) + { + /* + * Aggressive VACUUMs must always be able to advance rel's + * relfrozenxid to a value >= FreezeLimit (and be able to + * advance rel's relminmxid to a value >= MultiXactCutoff). + * The ongoing aggressive VACUUM won't be able to do that + * unless it can freeze an XID (or MXID) from this tuple now. + * + * The only safe option is to have caller perform processing + * of this page using lazy_scan_prune. Caller might have to + * wait a while for a cleanup lock, but it can't be helped. + */ + vacrel->offnum = InvalidOffsetNumber; + return false; + } + + /* + * Non-aggressive VACUUMs are under no obligation to advance + * relfrozenxid (even by one XID). We can be much laxer here. + * + * Currently we always just accept an older final relfrozenxid + * and/or relminmxid value. We never make caller wait or work a + * little harder, even when it likely makes sense to do so. + */ + } + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(vacrel->rel); + + switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, + buf)) + { + case HEAPTUPLE_DELETE_IN_PROGRESS: + case HEAPTUPLE_LIVE: + + /* + * Count both cases as live, just like lazy_scan_prune + */ + live_tuples++; + + break; + case HEAPTUPLE_DEAD: + + /* + * There is some useful work for pruning to do, that won't be + * done due to failure to get a cleanup lock. 
+ */ + missed_dead_tuples++; + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * Count in recently_dead_tuples, just like lazy_scan_prune + */ + recently_dead_tuples++; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Do not count these rows as live, just like lazy_scan_prune + */ + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } + + vacrel->offnum = InvalidOffsetNumber; + + /* + * By here we know for sure that caller can put off freezing and pruning + * this particular page until the next VACUUM. Remember its details now. + * (lazy_scan_prune expects a clean slate, so we have to do this last.) + */ + vacrel->NewRelfrozenXid = NoFreezePageRelfrozenXid; + vacrel->NewRelminMxid = NoFreezePageRelminMxid; + + /* Save any LP_DEAD items found on the page in dead_items array */ + if (vacrel->nindexes == 0) + { + /* Using one-pass strategy (since table has no indexes) */ + if (lpdead_items > 0) + { + /* + * Perfunctory handling for the corner case where a single pass + * strategy VACUUM cannot get a cleanup lock, and it turns out + * that there is one or more LP_DEAD items: just count the LP_DEAD + * items as missed_dead_tuples instead. (This is a bit dishonest, + * but it beats having to maintain specialized heap vacuuming code + * forever, for vanishingly little benefit.) + */ + *hastup = true; + missed_dead_tuples += lpdead_items; + } + + *recordfreespace = true; + } + else if (lpdead_items == 0) + { + /* + * Won't be vacuuming this page later, so record page's freespace in + * the FSM now + */ + *recordfreespace = true; + } + else + { + VacDeadItems *dead_items = vacrel->dead_items; + ItemPointerData tmp; + + /* + * Page has LP_DEAD items, and so any references/TIDs that remain in + * indexes will be deleted during index vacuuming (and then marked + * LP_UNUSED in the heap) + */ + vacrel->lpdead_item_pages++; + + ItemPointerSetBlockNumber(&tmp, blkno); + + for (int i = 0; i < lpdead_items; i++) + { + ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); + dead_items->items[dead_items->num_items++] = tmp; + } + + Assert(dead_items->num_items <= dead_items->max_items); + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, + dead_items->num_items); + + vacrel->lpdead_items += lpdead_items; + + /* + * Assume that we'll go on to vacuum this heap page during final pass + * over the heap. Don't record free space until then. + */ + *recordfreespace = false; + } + + /* + * Finally, add relevant page-local counts to whole-VACUUM counts + */ + vacrel->live_tuples += live_tuples; + vacrel->recently_dead_tuples += recently_dead_tuples; + vacrel->missed_dead_tuples += missed_dead_tuples; + if (missed_dead_tuples > 0) + vacrel->missed_dead_pages++; + + /* Caller won't need to call lazy_scan_prune with same page */ + return true; +} + +/* + * Main entry point for index vacuuming and heap vacuuming. + * + * Removes items collected in dead_items from table's indexes, then marks the + * same items LP_UNUSED in the heap. See the comments above lazy_scan_heap + * for full details. + * + * Also empties dead_items, freeing up space for later TIDs. + * + * We may choose to bypass index vacuuming at this point, though only when the + * ongoing VACUUM operation will definitely only have one index scan/round of + * index vacuuming. 
+ */ +static void +lazy_vacuum(LVRelState *vacrel) +{ + bool bypass; + + /* Should not end up here with no indexes */ + Assert(vacrel->nindexes > 0); + Assert(vacrel->lpdead_item_pages > 0); + + if (!vacrel->do_index_vacuuming) + { + Assert(!vacrel->do_index_cleanup); + vacrel->dead_items->num_items = 0; + return; + } + + /* + * Consider bypassing index vacuuming (and heap vacuuming) entirely. + * + * We currently only do this in cases where the number of LP_DEAD items + * for the entire VACUUM operation is close to zero. This avoids sharp + * discontinuities in the duration and overhead of successive VACUUM + * operations that run against the same table with a fixed workload. + * Ideally, successive VACUUM operations will behave as if there are + * exactly zero LP_DEAD items in cases where there are close to zero. + * + * This is likely to be helpful with a table that is continually affected + * by UPDATEs that can mostly apply the HOT optimization, but occasionally + * have small aberrations that lead to just a few heap pages retaining + * only one or two LP_DEAD items. This is pretty common; even when the + * DBA goes out of their way to make UPDATEs use HOT, it is practically + * impossible to predict whether HOT will be applied in 100% of cases. + * It's far easier to ensure that 99%+ of all UPDATEs against a table use + * HOT through careful tuning. + */ + bypass = false; + if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0) + { + BlockNumber threshold; + + Assert(vacrel->num_index_scans == 0); + Assert(vacrel->lpdead_items == vacrel->dead_items->num_items); + Assert(vacrel->do_index_vacuuming); + Assert(vacrel->do_index_cleanup); + + /* + * This crossover point at which we'll start to do index vacuuming is + * expressed as a percentage of the total number of heap pages in the + * table that are known to have at least one LP_DEAD item. This is + * much more important than the total number of LP_DEAD items, since + * it's a proxy for the number of heap pages whose visibility map bits + * cannot be set on account of bypassing index and heap vacuuming. + * + * We apply one further precautionary test: the space currently used + * to store the TIDs (TIDs that now all point to LP_DEAD items) must + * not exceed 32MB. This limits the risk that we will bypass index + * vacuuming again and again until eventually there is a VACUUM whose + * dead_items space is not CPU cache resident. + * + * We don't take any special steps to remember the LP_DEAD items (such + * as counting them in our final update to the stats system) when the + * optimization is applied. Though the accounting used in analyze.c's + * acquire_sample_rows() will recognize the same LP_DEAD items as dead + * rows in its own stats report, that's okay. The discrepancy should + * be negligible. If this optimization is ever expanded to cover more + * cases then this may need to be reconsidered. + */ + threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES; + bypass = (vacrel->lpdead_item_pages < threshold && + vacrel->lpdead_items < MAXDEADITEMS(32L * 1024L * 1024L)); + } + + if (bypass) + { + /* + * There are almost zero TIDs. Behave as if there were precisely + * zero: bypass index vacuuming, but do index cleanup. + * + * We expect that the ongoing VACUUM operation will finish very + * quickly, so there is no point in considering speeding up as a + * failsafe against wraparound failure. (Index cleanup is expected to + * finish very quickly in cases where there were no ambulkdelete() + * calls.) 
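Roughly, the bypass test compares pages containing LP_DEAD items against a small fraction of the table and caps TID memory at 32MB. A back-of-the-envelope sketch under assumed values (the 2% crossover and 6-byte TID size are assumptions for illustration):

#include <stdbool.h>
#include <stdio.h>

#define BYPASS_FRACTION 0.02        /* assumed crossover: 2% of heap pages */
#define TID_BYTES       6           /* assumed on-disk TID size */

static bool
can_bypass_index_vacuum(unsigned rel_pages, unsigned lpdead_item_pages,
                        long long lpdead_items)
{
    double    page_threshold = rel_pages * BYPASS_FRACTION;
    long long tid_cap = (32LL * 1024 * 1024) / TID_BYTES;   /* ~5.6M TIDs */

    return lpdead_item_pages < page_threshold && lpdead_items < tid_cap;
}

int main(void)
{
    /* 1M-page table, 100 pages with a few LP_DEAD items, 500 TIDs in total */
    printf("bypass: %d\n", can_bypass_index_vacuum(1000000, 100, 500));
    return 0;
}
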
+ */ + vacrel->do_index_vacuuming = false; + } + else if (lazy_vacuum_all_indexes(vacrel)) + { + /* + * We successfully completed a round of index vacuuming. Do related + * heap vacuuming now. + */ + lazy_vacuum_tdeheap_rel(vacrel); + } + else + { + /* + * Failsafe case. + * + * We attempted index vacuuming, but didn't finish a full round/full + * index scan. This happens when relfrozenxid or relminmxid is too + * far in the past. + * + * From this point on the VACUUM operation will do no further index + * vacuuming or heap vacuuming. This VACUUM operation won't end up + * back here again. + */ + Assert(VacuumFailsafeActive); + } + + /* + * Forget the LP_DEAD items that we just vacuumed (or just decided to not + * vacuum) + */ + vacrel->dead_items->num_items = 0; +} + +/* + * lazy_vacuum_all_indexes() -- Main entry for index vacuuming + * + * Returns true in the common case when all indexes were successfully + * vacuumed. Returns false in rare cases where we determined that the ongoing + * VACUUM operation is at risk of taking too long to finish, leading to + * wraparound failure. + */ +static bool +lazy_vacuum_all_indexes(LVRelState *vacrel) +{ + bool allindexes = true; + double old_live_tuples = vacrel->rel->rd_rel->reltuples; + + Assert(vacrel->nindexes > 0); + Assert(vacrel->do_index_vacuuming); + Assert(vacrel->do_index_cleanup); + + /* Precheck for XID wraparound emergencies */ + if (lazy_check_wraparound_failsafe(vacrel)) + { + /* Wraparound emergency -- don't even start an index scan */ + return false; + } + + /* Report that we are now vacuuming indexes */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_VACUUM_INDEX); + + if (!ParallelVacuumIsActive(vacrel)) + { + for (int idx = 0; idx < vacrel->nindexes; idx++) + { + Relation indrel = vacrel->indrels[idx]; + IndexBulkDeleteResult *istat = vacrel->indstats[idx]; + + vacrel->indstats[idx] = lazy_vacuum_one_index(indrel, istat, + old_live_tuples, + vacrel); + + if (lazy_check_wraparound_failsafe(vacrel)) + { + /* Wraparound emergency -- end current index scan */ + allindexes = false; + break; + } + } + } + else + { + /* Outsource everything to parallel variant */ + parallel_vacuum_bulkdel_all_indexes(vacrel->pvs, old_live_tuples, + vacrel->num_index_scans); + + /* + * Do a postcheck to consider applying wraparound failsafe now. Note + * that parallel VACUUM only gets the precheck and this postcheck. + */ + if (lazy_check_wraparound_failsafe(vacrel)) + allindexes = false; + } + + /* + * We delete all LP_DEAD items from the first heap pass in all indexes on + * each call here (except calls where we choose to do the failsafe). This + * makes the next call to lazy_vacuum_tdeheap_rel() safe (except in the event + * of the failsafe triggering, which prevents the next call from taking + * place). + */ + Assert(vacrel->num_index_scans > 0 || + vacrel->dead_items->num_items == vacrel->lpdead_items); + Assert(allindexes || VacuumFailsafeActive); + + /* + * Increase and report the number of index scans. + * + * We deliberately include the case where we started a round of bulk + * deletes that we weren't able to finish due to the failsafe triggering. + */ + vacrel->num_index_scans++; + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS, + vacrel->num_index_scans); + + return allindexes; +} + +/* + * lazy_vacuum_tdeheap_rel() -- second pass over the heap for two pass strategy + * + * This routine marks LP_DEAD items in vacrel->dead_items array as LP_UNUSED. 
+ * Pages that never had lazy_scan_prune record LP_DEAD items are not visited + * at all. + * + * We may also be able to truncate the line pointer array of the heap pages we + * visit. If there is a contiguous group of LP_UNUSED items at the end of the + * array, it can be reclaimed as free space. These LP_UNUSED items usually + * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from + * each page to LP_UNUSED, and then consider if it's possible to truncate the + * page's line pointer array). + * + * Note: the reason for doing this as a second pass is we cannot remove the + * tuples until we've removed their index entries, and we want to process + * index entry removal in batches as large as possible. + */ +static void +lazy_vacuum_tdeheap_rel(LVRelState *vacrel) +{ + int index = 0; + BlockNumber vacuumed_pages = 0; + Buffer vmbuffer = InvalidBuffer; + LVSavedErrInfo saved_err_info; + + Assert(vacrel->do_index_vacuuming); + Assert(vacrel->do_index_cleanup); + Assert(vacrel->num_index_scans > 0); + + /* Report that we are now vacuuming the heap */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_VACUUM_HEAP); + + /* Update error traceback information */ + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_VACUUM_HEAP, + InvalidBlockNumber, InvalidOffsetNumber); + + while (index < vacrel->dead_items->num_items) + { + BlockNumber blkno; + Buffer buf; + Page page; + Size freespace; + + vacuum_delay_point(); + + blkno = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); + vacrel->blkno = blkno; + + /* + * Pin the visibility map page in case we need to mark the page + * all-visible. In most cases this will be very cheap, because we'll + * already have the correct page pinned anyway. + */ + tdeheap_visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + + /* We need a non-cleanup exclusive lock to mark dead_items unused */ + buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + vacrel->bstrategy); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + index = lazy_vacuum_tdeheap_page(vacrel, blkno, buf, index, vmbuffer); + + /* Now that we've vacuumed the page, record its available space */ + page = BufferGetPage(buf); + freespace = PageGetHeapFreeSpace(page); + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + vacuumed_pages++; + } + + vacrel->blkno = InvalidBlockNumber; + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + + /* + * We set all LP_DEAD items from the first heap pass to LP_UNUSED during + * the second heap pass. No more, no less. + */ + Assert(index > 0); + Assert(vacrel->num_index_scans > 1 || + (index == vacrel->lpdead_items && + vacuumed_pages == vacrel->lpdead_item_pages)); + + ereport(DEBUG2, + (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages", + vacrel->relname, (long long) index, vacuumed_pages))); + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); +} + +/* + * lazy_vacuum_tdeheap_page() -- free page's LP_DEAD items listed in the + * vacrel->dead_items array. + * + * Caller must have an exclusive buffer lock on the buffer (though a full + * cleanup lock is also acceptable). vmbuffer must be valid and already have + * a pin on blkno's visibility map page. + * + * index is an offset into the vacrel->dead_items array for the first listed + * LP_DEAD item on the page. 
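The per-page loop referenced here relies on dead_items being sorted by block number: it consumes one block's offsets and returns the index of the first TID belonging to the next block. A self-contained sketch of that iteration pattern over a plain (block, offset) array:

#include <stdio.h>

typedef struct
{
    unsigned block;
    unsigned offset;
} Tid;

/* Process all TIDs for 'block' starting at 'index'; return the index just
 * past them, the same contract as the real per-page routine. */
static int
consume_block(const Tid *items, int nitems, int index, unsigned block)
{
    for (; index < nitems; index++)
    {
        if (items[index].block != block)
            break;              /* past end of tuples for this block */
        printf("  mark (%u,%u) unused\n",
               items[index].block, items[index].offset);
    }
    return index;
}

int main(void)
{
    Tid items[] = {{3, 1}, {3, 7}, {3, 9}, {8, 2}, {8, 5}};
    int nitems = 5, index = 0;

    while (index < nitems)
    {
        unsigned block = items[index].block;

        printf("block %u:\n", block);
        index = consume_block(items, nitems, index, block);
    }
    return 0;
}
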
The return value is the first index immediately + * after all LP_DEAD items for the same page in the array. + */ +static int +lazy_vacuum_tdeheap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, + int index, Buffer vmbuffer) +{ + VacDeadItems *dead_items = vacrel->dead_items; + Page page = BufferGetPage(buffer); + OffsetNumber unused[MaxHeapTuplesPerPage]; + int nunused = 0; + TransactionId visibility_cutoff_xid; + bool all_frozen; + LVSavedErrInfo saved_err_info; + + Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming); + + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + + /* Update error traceback information */ + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno, + InvalidOffsetNumber); + + START_CRIT_SECTION(); + + for (; index < dead_items->num_items; index++) + { + BlockNumber tblk; + OffsetNumber toff; + ItemId itemid; + + tblk = ItemPointerGetBlockNumber(&dead_items->items[index]); + if (tblk != blkno) + break; /* past end of tuples for this block */ + toff = ItemPointerGetOffsetNumber(&dead_items->items[index]); + itemid = PageGetItemId(page, toff); + + Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid)); + ItemIdSetUnused(itemid); + unused[nunused++] = toff; + } + + Assert(nunused > 0); + + /* Attempt to truncate line pointer array now */ + PageTruncateLinePointerArray(page); + + /* + * Mark buffer dirty before we write WAL. + */ + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(vacrel->rel)) + { + xl_tdeheap_vacuum xlrec; + XLogRecPtr recptr; + + xlrec.nunused = nunused; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) unused, nunused * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM); + + PageSetLSN(page, recptr); + } + + /* + * End critical section, so we safely can do visibility tests (which + * possibly need to perform IO and allocate memory!). If we crash now the + * page (including the corresponding vm bit) might not be marked all + * visible, but that's fine. A later vacuum will fix that. + */ + END_CRIT_SECTION(); + + /* + * Now that we have removed the LP_DEAD items from the page, once again + * check if the page has become all-visible. The page is already marked + * dirty, exclusively locked, and, if needed, a full page image has been + * emitted. + */ + Assert(!PageIsAllVisible(page)); + if (tdeheap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid, + &all_frozen)) + { + uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + + if (all_frozen) + { + Assert(!TransactionIdIsValid(visibility_cutoff_xid)); + flags |= VISIBILITYMAP_ALL_FROZEN; + } + + PageSetAllVisible(page); + tdeheap_visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid, flags); + } + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); + return index; +} + +/* + * Trigger the failsafe to avoid wraparound failure when vacrel table has a + * relfrozenxid and/or relminmxid that is dangerously far in the past. + * Triggering the failsafe makes the ongoing VACUUM bypass any further index + * vacuuming and heap vacuuming. Truncating the heap is also bypassed. + * + * Any remaining work (work that VACUUM cannot just bypass) is typically sped + * up when the failsafe triggers. VACUUM stops applying any cost-based delay + * that it started out with. 
+ * + * Returns true when failsafe has been triggered. + */ +static bool +lazy_check_wraparound_failsafe(LVRelState *vacrel) +{ + /* Don't warn more than once per VACUUM */ + if (VacuumFailsafeActive) + return true; + + if (unlikely(vacuum_xid_failsafe_check(&vacrel->cutoffs))) + { + VacuumFailsafeActive = true; + + /* + * Abandon use of a buffer access strategy to allow use of all of + * shared buffers. We assume the caller who allocated the memory for + * the BufferAccessStrategy will free it. + */ + vacrel->bstrategy = NULL; + + /* Disable index vacuuming, index cleanup, and heap rel truncation */ + vacrel->do_index_vacuuming = false; + vacrel->do_index_cleanup = false; + vacrel->do_rel_truncate = false; + + ereport(WARNING, + (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans", + vacrel->dbname, vacrel->relnamespace, vacrel->relname, + vacrel->num_index_scans), + errdetail("The table's relfrozenxid or relminmxid is too far in the past."), + errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n" + "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs."))); + + /* Stop applying cost limits from this point on */ + VacuumCostActive = false; + VacuumCostBalance = 0; + + return true; + } + + return false; +} + +/* + * lazy_cleanup_all_indexes() -- cleanup all indexes of relation. + */ +static void +lazy_cleanup_all_indexes(LVRelState *vacrel) +{ + double reltuples = vacrel->new_rel_tuples; + bool estimated_count = vacrel->scanned_pages < vacrel->rel_pages; + + Assert(vacrel->do_index_cleanup); + Assert(vacrel->nindexes > 0); + + /* Report that we are now cleaning up indexes */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_INDEX_CLEANUP); + + if (!ParallelVacuumIsActive(vacrel)) + { + for (int idx = 0; idx < vacrel->nindexes; idx++) + { + Relation indrel = vacrel->indrels[idx]; + IndexBulkDeleteResult *istat = vacrel->indstats[idx]; + + vacrel->indstats[idx] = + lazy_cleanup_one_index(indrel, istat, reltuples, + estimated_count, vacrel); + } + } + else + { + /* Outsource everything to parallel variant */ + parallel_vacuum_cleanup_all_indexes(vacrel->pvs, reltuples, + vacrel->num_index_scans, + estimated_count); + } +} + +/* + * lazy_vacuum_one_index() -- vacuum index relation. + * + * Delete all the index tuples containing a TID collected in + * vacrel->dead_items array. Also update running statistics. + * Exact details depend on index AM's ambulkdelete routine. + * + * reltuples is the number of heap tuples to be passed to the + * bulkdelete callback. It's always assumed to be estimated. + * See indexam.sgml for more info. + * + * Returns bulk delete stats derived from input stats + */ +static IndexBulkDeleteResult * +lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat, + double reltuples, LVRelState *vacrel) +{ + IndexVacuumInfo ivinfo; + LVSavedErrInfo saved_err_info; + + ivinfo.index = indrel; + ivinfo.heaprel = vacrel->rel; + ivinfo.analyze_only = false; + ivinfo.report_progress = false; + ivinfo.estimated_count = true; + ivinfo.message_level = DEBUG2; + ivinfo.num_heap_tuples = reltuples; + ivinfo.strategy = vacrel->bstrategy; + + /* + * Update error traceback information. + * + * The index name is saved during this phase and restored immediately + * after this phase. See vacuum_error_callback. 
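The update/restore pair wrapped around each index pass is a simple save-and-restore of error-context state. A generic miniature of the pattern; the struct and function names are invented for the sketch:

#include <stdio.h>

typedef struct
{
    const char *phase;
    unsigned    blkno;
} ErrInfo;

static ErrInfo current = {"scan heap", 42};

static void
push_err_info(ErrInfo *saved, const char *phase, unsigned blkno)
{
    *saved = current;           /* remember the outer phase for later */
    current.phase = phase;
    current.blkno = blkno;
}

static void
pop_err_info(const ErrInfo *saved)
{
    current = *saved;           /* revert to the previous phase */
}

int main(void)
{
    ErrInfo saved;

    push_err_info(&saved, "vacuum index", 0);
    printf("error context now: %s\n", current.phase);
    pop_err_info(&saved);
    printf("restored context: %s, block %u\n", current.phase, current.blkno);
    return 0;
}
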
+ */ + Assert(vacrel->indname == NULL); + vacrel->indname = pstrdup(RelationGetRelationName(indrel)); + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_VACUUM_INDEX, + InvalidBlockNumber, InvalidOffsetNumber); + + /* Do bulk deletion */ + istat = vac_bulkdel_one_index(&ivinfo, istat, (void *) vacrel->dead_items); + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); + pfree(vacrel->indname); + vacrel->indname = NULL; + + return istat; +} + +/* + * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation. + * + * Calls index AM's amvacuumcleanup routine. reltuples is the number + * of heap tuples and estimated_count is true if reltuples is an + * estimated value. See indexam.sgml for more info. + * + * Returns bulk delete stats derived from input stats + */ +static IndexBulkDeleteResult * +lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat, + double reltuples, bool estimated_count, + LVRelState *vacrel) +{ + IndexVacuumInfo ivinfo; + LVSavedErrInfo saved_err_info; + + ivinfo.index = indrel; + ivinfo.heaprel = vacrel->rel; + ivinfo.analyze_only = false; + ivinfo.report_progress = false; + ivinfo.estimated_count = estimated_count; + ivinfo.message_level = DEBUG2; + + ivinfo.num_heap_tuples = reltuples; + ivinfo.strategy = vacrel->bstrategy; + + /* + * Update error traceback information. + * + * The index name is saved during this phase and restored immediately + * after this phase. See vacuum_error_callback. + */ + Assert(vacrel->indname == NULL); + vacrel->indname = pstrdup(RelationGetRelationName(indrel)); + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_INDEX_CLEANUP, + InvalidBlockNumber, InvalidOffsetNumber); + + istat = vac_cleanup_one_index(&ivinfo, istat); + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); + pfree(vacrel->indname); + vacrel->indname = NULL; + + return istat; +} + +/* + * should_attempt_truncation - should we attempt to truncate the heap? + * + * Don't even think about it unless we have a shot at releasing a goodly + * number of pages. Otherwise, the time taken isn't worth it, mainly because + * an AccessExclusive lock must be replayed on any hot standby, where it can + * be particularly disruptive. + * + * Also don't attempt it if wraparound failsafe is in effect. The entire + * system might be refusing to allocate new XIDs at this point. The system + * definitely won't return to normal unless and until VACUUM actually advances + * the oldest relfrozenxid -- which hasn't happened for target rel just yet. + * If lazy_truncate_heap attempted to acquire an AccessExclusiveLock to + * truncate the table under these circumstances, an XID exhaustion error might + * make it impossible for VACUUM to fix the underlying XID exhaustion problem. + * There is very little chance of truncation working out when the failsafe is + * in effect in any case. lazy_scan_prune makes the optimistic assumption + * that any LP_DEAD items it encounters will always be LP_UNUSED by the time + * we're called. + * + * Also don't attempt it if we are doing early pruning/vacuuming, because a + * scan which cannot find a truncated heap page cannot determine that the + * snapshot is too old to read that page. 
+ */ +static bool +should_attempt_truncation(LVRelState *vacrel) +{ + BlockNumber possibly_freeable; + + if (!vacrel->do_rel_truncate || VacuumFailsafeActive || + old_snapshot_threshold >= 0) + return false; + + possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; + if (possibly_freeable > 0 && + (possibly_freeable >= REL_TRUNCATE_MINIMUM || + possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION)) + return true; + + return false; +} + +/* + * lazy_truncate_heap - try to truncate off any empty pages at the end + */ +static void +lazy_truncate_heap(LVRelState *vacrel) +{ + BlockNumber orig_rel_pages = vacrel->rel_pages; + BlockNumber new_rel_pages; + bool lock_waiter_detected; + int lock_retry; + + /* Report that we are now truncating */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_TRUNCATE); + + /* Update error traceback information one last time */ + update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE, + vacrel->nonempty_pages, InvalidOffsetNumber); + + /* + * Loop until no more truncating can be done. + */ + do + { + /* + * We need full exclusive lock on the relation in order to do + * truncation. If we can't get it, give up rather than waiting --- we + * don't want to block other backends, and we don't want to deadlock + * (which is quite possible considering we already hold a lower-grade + * lock). + */ + lock_waiter_detected = false; + lock_retry = 0; + while (true) + { + if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock)) + break; + + /* + * Check for interrupts while trying to (re-)acquire the exclusive + * lock. + */ + CHECK_FOR_INTERRUPTS(); + + if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / + VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) + { + /* + * We failed to establish the lock in the specified number of + * retries. This means we give up truncating. + */ + ereport(vacrel->verbose ? INFO : DEBUG2, + (errmsg("\"%s\": stopping truncate due to conflicting lock request", + vacrel->relname))); + return; + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL, + WAIT_EVENT_VACUUM_TRUNCATE); + ResetLatch(MyLatch); + } + + /* + * Now that we have exclusive lock, look to see if the rel has grown + * whilst we were vacuuming with non-exclusive lock. If so, give up; + * the newly added pages presumably contain non-deletable tuples. + */ + new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel); + if (new_rel_pages != orig_rel_pages) + { + /* + * Note: we intentionally don't update vacrel->rel_pages with the + * new rel size here. If we did, it would amount to assuming that + * the new pages are empty, which is unlikely. Leaving the numbers + * alone amounts to assuming that the new pages have the same + * tuple density as existing ones, which is less unlikely. + */ + UnlockRelation(vacrel->rel, AccessExclusiveLock); + return; + } + + /* + * Scan backwards from the end to verify that the end pages actually + * contain no tuples. This is *necessary*, not optional, because + * other backends could have added tuples to these pages whilst we + * were vacuuming. + */ + new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected); + vacrel->blkno = new_rel_pages; + + if (new_rel_pages >= orig_rel_pages) + { + /* can't do anything after all */ + UnlockRelation(vacrel->rel, AccessExclusiveLock); + return; + } + + /* + * Okay to truncate. 
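The eligibility test at the top of this truncation path amounts to "enough reclaimable tail pages to justify an AccessExclusiveLock". A tiny sketch with assumed cutoffs (1000 pages and one sixteenth of the table mirror the usual upstream defaults, but are hard-coded here as assumptions):

#include <stdbool.h>
#include <stdio.h>

#define TRUNCATE_MINIMUM  1000      /* assumed, mirrors upstream default */
#define TRUNCATE_FRACTION 16        /* assumed, mirrors upstream default */

static bool
worth_truncating(unsigned rel_pages, unsigned nonempty_pages,
                 bool failsafe_active)
{
    unsigned possibly_freeable = rel_pages - nonempty_pages;

    if (failsafe_active)
        return false;
    return possibly_freeable > 0 &&
           (possibly_freeable >= TRUNCATE_MINIMUM ||
            possibly_freeable >= rel_pages / TRUNCATE_FRACTION);
}

int main(void)
{
    printf("%d\n", worth_truncating(10000, 9990, false));  /* 10 tail pages -> 0 */
    printf("%d\n", worth_truncating(10000, 9000, false));  /* 1000 tail pages -> 1 */
    return 0;
}
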
+ */ + RelationTruncate(vacrel->rel, new_rel_pages); + + /* + * We can release the exclusive lock as soon as we have truncated. + * Other backends can't safely access the relation until they have + * processed the smgr invalidation that smgrtruncate sent out ... but + * that should happen as part of standard invalidation processing once + * they acquire lock on the relation. + */ + UnlockRelation(vacrel->rel, AccessExclusiveLock); + + /* + * Update statistics. Here, it *is* correct to adjust rel_pages + * without also touching reltuples, since the tuple count wasn't + * changed by the truncation. + */ + vacrel->removed_pages += orig_rel_pages - new_rel_pages; + vacrel->rel_pages = new_rel_pages; + + ereport(vacrel->verbose ? INFO : DEBUG2, + (errmsg("table \"%s\": truncated %u to %u pages", + vacrel->relname, + orig_rel_pages, new_rel_pages))); + orig_rel_pages = new_rel_pages; + } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected); +} + +/* + * Rescan end pages to verify that they are (still) empty of tuples. + * + * Returns number of nondeletable pages (last nonempty page + 1). + */ +static BlockNumber +count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) +{ + BlockNumber blkno; + BlockNumber prefetchedUntil; + instr_time starttime; + + /* Initialize the starttime if we check for conflicting lock requests */ + INSTR_TIME_SET_CURRENT(starttime); + + /* + * Start checking blocks at what we believe relation end to be and move + * backwards. (Strange coding of loop control is needed because blkno is + * unsigned.) To make the scan faster, we prefetch a few blocks at a time + * in forward direction, so that OS-level readahead can kick in. + */ + blkno = vacrel->rel_pages; + StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0, + "prefetch size must be power of 2"); + prefetchedUntil = InvalidBlockNumber; + while (blkno > vacrel->nonempty_pages) + { + Buffer buf; + Page page; + OffsetNumber offnum, + maxoff; + bool hastup; + + /* + * Check if another process requests a lock on our relation. We are + * holding an AccessExclusiveLock here, so they will be waiting. We + * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we + * only check if that interval has elapsed once every 32 blocks to + * keep the number of system calls and actual shared lock table + * lookups to a minimum. + */ + if ((blkno % 32) == 0) + { + instr_time currenttime; + instr_time elapsed; + + INSTR_TIME_SET_CURRENT(currenttime); + elapsed = currenttime; + INSTR_TIME_SUBTRACT(elapsed, starttime); + if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) + >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) + { + if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock)) + { + ereport(vacrel->verbose ? INFO : DEBUG2, + (errmsg("table \"%s\": suspending truncate due to conflicting lock request", + vacrel->relname))); + + *lock_waiter_detected = true; + return blkno; + } + starttime = currenttime; + } + } + + /* + * We don't insert a vacuum delay point here, because we have an + * exclusive lock on the table which we want to hold for as short a + * time as possible. We still need to check for interrupts however. + */ + CHECK_FOR_INTERRUPTS(); + + blkno--; + + /* If we haven't prefetched this lot yet, do so now. 
*/ + if (prefetchedUntil > blkno) + { + BlockNumber prefetchStart; + BlockNumber pblkno; + + prefetchStart = blkno & ~(PREFETCH_SIZE - 1); + for (pblkno = prefetchStart; pblkno <= blkno; pblkno++) + { + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno); + CHECK_FOR_INTERRUPTS(); + } + prefetchedUntil = prefetchStart; + } + + buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + vacrel->bstrategy); + + /* In this phase we only need shared access to the buffer */ + LockBuffer(buf, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buf); + + if (PageIsNew(page) || PageIsEmpty(page)) + { + UnlockReleaseBuffer(buf); + continue; + } + + hastup = false; + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + /* + * Note: any non-unused item should be taken as a reason to keep + * this page. Even an LP_DEAD item makes truncation unsafe, since + * we must not have cleaned out its index entries. + */ + if (ItemIdIsUsed(itemid)) + { + hastup = true; + break; /* can stop scanning */ + } + } /* scan along page */ + + UnlockReleaseBuffer(buf); + + /* Done scanning if we found a tuple here */ + if (hastup) + return blkno + 1; + } + + /* + * If we fall out of the loop, all the previously-thought-to-be-empty + * pages still are; we need not bother to look at the last known-nonempty + * page. + */ + return vacrel->nonempty_pages; +} + +/* + * Returns the number of dead TIDs that VACUUM should allocate space to + * store, given a heap rel of size vacrel->rel_pages, and given current + * maintenance_work_mem setting (or current autovacuum_work_mem setting, + * when applicable). + * + * See the comments at the head of this file for rationale. + */ +static int +dead_items_max_items(LVRelState *vacrel) +{ + int64 max_items; + int vac_work_mem = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; + + if (vacrel->nindexes > 0) + { + BlockNumber rel_pages = vacrel->rel_pages; + + max_items = MAXDEADITEMS(vac_work_mem * 1024L); + max_items = Min(max_items, INT_MAX); + max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); + + /* curious coding here to ensure the multiplication can't overflow */ + if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages) + max_items = rel_pages * MaxHeapTuplesPerPage; + + /* stay sane if small maintenance_work_mem */ + max_items = Max(max_items, MaxHeapTuplesPerPage); + } + else + { + /* One-pass case only stores a single heap page's TIDs at a time */ + max_items = MaxHeapTuplesPerPage; + } + + return (int) max_items; +} + +/* + * Allocate dead_items (either using palloc, or in dynamic shared memory). + * Sets dead_items in vacrel for caller. + * + * Also handles parallel initialization as part of allocating dead_items in + * DSM when required. + */ +static void +dead_items_alloc(LVRelState *vacrel, int nworkers) +{ + VacDeadItems *dead_items; + int max_items; + + max_items = dead_items_max_items(vacrel); + Assert(max_items >= MaxHeapTuplesPerPage); + + /* + * Initialize state for a parallel vacuum. As of now, only one worker can + * be used for an index, so we invoke parallelism only if there are at + * least two indexes on a table. + */ + if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming) + { + /* + * Since parallel workers cannot access data in temporary tables, we + * can't perform parallel vacuum on them. 
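dead_items_max_items() above caps the TID array at rel_pages * MaxHeapTuplesPerPage without risking integer overflow, by comparing a quotient instead of forming the product. A minimal standalone sketch of that capping; 291 is an assumed stand-in for MaxHeapTuplesPerPage.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

#define MAX_TUPLES_PER_PAGE 291     /* assumed value */

static int64_t
cap_dead_items(int64_t max_items, uint32_t rel_pages)
{
    /* divide rather than multiply so nothing can overflow */
    if ((uint32_t) (max_items / MAX_TUPLES_PER_PAGE) > rel_pages)
        max_items = (int64_t) rel_pages * MAX_TUPLES_PER_PAGE;

    /* never size the array below a single page's worth of TIDs */
    if (max_items < MAX_TUPLES_PER_PAGE)
        max_items = MAX_TUPLES_PER_PAGE;

    return max_items;
}

int
main(void)
{
    /* huge memory budget, tiny table: capped to 10 pages' worth of TIDs */
    printf("%lld\n", (long long) cap_dead_items(100000000, 10));
    /* tiny memory budget: raised to one page's worth */
    printf("%lld\n", (long long) cap_dead_items(50, 10000));
    return 0;
}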
+ */ + if (RelationUsesLocalBuffers(vacrel->rel)) + { + /* + * Give warning only if the user explicitly tries to perform a + * parallel vacuum on the temporary table. + */ + if (nworkers > 0) + ereport(WARNING, + (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel", + vacrel->relname))); + } + else + vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels, + vacrel->nindexes, nworkers, + max_items, + vacrel->verbose ? INFO : DEBUG2, + vacrel->bstrategy); + + /* If parallel mode started, dead_items space is allocated in DSM */ + if (ParallelVacuumIsActive(vacrel)) + { + vacrel->dead_items = parallel_vacuum_get_dead_items(vacrel->pvs); + return; + } + } + + /* Serial VACUUM case */ + dead_items = (VacDeadItems *) palloc(vac_max_items_to_alloc_size(max_items)); + dead_items->max_items = max_items; + dead_items->num_items = 0; + + vacrel->dead_items = dead_items; +} + +/* + * Perform cleanup for resources allocated in dead_items_alloc + */ +static void +dead_items_cleanup(LVRelState *vacrel) +{ + if (!ParallelVacuumIsActive(vacrel)) + { + /* Don't bother with pfree here */ + return; + } + + /* End parallel mode */ + parallel_vacuum_end(vacrel->pvs, vacrel->indstats); + vacrel->pvs = NULL; +} + +/* + * Check if every tuple in the given page is visible to all current and future + * transactions. Also return the visibility_cutoff_xid which is the highest + * xmin amongst the visible tuples. Set *all_frozen to true if every tuple + * on this page is frozen. + * + * This is a stripped down version of lazy_scan_prune(). If you change + * anything here, make sure that everything stays in sync. Note that an + * assertion calls us to verify that everybody still agrees. Be sure to avoid + * introducing new side-effects here. + */ +static bool +tdeheap_page_is_all_visible(LVRelState *vacrel, Buffer buf, + TransactionId *visibility_cutoff_xid, + bool *all_frozen) +{ + Page page = BufferGetPage(buf); + BlockNumber blockno = BufferGetBlockNumber(buf); + OffsetNumber offnum, + maxoff; + bool all_visible = true; + + *visibility_cutoff_xid = InvalidTransactionId; + *all_frozen = true; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff && all_visible; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData tuple; + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + vacrel->offnum = offnum; + itemid = PageGetItemId(page, offnum); + + /* Unused or redirect line pointers are of no interest */ + if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid)) + continue; + + ItemPointerSet(&(tuple.t_self), blockno, offnum); + + /* + * Dead line pointers can have index pointers pointing to them. So + * they can't be treated as visible + */ + if (ItemIdIsDead(itemid)) + { + all_visible = false; + *all_frozen = false; + break; + } + + Assert(ItemIdIsNormal(itemid)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(vacrel->rel); + + switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, + buf)) + { + case HEAPTUPLE_LIVE: + { + TransactionId xmin; + + /* Check comments in lazy_scan_prune. */ + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + all_visible = false; + *all_frozen = false; + break; + } + + /* + * The inserter definitely committed. But is it old enough + * that everyone sees it as committed? 
+ */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, + vacrel->cutoffs.OldestXmin)) + { + all_visible = false; + *all_frozen = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, *visibility_cutoff_xid) && + TransactionIdIsNormal(xmin)) + *visibility_cutoff_xid = xmin; + + /* Check whether this tuple is already frozen or not */ + if (all_visible && *all_frozen && + tdeheap_tuple_needs_eventual_freeze(tuple.t_data)) + *all_frozen = false; + } + break; + + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_INSERT_IN_PROGRESS: + case HEAPTUPLE_DELETE_IN_PROGRESS: + { + all_visible = false; + *all_frozen = false; + break; + } + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } /* scan along page */ + + /* Clear the offset information once we have processed the given page. */ + vacrel->offnum = InvalidOffsetNumber; + + return all_visible; +} + +/* + * Update index statistics in pg_class if the statistics are accurate. + */ +static void +update_relstats_all_indexes(LVRelState *vacrel) +{ + Relation *indrels = vacrel->indrels; + int nindexes = vacrel->nindexes; + IndexBulkDeleteResult **indstats = vacrel->indstats; + + Assert(vacrel->do_index_cleanup); + + for (int idx = 0; idx < nindexes; idx++) + { + Relation indrel = indrels[idx]; + IndexBulkDeleteResult *istat = indstats[idx]; + + if (istat == NULL || istat->estimated_count) + continue; + + /* Update index statistics */ + vac_update_relstats(indrel, + istat->num_pages, + istat->num_index_tuples, + 0, + false, + InvalidTransactionId, + InvalidMultiXactId, + NULL, NULL, false); + } +} + +/* + * Error context callback for errors occurring during vacuum. The error + * context messages for index phases should match the messages set in parallel + * vacuum. If you change this function for those phases, change + * parallel_vacuum_error_callback() as well. 
+ */ +static void +vacuum_error_callback(void *arg) +{ + LVRelState *errinfo = arg; + + switch (errinfo->phase) + { + case VACUUM_ERRCB_PHASE_SCAN_HEAP: + if (BlockNumberIsValid(errinfo->blkno)) + { + if (OffsetNumberIsValid(errinfo->offnum)) + errcontext("while scanning block %u offset %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname); + else + errcontext("while scanning block %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->relnamespace, errinfo->relname); + } + else + errcontext("while scanning relation \"%s.%s\"", + errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_VACUUM_HEAP: + if (BlockNumberIsValid(errinfo->blkno)) + { + if (OffsetNumberIsValid(errinfo->offnum)) + errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname); + else + errcontext("while vacuuming block %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->relnamespace, errinfo->relname); + } + else + errcontext("while vacuuming relation \"%s.%s\"", + errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_VACUUM_INDEX: + errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"", + errinfo->indname, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_INDEX_CLEANUP: + errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"", + errinfo->indname, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_TRUNCATE: + if (BlockNumberIsValid(errinfo->blkno)) + errcontext("while truncating relation \"%s.%s\" to %u blocks", + errinfo->relnamespace, errinfo->relname, errinfo->blkno); + break; + + case VACUUM_ERRCB_PHASE_UNKNOWN: + default: + return; /* do nothing; the errinfo may not be + * initialized */ + } +} + +/* + * Updates the information required for vacuum error callback. This also saves + * the current information which can be later restored via restore_vacuum_error_info. + */ +static void +update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel, + int phase, BlockNumber blkno, OffsetNumber offnum) +{ + if (saved_vacrel) + { + saved_vacrel->offnum = vacrel->offnum; + saved_vacrel->blkno = vacrel->blkno; + saved_vacrel->phase = vacrel->phase; + } + + vacrel->blkno = blkno; + vacrel->offnum = offnum; + vacrel->phase = phase; +} + +/* + * Restores the vacuum information saved via a prior call to update_vacuum_error_info. 
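update_vacuum_error_info() and restore_vacuum_error_info() implement a simple save, override, restore protocol so that nested vacuum phases report the right block and offset in error context messages. A minimal standalone sketch of the same pattern, using invented types rather than the real LVRelState/LVSavedErrInfo:

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

typedef struct ErrInfo
{
    int         phase;
    unsigned    blkno;
} ErrInfo;

static void
update_err_info(ErrInfo *cur, ErrInfo *saved, int phase, unsigned blkno)
{
    if (saved)
        *saved = *cur;              /* remember where we were */
    cur->phase = phase;
    cur->blkno = blkno;
}

static void
restore_err_info(ErrInfo *cur, const ErrInfo *saved)
{
    *cur = *saved;
}

int
main(void)
{
    ErrInfo     cur = {1, 42};      /* pretend we're scanning block 42 */
    ErrInfo     saved;

    update_err_info(&cur, &saved, 2, 7);    /* temporarily report block 7 */
    printf("during: phase=%d blkno=%u\n", cur.phase, cur.blkno);

    restore_err_info(&cur, &saved);         /* back to the scan position */
    printf("after:  phase=%d blkno=%u\n", cur.phase, cur.blkno);
    return 0;
}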
+ */ +static void +restore_vacuum_error_info(LVRelState *vacrel, + const LVSavedErrInfo *saved_vacrel) +{ + vacrel->blkno = saved_vacrel->blkno; + vacrel->offnum = saved_vacrel->offnum; + vacrel->phase = saved_vacrel->phase; +} diff --git a/src16/access/pg_tde_visibilitymap.c b/src16/access/pg_tde_visibilitymap.c new file mode 100644 index 00000000..45e8d627 --- /dev/null +++ b/src16/access/pg_tde_visibilitymap.c @@ -0,0 +1,647 @@ +/*------------------------------------------------------------------------- + * + * visibilitymap.c + * bitmap for tracking visibility of heap tuples + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/visibilitymap.c + * + * INTERFACE ROUTINES + * tdeheap_visibilitymap_clear - clear bits for one page in the visibility map + * tdeheap_visibilitymap_pin - pin a map page for setting a bit + * tdeheap_visibilitymap_pin_ok - check whether correct map page is already pinned + * tdeheap_visibilitymap_set - set a bit in a previously pinned page + * tdeheap_visibilitymap_get_status - get status of bits + * tdeheap_visibilitymap_count - count number of bits set in visibility map + * tdeheap_visibilitymap_prepare_truncate - + * prepare for truncation of the visibility map + * + * NOTES + * + * The visibility map is a bitmap with two bits (all-visible and all-frozen) + * per heap page. A set all-visible bit means that all tuples on the page are + * known visible to all transactions, and therefore the page doesn't need to + * be vacuumed. A set all-frozen bit means that all tuples on the page are + * completely frozen, and therefore the page doesn't need to be vacuumed even + * if whole table scanning vacuum is required (e.g. anti-wraparound vacuum). + * The all-frozen bit must be set only when the page is already all-visible. + * + * The map is conservative in the sense that we make sure that whenever a bit + * is set, we know the condition is true, but if a bit is not set, it might or + * might not be true. + * + * Clearing visibility map bits is not separately WAL-logged. The callers + * must make sure that whenever a bit is cleared, the bit is cleared on WAL + * replay of the updating operation as well. + * + * When we *set* a visibility map during VACUUM, we must write WAL. This may + * seem counterintuitive, since the bit is basically a hint: if it is clear, + * it may still be the case that every tuple on the page is visible to all + * transactions; we just don't know that for certain. The difficulty is that + * there are two bits which are typically set together: the PD_ALL_VISIBLE bit + * on the page itself, and the visibility map bit. If a crash occurs after the + * visibility map page makes it to disk and before the updated heap page makes + * it to disk, redo must set the bit on the heap page. Otherwise, the next + * insert, update, or delete on the heap page will fail to realize that the + * visibility map bit must be cleared, possibly causing index-only scans to + * return wrong answers. + * + * VACUUM will normally skip pages for which the visibility map bit is set; + * such pages can't contain any dead tuples and therefore don't need vacuuming. + * + * LOCKING + * + * In heapam.c, whenever a page is modified so that not all tuples on the + * page are visible to everyone anymore, the corresponding bit in the + * visibility map is cleared. 
In order to be crash-safe, we need to do this + * while still holding a lock on the heap page and in the same critical + * section that logs the page modification. However, we don't want to hold + * the buffer lock over any I/O that may be required to read in the visibility + * map page. To avoid this, we examine the heap page before locking it; + * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map + * bit. Then, we lock the buffer. But this creates a race condition: there + * is a possibility that in the time it takes to lock the buffer, the + * PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the + * buffer, pin the visibility map page, and relock the buffer. This shouldn't + * happen often, because only VACUUM currently sets visibility map bits, + * and the race will only occur if VACUUM processes a given page at almost + * exactly the same time that someone tries to further modify it. + * + * To set a bit, you need to hold a lock on the heap page. That prevents + * the race condition where VACUUM sees that all tuples on the page are + * visible to everyone, but another backend modifies the page before VACUUM + * sets the bit in the visibility map. + * + * When a bit is set, the LSN of the visibility map page is updated to make + * sure that the visibility map update doesn't get written to disk before the + * WAL record of the changes that made it possible to set the bit is flushed. + * But when a bit is cleared, we don't have to do that because it's always + * safe to clear a bit in the map from correctness point of view. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/pg_tdeam_xlog.h" +#include "access/visibilitymap.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/inval.h" + + +/*#define TRACE_VISIBILITYMAP */ + +/* + * Size of the bitmap on each visibility map page, in bytes. There's no + * extra headers, so the whole page minus the standard page header is + * used for the bitmap. + */ +#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) + +/* Number of heap blocks we can represent in one byte */ +#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) + +/* Number of heap blocks we can represent in one visibility map page. */ +#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) + +/* Mapping from heap block number to the right bit in the visibility map */ +#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) +#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) +#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) + +/* Masks for counting subsets of bits in the visibility map. */ +#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each + * bit pair */ +#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each + * bit pair */ + +/* prototypes for internal routines */ +static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); +static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks); + + +/* + * tdeheap_visibilitymap_clear - clear specified bits for one page in visibility map + * + * You must pass a buffer containing the correct map page to this function. + * Call tdeheap_visibilitymap_pin first to pin the right one. This function doesn't do + * any I/O. 
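The HEAPBLK_TO_MAPBLOCK / HEAPBLK_TO_MAPBYTE / HEAPBLK_TO_OFFSET macros above reduce a heap block number to a (map page, byte, bit offset) triple. A minimal standalone sketch of that arithmetic; MAPSIZE is assumed to be 8168 bytes here (an 8 kB page minus a 24-byte header), the real value depends on BLCKSZ and the page header size.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define BITS_PER_HEAPBLOCK 2                    /* all-visible + all-frozen */
#define MAPSIZE            8168                 /* assumed, see lead-in */
#define HEAPBLOCKS_PER_BYTE (8 / BITS_PER_HEAPBLOCK)
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

int
main(void)
{
    unsigned    heapblk = 123456;

    unsigned    mapblock = heapblk / HEAPBLOCKS_PER_PAGE;
    unsigned    mapbyte = (heapblk % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE;
    unsigned    mapbit = (heapblk % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK;

    printf("heap block %u -> vm page %u, byte %u, bit offset %u\n",
           heapblk, mapblock, mapbyte, mapbit);
    return 0;
}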
Returns true if any bits have been cleared and false otherwise. + */ +bool +tdeheap_visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + int mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + uint8 mask = flags << mapOffset; + char *map; + bool cleared = false; + + /* Must never clear all_visible bit while leaving all_frozen bit set */ + Assert(flags & VISIBILITYMAP_VALID_BITS); + Assert(flags != VISIBILITYMAP_ALL_VISIBLE); + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + if (!BufferIsValid(vmbuf) || BufferGetBlockNumber(vmbuf) != mapBlock) + elog(ERROR, "wrong buffer passed to tdeheap_visibilitymap_clear"); + + LockBuffer(vmbuf, BUFFER_LOCK_EXCLUSIVE); + map = PageGetContents(BufferGetPage(vmbuf)); + + if (map[mapByte] & mask) + { + map[mapByte] &= ~mask; + + MarkBufferDirty(vmbuf); + cleared = true; + } + + LockBuffer(vmbuf, BUFFER_LOCK_UNLOCK); + + return cleared; +} + +/* + * tdeheap_visibilitymap_pin - pin a map page for setting a bit + * + * Setting a bit in the visibility map is a two-phase operation. First, call + * tdeheap_visibilitymap_pin, to pin the visibility map page containing the bit for + * the heap page. Because that can require I/O to read the map page, you + * shouldn't hold a lock on the heap page while doing that. Then, call + * tdeheap_visibilitymap_set to actually set the bit. + * + * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by + * an earlier call to tdeheap_visibilitymap_pin or tdeheap_visibilitymap_get_status on the same + * relation. On return, *vmbuf is a valid buffer with the map page containing + * the bit for heapBlk. + * + * If the page doesn't exist in the map file yet, it is extended. + */ +void +tdeheap_visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*vmbuf)) + { + if (BufferGetBlockNumber(*vmbuf) == mapBlock) + return; + + ReleaseBuffer(*vmbuf); + } + *vmbuf = vm_readbuf(rel, mapBlock, true); +} + +/* + * tdeheap_visibilitymap_pin_ok - do we already have the correct page pinned? + * + * On entry, vmbuf should be InvalidBuffer or a valid buffer returned by + * an earlier call to tdeheap_visibilitymap_pin or tdeheap_visibilitymap_get_status on the same + * relation. The return value indicates whether the buffer covers the + * given heapBlk. + */ +bool +tdeheap_visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + + return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock; +} + +/* + * tdeheap_visibilitymap_set - set bit(s) on a previously pinned page + * + * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, + * or InvalidXLogRecPtr in normal running. The VM page LSN is advanced to the + * one provided; in normal running, we generate a new XLOG record and set the + * page LSN to that value (though the heap page's LSN may *not* be updated; + * see below). cutoff_xid is the largest xmin on the page being marked + * all-visible; it is needed for Hot Standby, and can be InvalidTransactionId + * if the page contains no tuples. It can also be set to InvalidTransactionId + * when a page that is already all-visible is being marked all-frozen. 
+ * + * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling + * this function. Except in recovery, caller should also pass the heap + * buffer. When checksums are enabled and we're not in recovery, we must add + * the heap buffer to the WAL chain to protect it from being torn. + * + * You must pass a buffer containing the correct map page to this function. + * Call tdeheap_visibilitymap_pin first to pin the right one. This function doesn't do + * any I/O. + */ +void +tdeheap_visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, + XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, + uint8 flags) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + Page page; + uint8 *map; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); + Assert(InRecovery || PageIsAllVisible((Page) BufferGetPage(heapBuf))); + Assert((flags & VISIBILITYMAP_VALID_BITS) == flags); + + /* Must never set all_frozen bit without also setting all_visible bit */ + Assert(flags != VISIBILITYMAP_ALL_FROZEN); + + /* Check that we have the right heap page pinned, if present */ + if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) + elog(ERROR, "wrong heap buffer passed to tdeheap_visibilitymap_set"); + + /* Check that we have the right VM page pinned */ + if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) + elog(ERROR, "wrong VM buffer passed to tdeheap_visibilitymap_set"); + + page = BufferGetPage(vmBuf); + map = (uint8 *) PageGetContents(page); + LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); + + if (flags != (map[mapByte] >> mapOffset & VISIBILITYMAP_VALID_BITS)) + { + START_CRIT_SECTION(); + + map[mapByte] |= (flags << mapOffset); + MarkBufferDirty(vmBuf); + + if (RelationNeedsWAL(rel)) + { + if (XLogRecPtrIsInvalid(recptr)) + { + Assert(!InRecovery); + recptr = log_tdeheap_visible(rel, heapBuf, vmBuf, cutoff_xid, flags); + + /* + * If data checksums are enabled (or wal_log_hints=on), we + * need to protect the heap page from being torn. + * + * If not, then we must *not* update the heap page's LSN. In + * this case, the FPI for the heap page was omitted from the + * WAL record inserted above, so it would be incorrect to + * update the heap page's LSN. + */ + if (XLogHintBitIsNeeded()) + { + Page heapPage = BufferGetPage(heapBuf); + + PageSetLSN(heapPage, recptr); + } + } + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); +} + +/* + * tdeheap_visibilitymap_get_status - get status of bits + * + * Are all tuples on heapBlk visible to all or are marked frozen, according + * to the visibility map? + * + * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by an + * earlier call to tdeheap_visibilitymap_pin or tdeheap_visibilitymap_get_status on the same + * relation. On return, *vmbuf is a valid buffer with the map page containing + * the bit for heapBlk, or InvalidBuffer. The caller is responsible for + * releasing *vmbuf after it's done testing and setting bits. + * + * NOTE: This function is typically called without a lock on the heap page, + * so somebody else could change the bit just after we look at it. 
In fact, + * since we don't lock the visibility map page either, it's even possible that + * someone else could have changed the bit just before we look at it, but yet + * we might see the old value. It is the caller's responsibility to deal with + * all concurrency issues! + */ +uint8 +tdeheap_visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + char *map; + uint8 result; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*vmbuf)) + { + if (BufferGetBlockNumber(*vmbuf) != mapBlock) + { + ReleaseBuffer(*vmbuf); + *vmbuf = InvalidBuffer; + } + } + + if (!BufferIsValid(*vmbuf)) + { + *vmbuf = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(*vmbuf)) + return false; + } + + map = PageGetContents(BufferGetPage(*vmbuf)); + + /* + * A single byte read is atomic. There could be memory-ordering effects + * here, but for performance reasons we make it the caller's job to worry + * about that. + */ + result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS); + return result; +} + +/* + * tdeheap_visibilitymap_count - count number of bits set in visibility map + * + * Note: we ignore the possibility of race conditions when the table is being + * extended concurrently with the call. New pages added to the table aren't + * going to be marked all-visible or all-frozen, so they won't affect the result. + */ +void +tdeheap_visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen) +{ + BlockNumber mapBlock; + BlockNumber nvisible = 0; + BlockNumber nfrozen = 0; + + /* all_visible must be specified */ + Assert(all_visible); + + for (mapBlock = 0;; mapBlock++) + { + Buffer mapBuffer; + uint64 *map; + int i; + + /* + * Read till we fall off the end of the map. We assume that any extra + * bytes in the last page are zeroed, so we don't bother excluding + * them from the count. + */ + mapBuffer = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(mapBuffer)) + break; + + /* + * We choose not to lock the page, since the result is going to be + * immediately stale anyway if anyone is concurrently setting or + * clearing bits, and we only really need an approximate value. + */ + map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer)); + + StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0, + "unsupported MAPSIZE"); + if (all_frozen == NULL) + { + for (i = 0; i < MAPSIZE / sizeof(uint64); i++) + nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); + } + else + { + for (i = 0; i < MAPSIZE / sizeof(uint64); i++) + { + nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); + nfrozen += pg_popcount64(map[i] & FROZEN_MASK64); + } + } + + ReleaseBuffer(mapBuffer); + } + + *all_visible = nvisible; + if (all_frozen) + *all_frozen = nfrozen; +} + +/* + * tdeheap_visibilitymap_prepare_truncate - + * prepare for truncation of the visibility map + * + * nheapblocks is the new size of the heap. + * + * Return the number of blocks of new visibility map. + * If it's InvalidBlockNumber, there is nothing to truncate; + * otherwise the caller is responsible for calling smgrtruncate() + * to truncate the visibility map pages. 
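tdeheap_visibilitymap_count() above counts both bit kinds in one pass by masking the interleaved bit pairs and taking a popcount per 64-bit word. A minimal standalone sketch of that counting; __builtin_popcountll is used as a stand-in for pg_popcount64 and is a GCC/Clang builtin.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

#define VISIBLE_MASK64 UINT64_C(0x5555555555555555) /* lower bit of each pair */
#define FROZEN_MASK64  UINT64_C(0xaaaaaaaaaaaaaaaa) /* upper bit of each pair */

int
main(void)
{
    /* three heap blocks: block 0 visible, block 1 visible+frozen, block 2 clear */
    uint64_t    word = (UINT64_C(0x1) << 0) | (UINT64_C(0x3) << 2);

    int         nvisible = __builtin_popcountll(word & VISIBLE_MASK64);
    int         nfrozen = __builtin_popcountll(word & FROZEN_MASK64);

    printf("all-visible: %d, all-frozen: %d\n", nvisible, nfrozen);     /* 2, 1 */
    return 0;
}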
+ */ +BlockNumber +tdeheap_visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) +{ + BlockNumber newnblocks; + + /* last remaining block, byte, and bit */ + BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); + uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); + uint8 truncOffset = HEAPBLK_TO_OFFSET(nheapblocks); + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); +#endif + + /* + * If no visibility map has been created yet for this relation, there's + * nothing to truncate. + */ + if (!smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM)) + return InvalidBlockNumber; + + /* + * Unless the new size is exactly at a visibility map page boundary, the + * tail bits in the last remaining map page, representing truncated heap + * blocks, need to be cleared. This is not only tidy, but also necessary + * because we don't get a chance to clear the bits if the heap is extended + * again. + */ + if (truncByte != 0 || truncOffset != 0) + { + Buffer mapBuffer; + Page page; + char *map; + + newnblocks = truncBlock + 1; + + mapBuffer = vm_readbuf(rel, truncBlock, false); + if (!BufferIsValid(mapBuffer)) + { + /* nothing to do, the file was already smaller */ + return InvalidBlockNumber; + } + + page = BufferGetPage(mapBuffer); + map = PageGetContents(page); + + LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + /* Clear out the unwanted bytes. */ + MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); + + /*---- + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + *---- + */ + map[truncByte] &= (1 << truncOffset) - 1; + + /* + * Truncation of a relation is WAL-logged at a higher-level, and we + * will be called at WAL replay. But if checksums are enabled, we need + * to still write a WAL record to protect against a torn page, if the + * page is flushed to disk before the truncation WAL record. We cannot + * use MarkBufferDirtyHint here, because that will not dirty the page + * during recovery. + */ + MarkBufferDirty(mapBuffer); + if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) + log_newpage_buffer(mapBuffer, false); + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(mapBuffer); + } + else + newnblocks = truncBlock; + + if (smgrnblocks(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM) <= newnblocks) + { + /* nothing to do, the file was already smaller than requested size */ + return InvalidBlockNumber; + } + + return newnblocks; +} + +/* + * Read a visibility map page. + * + * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is + * true, the visibility map file is extended. + */ +static Buffer +vm_readbuf(Relation rel, BlockNumber blkno, bool extend) +{ + Buffer buf; + SMgrRelation reln; + + /* + * Caution: re-using this smgr pointer could fail if the relcache entry + * gets closed. It's safe as long as we only do smgr-level operations + * between here and the last use of the pointer. + */ + reln = RelationGetSmgr(rel); + + /* + * If we haven't cached the size of the visibility map fork yet, check it + * first. 
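The truncation path above zeroes whole bytes past the last surviving heap block and then masks the final partial byte with (1 << truncOffset) - 1. A minimal standalone sketch of that tail masking on a 4-byte toy map (byte and offset are computed directly here, rather than relative to a map page as in the real code):

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_HEAPBLOCK 2
#define HEAPBLOCKS_PER_BYTE (8 / BITS_PER_HEAPBLOCK)

int
main(void)
{
    uint8_t     map[4];
    unsigned    nheapblocks = 5;    /* keep bits for heap blocks 0..4 only */

    memset(map, 0xff, sizeof(map)); /* pretend every block was all-visible+frozen */

    unsigned    truncByte = nheapblocks / HEAPBLOCKS_PER_BYTE;
    unsigned    truncOffset = (nheapblocks % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK;

    /* clear whole bytes past the last surviving block ... */
    memset(&map[truncByte + 1], 0, sizeof(map) - (truncByte + 1));
    /* ... then mask off the unwanted bits of the last remaining byte */
    map[truncByte] &= (uint8_t) ((1 << truncOffset) - 1);

    for (size_t i = 0; i < sizeof(map); i++)
        printf("byte %zu: 0x%02x\n", i, map[i]);    /* 0xff, 0x03, 0x00, 0x00 */
    return 0;
}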
+ */ + if (reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber) + { + if (smgrexists(reln, VISIBILITYMAP_FORKNUM)) + smgrnblocks(reln, VISIBILITYMAP_FORKNUM); + else + reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0; + } + + /* + * For reading we use ZERO_ON_ERROR mode, and initialize the page if + * necessary. It's always safe to clear bits, so it's better to clear + * corrupt pages than error out. + * + * We use the same path below to initialize pages when extending the + * relation, as a concurrent extension can end up with vm_extend() + * returning an already-initialized page. + */ + if (blkno >= reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM]) + { + if (extend) + buf = vm_extend(rel, blkno + 1); + else + return InvalidBuffer; + } + else + buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + + /* + * Initializing the page when needed is trickier than it looks, because of + * the possibility of multiple backends doing this concurrently, and our + * desire to not uselessly take the buffer lock in the normal path where + * the page is OK. We must take the lock to initialize the page, so + * recheck page newness after we have the lock, in case someone else + * already did it. Also, because we initially check PageIsNew with no + * lock, it's possible to fall through and return the buffer while someone + * else is still initializing the page (i.e., we might see pd_upper as set + * but other page header fields are still zeroes). This is harmless for + * callers that will take a buffer lock themselves, but some callers + * inspect the page without any lock at all. The latter is OK only so + * long as it doesn't depend on the page header having correct contents. + * Current usage is safe because PageGetContents() does not require that. + */ + if (PageIsNew(BufferGetPage(buf))) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + if (PageIsNew(BufferGetPage(buf))) + PageInit(BufferGetPage(buf), BLCKSZ, 0); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + return buf; +} + +/* + * Ensure that the visibility map fork is at least vm_nblocks long, extending + * it if necessary with zeroed pages. + */ +static Buffer +vm_extend(Relation rel, BlockNumber vm_nblocks) +{ + Buffer buf; + + buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL, + EB_CREATE_FORK_IF_NEEDED | + EB_CLEAR_SIZE_CACHE, + vm_nblocks, + RBM_ZERO_ON_ERROR); + + /* + * Send a shared-inval message to force other backends to close any smgr + * references they may have for this rel, which we are about to change. + * This is a useful optimization because it means that backends don't have + * to keep checking for creation or extension of the file, which happens + * infrequently. 
+ */ + CacheInvalidateSmgr(RelationGetSmgr(rel)->smgr_rlocator); + + return buf; +} diff --git a/src16/access/pg_tdeam.c b/src16/access/pg_tdeam.c new file mode 100644 index 00000000..1d59ec7b --- /dev/null +++ b/src16/access/pg_tdeam.c @@ -0,0 +1,10247 @@ +/*------------------------------------------------------------------------- + * + * heapam.c + * heap access method code + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/heapam.c + * + * + * INTERFACE ROUTINES + * tdeheap_beginscan - begin relation scan + * tdeheap_rescan - restart a relation scan + * tdeheap_endscan - end relation scan + * tdeheap_getnext - retrieve next tuple in scan + * tdeheap_fetch - retrieve tuple with given tid + * tdeheap_insert - insert tuple into a relation + * tdeheap_multi_insert - insert multiple tuples into a relation + * tdeheap_delete - delete a tuple from a relation + * tdeheap_update - replace a tuple in a relation with another tuple + * + * NOTES + * This file contains the tdeheap_ routines which implement + * the POSTGRES heap access method used for all POSTGRES + * relations. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/pg_tdeam_xlog.h" +#include "access/heaptoast.h" +#include "access/hio.h" +#include "access/multixact.h" +#include "access/parallel.h" +#include "access/relscan.h" +#include "access/subtrans.h" +#include "access/syncscan.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/valid.h" +#include "access/visibilitymap.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/catalog.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "port/pg_bitutils.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/datum.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/relcache.h" +#include "utils/snapmgr.h" +#include "utils/spccache.h" + + +static HeapTuple tdeheap_prepare_insert(Relation relation, HeapTuple tup, + TransactionId xid, CommandId cid, int options); +static XLogRecPtr log_tdeheap_update(Relation reln, Buffer oldbuf, + Buffer newbuf, HeapTuple oldtup, + HeapTuple newtup, HeapTuple old_key_tuple, + bool all_visible_cleared, bool new_all_visible_cleared); +static Bitmapset *HeapDetermineColumnsInfo(Relation relation, + Bitmapset *interesting_cols, + Bitmapset *external_cols, + HeapTuple oldtup, HeapTuple newtup, + bool *has_external); +static bool tdeheap_acquire_tuplock(Relation relation, ItemPointer tid, + LockTupleMode mode, LockWaitPolicy wait_policy, + bool *have_tuple_lock); +static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, + uint16 old_infomask2, TransactionId add_to_xmax, + LockTupleMode mode, bool is_update, + TransactionId *result_xmax, uint16 *result_infomask, + uint16 *result_infomask2); +static TM_Result tdeheap_lock_updated_tuple(Relation rel, HeapTuple tuple, + ItemPointer ctid, TransactionId xid, + 
LockTupleMode mode); +static int tdeheap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, + xl_tdeheap_freeze_plan *plans_out, + OffsetNumber *offsets_out); +static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, + uint16 *new_infomask2); +static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, + uint16 t_infomask); +static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, + LockTupleMode lockmode, bool *current_is_member); +static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, + Relation rel, ItemPointer ctid, XLTW_Oper oper, + int *remaining); +static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, + uint16 infomask, Relation rel, int *remaining); +static void index_delete_sort(TM_IndexDeleteOp *delstate); +static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); +static XLogRecPtr log_tdeheap_new_cid(Relation relation, HeapTuple tup); +static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, + bool *copy); + + +/* + * Each tuple lock mode has a corresponding heavyweight lock, and one or two + * corresponding MultiXactStatuses (one to merely lock tuples, another one to + * update them). This table (and the macros below) helps us determine the + * heavyweight lock mode and MultiXactStatus values to use for any particular + * tuple lock strength. + * + * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock + * instead. + */ +static const struct +{ + LOCKMODE hwlock; + int lockstatus; + int updstatus; +} + + tupleLockExtraInfo[MaxLockTupleMode + 1] = +{ + { /* LockTupleKeyShare */ + AccessShareLock, + MultiXactStatusForKeyShare, + -1 /* KeyShare does not allow updating tuples */ + }, + { /* LockTupleShare */ + RowShareLock, + MultiXactStatusForShare, + -1 /* Share does not allow updating tuples */ + }, + { /* LockTupleNoKeyExclusive */ + ExclusiveLock, + MultiXactStatusForNoKeyUpdate, + MultiXactStatusNoKeyUpdate + }, + { /* LockTupleExclusive */ + AccessExclusiveLock, + MultiXactStatusForUpdate, + MultiXactStatusUpdate + } +}; + +/* Get the LOCKMODE for a given MultiXactStatus */ +#define LOCKMODE_from_mxstatus(status) \ + (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock) + +/* + * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. + * This is more readable than having every caller translate it to lock.h's + * LOCKMODE. 
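tupleLockExtraInfo above is a plain static lookup table keyed by tuple lock strength, so callers never translate strengths to heavyweight lock modes by hand. A minimal standalone sketch of the same idea; the names below are invented for the sketch and are not the real lock.h identifiers.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

typedef enum
{
    TUPLE_KEYSHARE,
    TUPLE_SHARE,
    TUPLE_NOKEYEXCL,
    TUPLE_EXCL
} TupleLockStrength;

static const struct
{
    const char *hwlock;         /* heavyweight lock taken for this strength */
    int         allows_update;  /* -1 means "cannot update under this lock" */
} lock_extra_info[] =
{
    [TUPLE_KEYSHARE]  = {"AccessShareLock", -1},
    [TUPLE_SHARE]     = {"RowShareLock", -1},
    [TUPLE_NOKEYEXCL] = {"ExclusiveLock", 1},
    [TUPLE_EXCL]      = {"AccessExclusiveLock", 1},
};

int
main(void)
{
    TupleLockStrength mode = TUPLE_NOKEYEXCL;

    printf("strength %d -> %s (update allowed: %d)\n",
           mode, lock_extra_info[mode].hwlock,
           lock_extra_info[mode].allows_update);
    return 0;
}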
+ */ +#define LockTupleTuplock(rel, tup, mode) \ + LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define UnlockTupleTuplock(rel, tup, mode) \ + UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define ConditionalLockTupleTuplock(rel, tup, mode) \ + ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) + +#ifdef USE_PREFETCH +/* + * tdeheap_index_delete_tuples and index_delete_prefetch_buffer use this + * structure to coordinate prefetching activity + */ +typedef struct +{ + BlockNumber cur_hblkno; + int next_item; + int ndeltids; + TM_IndexDelete *deltids; +} IndexDeletePrefetchState; +#endif + +/* tdeheap_index_delete_tuples bottom-up index deletion costing constants */ +#define BOTTOMUP_MAX_NBLOCKS 6 +#define BOTTOMUP_TOLERANCE_NBLOCKS 3 + +/* + * tdeheap_index_delete_tuples uses this when determining which heap blocks it + * must visit to help its bottom-up index deletion caller + */ +typedef struct IndexDeleteCounts +{ + int16 npromisingtids; /* Number of "promising" TIDs in group */ + int16 ntids; /* Number of TIDs in group */ + int16 ifirsttid; /* Offset to group's first deltid */ +} IndexDeleteCounts; + +/* + * This table maps tuple lock strength values for each particular + * MultiXactStatus value. + */ +static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = +{ + LockTupleKeyShare, /* ForKeyShare */ + LockTupleShare, /* ForShare */ + LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ + LockTupleExclusive, /* ForUpdate */ + LockTupleNoKeyExclusive, /* NoKeyUpdate */ + LockTupleExclusive /* Update */ +}; + +/* Get the LockTupleMode for a given MultiXactStatus */ +#define TUPLOCK_from_mxstatus(status) \ + (MultiXactStatusLock[(status)]) + +/* ---------------------------------------------------------------- + * heap support routines + * ---------------------------------------------------------------- + */ + +/* ---------------- + * initscan - scan code common to tdeheap_beginscan and tdeheap_rescan + * ---------------- + */ +static void +initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) +{ + ParallelBlockTableScanDesc bpscan = NULL; + bool allow_strat; + bool allow_sync; + + /* + * Determine the number of blocks we have to scan. + * + * It is sufficient to do this once at scan start, since any tuples added + * while the scan is in progress will be invisible to my snapshot anyway. + * (That is not true when using a non-MVCC snapshot. However, we couldn't + * guarantee to return tuples added after scan start anyway, since they + * might go into pages we already scanned. To guarantee consistent + * results for a non-MVCC snapshot, the caller must hold some higher-level + * lock that ensures the interesting tuple(s) won't change.) + */ + if (scan->rs_base.rs_parallel != NULL) + { + bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + scan->rs_nblocks = bpscan->phs_nblocks; + } + else + scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd); + + /* + * If the table is large relative to NBuffers, use a bulk-read access + * strategy and enable synchronized scanning (see syncscan.c). Although + * the thresholds for these features could be different, we make them the + * same so that there are only two behaviors to tune rather than four. + * (However, some callers need to be able to disable one or both of these + * behaviors, independently of the size of the table; also there is a GUC + * variable that can disable synchronized scanning.) 
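A minimal standalone sketch of the sizing rule described above and applied just below: bulk-read strategy and synchronized scans are only considered when the table exceeds a quarter of shared_buffers and does not use local buffers. NBUFFERS is an assumed stand-in for the real NBuffers setting.

/* Illustrative sketch only -- not part of the patch. */
#include <stdbool.h>
#include <stdio.h>

#define NBUFFERS 16384          /* assumed: e.g. 128 MB of 8 kB shared buffers */

static bool
use_bulkread_and_syncscan(unsigned rel_nblocks, bool uses_local_buffers)
{
    return !uses_local_buffers && rel_nblocks > NBUFFERS / 4;
}

int
main(void)
{
    printf("%d\n", use_bulkread_and_syncscan(100000, false));  /* 1: big table */
    printf("%d\n", use_bulkread_and_syncscan(1000, false));    /* 0: small table */
    printf("%d\n", use_bulkread_and_syncscan(100000, true));   /* 0: temp table */
    return 0;
}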
+ * + * Note that table_block_parallelscan_initialize has a very similar test; + * if you change this, consider changing that one, too. + */ + if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) && + scan->rs_nblocks > NBuffers / 4) + { + allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0; + allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0; + } + else + allow_strat = allow_sync = false; + + if (allow_strat) + { + /* During a rescan, keep the previous strategy object. */ + if (scan->rs_strategy == NULL) + scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD); + } + else + { + if (scan->rs_strategy != NULL) + FreeAccessStrategy(scan->rs_strategy); + scan->rs_strategy = NULL; + } + + if (scan->rs_base.rs_parallel != NULL) + { + /* For parallel scan, believe whatever ParallelTableScanDesc says. */ + if (scan->rs_base.rs_parallel->phs_syncscan) + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + } + else if (keep_startblock) + { + /* + * When rescanning, we want to keep the previous startblock setting, + * so that rewinding a cursor doesn't generate surprising results. + * Reset the active syncscan setting, though. + */ + if (allow_sync && synchronize_seqscans) + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + } + else if (allow_sync && synchronize_seqscans) + { + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks); + } + else + { + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + scan->rs_startblock = 0; + } + + scan->rs_numblocks = InvalidBlockNumber; + scan->rs_inited = false; + scan->rs_ctup.t_data = NULL; + ItemPointerSetInvalid(&scan->rs_ctup.t_self); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + + /* page-at-a-time fields are always invalid when not rs_inited */ + + /* + * copy the scan key, if appropriate + */ + if (key != NULL && scan->rs_base.rs_nkeys > 0) + memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData)); + + /* + * Currently, we only have a stats counter for sequential heap scans (but + * e.g for bitmap scans the underlying bitmap index scans will be counted, + * and for sample scans we update stats for tuple fetches). + */ + if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN) + pgstat_count_tdeheap_scan(scan->rs_base.rs_rd); +} + +/* + * tdeheap_setscanlimits - restrict range of a heapscan + * + * startBlk is the page to start at + * numBlks is number of pages to scan (InvalidBlockNumber means "all") + */ +void +tdeheap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + Assert(!scan->rs_inited); /* else too late to change */ + /* else rs_startblock is significant */ + Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC)); + + /* Check startBlk is valid (but allow case of zero blocks...) */ + Assert(startBlk == 0 || startBlk < scan->rs_nblocks); + + scan->rs_startblock = startBlk; + scan->rs_numblocks = numBlks; +} + +/* + * tdeheapgetpage - subroutine for tdeheapgettup() + * + * This routine reads and pins the specified page of the relation. + * In page-at-a-time mode it performs additional work, namely determining + * which tuples on the page are visible. 
+ */ +void +tdeheapgetpage(TableScanDesc sscan, BlockNumber block) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + Buffer buffer; + Snapshot snapshot; + Page page; + int lines; + int ntup; + OffsetNumber lineoff; + bool all_visible; + + Assert(block < scan->rs_nblocks); + + /* release previous scan buffer, if any */ + if (BufferIsValid(scan->rs_cbuf)) + { + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + } + + /* + * Be sure to check for interrupts at least once per page. Checks at + * higher code levels won't be able to stop a seqscan that encounters many + * pages' worth of consecutive dead tuples. + */ + CHECK_FOR_INTERRUPTS(); + + /* read page using selected strategy */ + scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, block, + RBM_NORMAL, scan->rs_strategy); + scan->rs_cblock = block; + + if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)) + return; + + buffer = scan->rs_cbuf; + snapshot = scan->rs_base.rs_snapshot; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + tdeheap_page_prune_opt(scan->rs_base.rs_rd, buffer); + + /* + * We must hold share lock on the buffer content while examining tuple + * visibility. Afterwards, however, the tuples we have found to be + * visible are guaranteed good as long as we hold the buffer pin. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, page); + lines = PageGetMaxOffsetNumber(page); + ntup = 0; + + /* + * If the all-visible flag indicates that all tuples on the page are + * visible to everyone, we can skip the per-tuple visibility tests. + * + * Note: In hot standby, a tuple that's already visible to all + * transactions on the primary might still be invisible to a read-only + * transaction in the standby. We partly handle this problem by tracking + * the minimum xmin of visible tuples as the cut-off XID while marking a + * page all-visible on the primary and WAL log that along with the + * visibility map SET operation. In hot standby, we wait for (or abort) + * all transactions that can potentially may not see one or more tuples on + * the page. That's how index-only scans work fine in hot standby. A + * crucial difference between index-only scans and heap scans is that the + * index-only scan completely relies on the visibility map where as heap + * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if + * the page-level flag can be trusted in the same way, because it might + * get propagated somehow without being explicitly WAL-logged, e.g. via a + * full page write. Until we can prove that beyond doubt, let's check each + * tuple for visibility the hard way. 
+ */ + all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; + + for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++) + { + ItemId lpp = PageGetItemId(page, lineoff); + HeapTupleData loctup; + bool valid; + + if (!ItemIdIsNormal(lpp)) + continue; + + loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); + loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp); + loctup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(loctup.t_self), block, lineoff); + + if (all_visible) + valid = true; + else + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + + HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, + &loctup, buffer, snapshot); + + if (valid) + scan->rs_vistuples[ntup++] = lineoff; + } + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(ntup <= MaxHeapTuplesPerPage); + scan->rs_ntuples = ntup; +} + +/* + * tdeheapgettup_initial_block - return the first BlockNumber to scan + * + * Returns InvalidBlockNumber when there are no blocks to scan. This can + * occur with empty tables and in parallel scans when parallel workers get all + * of the pages before we can get a chance to get our first page. + */ +static BlockNumber +tdeheapgettup_initial_block(HeapScanDesc scan, ScanDirection dir) +{ + Assert(!scan->rs_inited); + + /* When there are no pages to scan, return InvalidBlockNumber */ + if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0) + return InvalidBlockNumber; + + if (ScanDirectionIsForward(dir)) + { + /* serial scan */ + if (scan->rs_base.rs_parallel == NULL) + return scan->rs_startblock; + else + { + /* parallel scan */ + table_block_parallelscan_startblock_init(scan->rs_base.rs_rd, + scan->rs_parallelworkerdata, + (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel); + + /* may return InvalidBlockNumber if there are no more blocks */ + return table_block_parallelscan_nextpage(scan->rs_base.rs_rd, + scan->rs_parallelworkerdata, + (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel); + } + } + else + { + /* backward parallel scan not supported */ + Assert(scan->rs_base.rs_parallel == NULL); + + /* + * Disable reporting to syncscan logic in a backwards scan; it's not + * very likely anyone else is doing the same thing at the same time, + * and much more likely that we'll just bollix things for forward + * scanners. + */ + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + + /* + * Start from last page of the scan. Ensure we take into account + * rs_numblocks if it's been adjusted by tdeheap_setscanlimits(). + */ + if (scan->rs_numblocks != InvalidBlockNumber) + return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks; + + if (scan->rs_startblock > 0) + return scan->rs_startblock - 1; + + return scan->rs_nblocks - 1; + } +} + + +/* + * tdeheapgettup_start_page - helper function for tdeheapgettup() + * + * Return the next page to scan based on the scan->rs_cbuf and set *linesleft + * to the number of tuples on this page. Also set *lineoff to the first + * offset to scan with forward scans getting the first offset and backward + * getting the final offset on the page. 
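tdeheapgettup_initial_block() above picks the first block of a backward scan with a small piece of wraparound arithmetic. A minimal standalone sketch of that computation, with InvalidBlockNumber modelled as 0xFFFFFFFF:

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define INVALID_BLOCK ((unsigned) 0xFFFFFFFF)

static unsigned
backward_initial_block(unsigned nblocks, unsigned startblock, unsigned numblocks)
{
    /* last block of a limited range, wrapped around the relation size */
    if (numblocks != INVALID_BLOCK)
        return (startblock + numblocks - 1) % nblocks;

    /* otherwise the block just before the start position */
    if (startblock > 0)
        return startblock - 1;

    return nblocks - 1;
}

int
main(void)
{
    /* 10-block table, scan limited to 4 blocks starting at block 8:
     * the range is 8, 9, 0, 1, so a backward scan starts at block 1 */
    printf("%u\n", backward_initial_block(10, 8, 4));
    /* no limit, start position 0: backward scan starts at the last block */
    printf("%u\n", backward_initial_block(10, 0, INVALID_BLOCK));
    return 0;
}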
+ */ +static Page +tdeheapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, + OffsetNumber *lineoff) +{ + Page page; + + Assert(scan->rs_inited); + Assert(BufferIsValid(scan->rs_cbuf)); + + /* Caller is responsible for ensuring buffer is locked if needed */ + page = BufferGetPage(scan->rs_cbuf); + + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); + + *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1; + + if (ScanDirectionIsForward(dir)) + *lineoff = FirstOffsetNumber; + else + *lineoff = (OffsetNumber) (*linesleft); + + /* lineoff now references the physically previous or next tid */ + return page; +} + + +/* + * tdeheapgettup_continue_page - helper function for tdeheapgettup() + * + * Return the next page to scan based on the scan->rs_cbuf and set *linesleft + * to the number of tuples left to scan on this page. Also set *lineoff to + * the next offset to scan according to the ScanDirection in 'dir'. + */ +static inline Page +tdeheapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, + OffsetNumber *lineoff) +{ + Page page; + + Assert(scan->rs_inited); + Assert(BufferIsValid(scan->rs_cbuf)); + + /* Caller is responsible for ensuring buffer is locked if needed */ + page = BufferGetPage(scan->rs_cbuf); + + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); + + if (ScanDirectionIsForward(dir)) + { + *lineoff = OffsetNumberNext(scan->rs_coffset); + *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1; + } + else + { + /* + * The previous returned tuple may have been vacuumed since the + * previous scan when we use a non-MVCC snapshot, so we must + * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant + */ + *lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset)); + *linesleft = *lineoff; + } + + /* lineoff now references the physically previous or next tid */ + return page; +} + +/* + * tdeheapgettup_advance_block - helper for tdeheapgettup() and tdeheapgettup_pagemode() + * + * Given the current block number, the scan direction, and various information + * contained in the scan descriptor, calculate the BlockNumber to scan next + * and return it. If there are no further blocks to scan, return + * InvalidBlockNumber to indicate this fact to the caller. + * + * This should not be called to determine the initial block number -- only for + * subsequent blocks. + * + * This also adjusts rs_numblocks when a limit has been imposed by + * tdeheap_setscanlimits(). + */ +static inline BlockNumber +tdeheapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir) +{ + if (ScanDirectionIsForward(dir)) + { + if (scan->rs_base.rs_parallel == NULL) + { + block++; + + /* wrap back to the start of the heap */ + if (block >= scan->rs_nblocks) + block = 0; + + /* + * Report our new scan position for synchronization purposes. We + * don't do that when moving backwards, however. That would just + * mess up any other forward-moving scanners. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. 
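A minimal standalone sketch of the forward path of tdeheapgettup_advance_block(): advance, wrap past the end back to block 0, and stop once the scan returns to its starting block (the termination test follows just below).

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define INVALID_BLOCK ((unsigned) 0xFFFFFFFF)

static unsigned
advance_forward(unsigned block, unsigned nblocks, unsigned startblock)
{
    block++;
    if (block >= nblocks)
        block = 0;              /* wrap back to the start of the heap */

    if (block == startblock)
        return INVALID_BLOCK;   /* done: back where we started */

    return block;
}

int
main(void)
{
    /* 4-block table, scan started at block 2: visit 2, 3, 0, 1, then stop */
    unsigned    block = 2;

    do
    {
        printf("scanning block %u\n", block);
        block = advance_forward(block, 4, 2);
    } while (block != INVALID_BLOCK);
    return 0;
}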
+ */ + if (scan->rs_base.rs_flags & SO_ALLOW_SYNC) + ss_report_location(scan->rs_base.rs_rd, block); + + /* we're done if we're back at where we started */ + if (block == scan->rs_startblock) + return InvalidBlockNumber; + + /* check if the limit imposed by tdeheap_setscanlimits() is met */ + if (scan->rs_numblocks != InvalidBlockNumber) + { + if (--scan->rs_numblocks == 0) + return InvalidBlockNumber; + } + + return block; + } + else + { + return table_block_parallelscan_nextpage(scan->rs_base.rs_rd, + scan->rs_parallelworkerdata, (ParallelBlockTableScanDesc) + scan->rs_base.rs_parallel); + } + } + else + { + /* we're done if the last block is the start position */ + if (block == scan->rs_startblock) + return InvalidBlockNumber; + + /* check if the limit imposed by tdeheap_setscanlimits() is met */ + if (scan->rs_numblocks != InvalidBlockNumber) + { + if (--scan->rs_numblocks == 0) + return InvalidBlockNumber; + } + + /* wrap to the end of the heap when the last page was page 0 */ + if (block == 0) + block = scan->rs_nblocks; + + block--; + + return block; + } +} + +/* ---------------- + * tdeheapgettup - fetch next heap tuple + * + * Initialize the scan if not already done; then advance to the next + * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup, + * or set scan->rs_ctup.t_data = NULL if no more tuples. + * + * Note: the reason nkeys/key are passed separately, even though they are + * kept in the scan descriptor, is that the caller may not want us to check + * the scankeys. + * + * Note: when we fall off the end of the scan in either direction, we + * reset rs_inited. This means that a further request with the same + * scan direction will restart the scan, which is a bit odd, but a + * request with the opposite scan direction will start a fresh scan + * in the proper direction. The latter is required behavior for cursors, + * while the former case is generally undefined behavior in Postgres + * so we don't care too much. + * ---------------- + */ +static void +tdeheapgettup(HeapScanDesc scan, + ScanDirection dir, + int nkeys, + ScanKey key) +{ + HeapTuple tuple = &(scan->rs_ctup); + BlockNumber block; + Page page; + OffsetNumber lineoff; + int linesleft; + + if (unlikely(!scan->rs_inited)) + { + block = tdeheapgettup_initial_block(scan, dir); + /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */ + Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf)); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + block = scan->rs_cblock; + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + page = tdeheapgettup_continue_page(scan, dir, &linesleft, &lineoff); + goto continue_page; + } + + /* + * advance the scan until we find a qualifying tuple or run out of stuff + * to scan + */ + while (block != InvalidBlockNumber) + { + tdeheapgetpage((TableScanDesc) scan, block); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + page = tdeheapgettup_start_page(scan, dir, &linesleft, &lineoff); +continue_page: + + /* + * Only continue scanning the page while we have lines left. + * + * Note that this protects us from accessing line pointers past + * PageGetMaxOffsetNumber(); both for forward scans when we resume the + * table scan, and for when we start scanning a new page. 
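+		 *
+		 * For example (illustrative numbers only): on a page whose max
+		 * offset is 100, a fresh forward scan starts with lineoff = 1 and
+		 * linesleft = 100, while resuming after having returned offset 42
+		 * gives lineoff = 43 and linesleft = 100 - 43 + 1 = 58.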
+ */ + for (; linesleft > 0; linesleft--, lineoff += dir) + { + bool visible; + ItemId lpp = PageGetItemId(page, lineoff); + + if (!ItemIdIsNormal(lpp)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp); + tuple->t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(tuple->t_self), block, lineoff); + + visible = HeapTupleSatisfiesVisibility(tuple, + scan->rs_base.rs_snapshot, + scan->rs_cbuf); + + HeapCheckForSerializableConflictOut(visible, scan->rs_base.rs_rd, + tuple, scan->rs_cbuf, + scan->rs_base.rs_snapshot); + + /* skip tuples not visible to this snapshot */ + if (!visible) + continue; + + /* skip any tuples that don't match the scan key */ + if (key != NULL && + !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), + nkeys, key)) + continue; + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + scan->rs_coffset = lineoff; + return; + } + + /* + * if we get here, it means we've exhausted the items on this page and + * it's time to move to the next. + */ + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + /* get the BlockNumber to scan next */ + block = tdeheapgettup_advance_block(scan, block, dir); + } + + /* end of scan */ + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; +} + +/* ---------------- + * tdeheapgettup_pagemode - fetch next heap tuple in page-at-a-time mode + * + * Same API as tdeheapgettup, but used in page-at-a-time mode + * + * The internal logic is much the same as tdeheapgettup's too, but there are some + * differences: we do not take the buffer content lock (that only needs to + * happen inside tdeheapgetpage), and we iterate through just the tuples listed + * in rs_vistuples[] rather than all tuples on the page. Notice that + * lineindex is 0-based, where the corresponding loop variable lineoff in + * tdeheapgettup is 1-based. + * ---------------- + */ +static void +tdeheapgettup_pagemode(HeapScanDesc scan, + ScanDirection dir, + int nkeys, + ScanKey key) +{ + HeapTuple tuple = &(scan->rs_ctup); + BlockNumber block; + Page page; + int lineindex; + int linesleft; + + if (unlikely(!scan->rs_inited)) + { + block = tdeheapgettup_initial_block(scan, dir); + /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */ + Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf)); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + block = scan->rs_cblock; /* current page */ + page = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); + + lineindex = scan->rs_cindex + dir; + if (ScanDirectionIsForward(dir)) + linesleft = scan->rs_ntuples - lineindex; + else + linesleft = scan->rs_cindex; + /* lineindex now references the next or previous visible tid */ + + goto continue_page; + } + + /* + * advance the scan until we find a qualifying tuple or run out of stuff + * to scan + */ + while (block != InvalidBlockNumber) + { + tdeheapgetpage((TableScanDesc) scan, block); + page = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); + linesleft = scan->rs_ntuples; + lineindex = ScanDirectionIsForward(dir) ? 
0 : linesleft - 1; + + /* lineindex now references the next or previous visible tid */ +continue_page: + + for (; linesleft > 0; linesleft--, lineindex += dir) + { + ItemId lpp; + OffsetNumber lineoff; + + lineoff = scan->rs_vistuples[lineindex]; + lpp = PageGetItemId(page, lineoff); + Assert(ItemIdIsNormal(lpp)); + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp); + tuple->t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(tuple->t_self), block, lineoff); + + /* skip any tuples that don't match the scan key */ + if (key != NULL && + !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), + nkeys, key)) + continue; + + scan->rs_cindex = lineindex; + return; + } + + /* get the BlockNumber to scan next */ + block = tdeheapgettup_advance_block(scan, block, dir); + } + + /* end of scan */ + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; +} + + +/* ---------------------------------------------------------------- + * heap access method interface + * ---------------------------------------------------------------- + */ + + +TableScanDesc +tdeheap_beginscan(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags) +{ + HeapScanDesc scan; + + /* + * increment relation ref count while scanning relation + * + * This is just to make really sure the relcache entry won't go away while + * the scan has a pointer to it. Caller should be holding the rel open + * anyway, so this is redundant in all normal scenarios... + */ + RelationIncrementReferenceCount(relation); + + /* + * allocate and initialize scan descriptor + */ + scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData)); + + scan->rs_base.rs_rd = relation; + scan->rs_base.rs_snapshot = snapshot; + scan->rs_base.rs_nkeys = nkeys; + scan->rs_base.rs_flags = flags; + scan->rs_base.rs_parallel = parallel_scan; + scan->rs_strategy = NULL; /* set in initscan */ + + /* + * Disable page-at-a-time mode if it's not a MVCC-safe snapshot. + */ + if (!(snapshot && IsMVCCSnapshot(snapshot))) + scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; + + /* + * For seqscan and sample scans in a serializable transaction, acquire a + * predicate lock on the entire relation. This is required not only to + * lock all the matching tuples, but also to conflict with new insertions + * into the table. In an indexscan, we take page locks on the index pages + * covering the range specified in the scan qual, but in a heap scan there + * is nothing more fine-grained to lock. A bitmap scan is a different + * story, there we have already scanned the index and locked the index + * pages covering the predicate. But in that case we still have to lock + * any matching heap tuples. For sample scan we could optimize the locking + * to be at least page-level granularity, but we'd need to add per-tuple + * locking for that. + */ + if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN)) + { + /* + * Ensure a missing snapshot is noticed reliably, even if the + * isolation mode means predicate locking isn't performed (and + * therefore the snapshot isn't used here). + */ + Assert(snapshot); + PredicateLockRelation(relation, snapshot); + } + + /* we only need to set this up once */ + scan->rs_ctup.t_tableOid = RelationGetRelid(relation); + + /* + * Allocate memory to keep track of page allocation for parallel workers + * when doing a parallel scan. 
+ */ + if (parallel_scan != NULL) + scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData)); + else + scan->rs_parallelworkerdata = NULL; + + /* + * we do this here instead of in initscan() because tdeheap_rescan also calls + * initscan() and we don't want to allocate memory again + */ + if (nkeys > 0) + scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + else + scan->rs_base.rs_key = NULL; + + initscan(scan, key, false); + + return (TableScanDesc) scan; +} + +void +tdeheap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, + bool allow_strat, bool allow_sync, bool allow_pagemode) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + if (set_params) + { + if (allow_strat) + scan->rs_base.rs_flags |= SO_ALLOW_STRAT; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT; + + if (allow_sync) + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + + if (allow_pagemode && scan->rs_base.rs_snapshot && + IsMVCCSnapshot(scan->rs_base.rs_snapshot)) + scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; + } + + /* + * unpin scan buffers + */ + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + + /* + * reinitialize scan descriptor + */ + initscan(scan, key, true); +} + +void +tdeheap_endscan(TableScanDesc sscan) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + /* Note: no locking manipulations needed */ + + /* + * unpin scan buffers + */ + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + + /* + * decrement relation reference count and free scan descriptor storage + */ + RelationDecrementReferenceCount(scan->rs_base.rs_rd); + + if (scan->rs_base.rs_key) + pfree(scan->rs_base.rs_key); + + if (scan->rs_strategy != NULL) + FreeAccessStrategy(scan->rs_strategy); + + if (scan->rs_parallelworkerdata != NULL) + pfree(scan->rs_parallelworkerdata); + + if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) + UnregisterSnapshot(scan->rs_base.rs_snapshot); + + pfree(scan); +} + +HeapTuple +tdeheap_getnext(TableScanDesc sscan, ScanDirection direction) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + /* + * This is still widely used directly, without going through table AM, so + * add a safety check. It's possible we should, at a later point, + * downgrade this to an assert. The reason for checking the AM routine, + * rather than the AM oid, is that this allows to write regression tests + * that create another AM reusing the heap handler. + */ + if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine())) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg_internal("only heap AM is supported"))); + + /* + * We don't expect direct calls to tdeheap_getnext with valid CheckXidAlive + * for catalog or regular tables. See detailed comments in xact.c where + * these variables are declared. Normally we have such a check at tableam + * level API but this is called from many places so we need to ensure it + * here. 
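+	 *
+	 * Illustrative sketch of such a direct call path (not part of this
+	 * patch; process() is a placeholder):
+	 *
+	 *     scan = tdeheap_beginscan(rel, snapshot, 0, NULL, NULL,
+	 *                              SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE);
+	 *     while ((tuple = tdeheap_getnext(scan, ForwardScanDirection)) != NULL)
+	 *         process(tuple);
+	 *     tdeheap_endscan(scan);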
+ */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected tdeheap_getnext call during logical decoding"); + + /* Note: no locking manipulations needed */ + + if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) + tdeheapgettup_pagemode(scan, direction, + scan->rs_base.rs_nkeys, scan->rs_base.rs_key); + else + tdeheapgettup(scan, direction, + scan->rs_base.rs_nkeys, scan->rs_base.rs_key); + + if (scan->rs_ctup.t_data == NULL) + return NULL; + + /* + * if we get here it means we have a new current scan tuple, so point to + * the proper return buffer and return the tuple. + */ + + pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); + + return &scan->rs_ctup; +} + +bool +tdeheap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + /* Note: no locking manipulations needed */ + + if (sscan->rs_flags & SO_ALLOW_PAGEMODE) + tdeheapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); + else + tdeheapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); + + if (scan->rs_ctup.t_data == NULL) + { + ExecClearTuple(slot); + return false; + } + + /* + * if we get here it means we have a new current scan tuple, so point to + * the proper return buffer and return the tuple. + */ + + pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); + + ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, + scan->rs_cbuf); + return true; +} + +void +tdeheap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, + ItemPointer maxtid) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + BlockNumber startBlk; + BlockNumber numBlks; + ItemPointerData highestItem; + ItemPointerData lowestItem; + + /* + * For relations without any pages, we can simply leave the TID range + * unset. There will be no tuples to scan, therefore no tuples outside + * the given TID range. + */ + if (scan->rs_nblocks == 0) + return; + + /* + * Set up some ItemPointers which point to the first and last possible + * tuples in the heap. + */ + ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber); + ItemPointerSet(&lowestItem, 0, FirstOffsetNumber); + + /* + * If the given maximum TID is below the highest possible TID in the + * relation, then restrict the range to that, otherwise we scan to the end + * of the relation. + */ + if (ItemPointerCompare(maxtid, &highestItem) < 0) + ItemPointerCopy(maxtid, &highestItem); + + /* + * If the given minimum TID is above the lowest possible TID in the + * relation, then restrict the range to only scan for TIDs above that. + */ + if (ItemPointerCompare(mintid, &lowestItem) > 0) + ItemPointerCopy(mintid, &lowestItem); + + /* + * Check for an empty range and protect from would be negative results + * from the numBlks calculation below. + */ + if (ItemPointerCompare(&highestItem, &lowestItem) < 0) + { + /* Set an empty range of blocks to scan */ + tdeheap_setscanlimits(sscan, 0, 0); + return; + } + + /* + * Calculate the first block and the number of blocks we must scan. We + * could be more aggressive here and perform some more validation to try + * and further narrow the scope of blocks to scan by checking if the + * lowestItem has an offset above MaxOffsetNumber. In this case, we could + * advance startBlk by one. Likewise, if highestItem has an offset of 0 + * we could scan one fewer blocks. However, such an optimization does not + * seem worth troubling over, currently. 
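+	 *
+	 * Worked example (illustrative numbers only): for mintid = (3,5) and
+	 * maxtid = (7,2) in a 20-block relation, lowestItem becomes (3,5) and
+	 * highestItem becomes (7,2), so startBlk = 3 and
+	 * numBlks = 7 - 3 + 1 = 5.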
+ */ + startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem); + + numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) - + ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1; + + /* Set the start block and number of blocks to scan */ + tdeheap_setscanlimits(sscan, startBlk, numBlks); + + /* Finally, set the TID range in sscan */ + ItemPointerCopy(&lowestItem, &sscan->rs_mintid); + ItemPointerCopy(&highestItem, &sscan->rs_maxtid); +} + +bool +tdeheap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, + TupleTableSlot *slot) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + ItemPointer mintid = &sscan->rs_mintid; + ItemPointer maxtid = &sscan->rs_maxtid; + + /* Note: no locking manipulations needed */ + for (;;) + { + if (sscan->rs_flags & SO_ALLOW_PAGEMODE) + tdeheapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); + else + tdeheapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); + + if (scan->rs_ctup.t_data == NULL) + { + ExecClearTuple(slot); + return false; + } + + /* + * tdeheap_set_tidrange will have used tdeheap_setscanlimits to limit the + * range of pages we scan to only ones that can contain the TID range + * we're scanning for. Here we must filter out any tuples from these + * pages that are outside of that range. + */ + if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0) + { + ExecClearTuple(slot); + + /* + * When scanning backwards, the TIDs will be in descending order. + * Future tuples in this direction will be lower still, so we can + * just return false to indicate there will be no more tuples. + */ + if (ScanDirectionIsBackward(direction)) + return false; + + continue; + } + + /* + * Likewise for the final page, we must filter out TIDs greater than + * maxtid. + */ + if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0) + { + ExecClearTuple(slot); + + /* + * When scanning forward, the TIDs will be in ascending order. + * Future tuples in this direction will be higher still, so we can + * just return false to indicate there will be no more tuples. + */ + if (ScanDirectionIsForward(direction)) + return false; + continue; + } + + break; + } + + /* + * if we get here it means we have a new current scan tuple, so point to + * the proper return buffer and return the tuple. + */ + pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); + + ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf); + return true; +} + +/* + * tdeheap_fetch - retrieve tuple with given tid + * + * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding + * the tuple, fill in the remaining fields of *tuple, and check the tuple + * against the specified snapshot. + * + * If successful (tuple found and passes snapshot time qual), then *userbuf + * is set to the buffer holding the tuple and true is returned. The caller + * must unpin the buffer when done with the tuple. + * + * If the tuple is not found (ie, item number references a deleted slot), + * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer, + * and false is returned. + * + * If the tuple is found but fails the time qual check, then the behavior + * depends on the keep_buf parameter. If keep_buf is false, the results + * are the same as for the tuple-not-found case. If keep_buf is true, + * then tuple->t_data and *userbuf are returned as for the success case, + * and again the caller must unpin the buffer; but false is returned. + * + * tdeheap_fetch does not follow HOT chains: only the exact TID requested will + * be fetched. 
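+ *
+ * Illustrative caller sketch (not part of this patch; tid, relation and
+ * snapshot are assumed to be in scope):
+ *
+ *     HeapTupleData tuple;
+ *     Buffer        buf;
+ *
+ *     tuple.t_self = *tid;
+ *     if (tdeheap_fetch(relation, snapshot, &tuple, &buf, false))
+ *     {
+ *         ... examine tuple.t_data while holding the pin ...
+ *         ReleaseBuffer(buf);
+ *     }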
+ * + * It is somewhat inconsistent that we ereport() on invalid block number but + * return false on invalid item number. There are a couple of reasons though. + * One is that the caller can relatively easily check the block number for + * validity, but cannot check the item number without reading the page + * himself. Another is that when we are following a t_ctid link, we can be + * reasonably confident that the page number is valid (since VACUUM shouldn't + * truncate off the destination page without having killed the referencing + * tuple first), but the item number might well not be good. + */ +bool +tdeheap_fetch(Relation relation, + Snapshot snapshot, + HeapTuple tuple, + Buffer *userbuf, + bool keep_buf) +{ + ItemPointer tid = &(tuple->t_self); + ItemId lp; + Buffer buffer; + Page page; + OffsetNumber offnum; + bool valid; + + /* + * Fetch and pin the appropriate page of the relation. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + + /* + * Need share lock on buffer to examine tuple commit status. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); + + /* + * We'd better check for out-of-range offnum in case of VACUUM since the + * TID was obtained. + */ + offnum = ItemPointerGetOffsetNumber(tid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + tuple->t_data = NULL; + return false; + } + + /* + * get the item line pointer corresponding to the requested tid + */ + lp = PageGetItemId(page, offnum); + + /* + * Must check for deleted tuple. + */ + if (!ItemIdIsNormal(lp)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + tuple->t_data = NULL; + return false; + } + + /* + * fill in *tuple fields + */ + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); + tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(relation); + + /* + * check tuple visibility, then release lock + */ + valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + + if (valid) + PredicateLockTID(relation, &(tuple->t_self), snapshot, + HeapTupleHeaderGetXmin(tuple->t_data)); + + HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (valid) + { + /* + * All checks passed, so return the tuple as valid. Caller is now + * responsible for releasing the buffer. + */ + *userbuf = buffer; + + return true; + } + + /* Tuple failed time qual, but maybe caller wants to see it anyway. */ + if (keep_buf) + *userbuf = buffer; + else + { + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + tuple->t_data = NULL; + } + + return false; +} + +/* + * tdeheap_hot_search_buffer - search HOT chain for tuple satisfying snapshot + * + * On entry, *tid is the TID of a tuple (either a simple tuple, or the root + * of a HOT chain), and buffer is the buffer holding this tuple. We search + * for the first chain member satisfying the given snapshot. If one is + * found, we update *tid to reference that tuple's offset number, and + * return true. If no match, return false without modifying *tid. + * + * heapTuple is a caller-supplied buffer. When a match is found, we return + * the tuple here, in addition to updating *tid. If no match is found, the + * contents of this buffer on return are undefined. 
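+ *
+ * Illustrative caller sketch (not part of this patch; the buffer must
+ * already be pinned and share-locked, as described below):
+ *
+ *     HeapTupleData heapTuple;
+ *     bool          all_dead;
+ *
+ *     found = tdeheap_hot_search_buffer(&tid, relation, buffer, snapshot,
+ *                                       &heapTuple, &all_dead, true);
+ *
+ * On success, tid has been advanced to the visible chain member.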
+ * + * If all_dead is not NULL, we check non-visible tuples to see if they are + * globally dead; *all_dead is set true if all members of the HOT chain + * are vacuumable, false if not. + * + * Unlike tdeheap_fetch, the caller must already have pin and (at least) share + * lock on the buffer; it is still pinned/locked at exit. + */ +bool +tdeheap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, + Snapshot snapshot, HeapTuple heapTuple, + bool *all_dead, bool first_call) +{ + Page page = BufferGetPage(buffer); + TransactionId prev_xmax = InvalidTransactionId; + BlockNumber blkno; + OffsetNumber offnum; + bool at_chain_start; + bool valid; + bool skip; + GlobalVisState *vistest = NULL; + + /* If this is not the first call, previous call returned a (live!) tuple */ + if (all_dead) + *all_dead = first_call; + + blkno = ItemPointerGetBlockNumber(tid); + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = first_call; + skip = !first_call; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + Assert(BufferGetBlockNumber(buffer) == blkno); + + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) + break; + + lp = PageGetItemId(page, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } + + /* + * Update heapTuple to point to the element of the HOT chain we're + * currently investigating. Having t_self set correctly is important + * because the SSI checks and the *Satisfies routine for historical + * MVCC snapshots need the correct tid to decide about the visibility. + */ + heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); + heapTuple->t_len = ItemIdGetLength(lp); + heapTuple->t_tableOid = RelationGetRelid(relation); + ItemPointerSet(&heapTuple->t_self, blkno, offnum); + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. + */ + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is + * broken. + */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple->t_data))) + break; + + /* + * When first_call is true (and thus, skip is initially false) we'll + * return the first tuple we find. But on later passes, heapTuple + * will initially be pointing to the tuple we returned last time. + * Returning it again would be incorrect (and would loop forever), so + * we skip it and return the next match we find. + */ + if (!skip) + { + /* If it's visible per the snapshot, we must return it */ + valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); + HeapCheckForSerializableConflictOut(valid, relation, heapTuple, + buffer, snapshot); + + if (valid) + { + ItemPointerSetOffsetNumber(tid, offnum); + PredicateLockTID(relation, &heapTuple->t_self, snapshot, + HeapTupleHeaderGetXmin(heapTuple->t_data)); + if (all_dead) + *all_dead = false; + return true; + } + } + skip = false; + + /* + * If we can't see it, maybe no one else can either. At caller + * request, check whether all chain members are dead to all + * transactions. 
+ * + * Note: if you change the criterion here for what is "dead", fix the + * planner's get_actual_variable_range() function to match. + */ + if (all_dead && *all_dead) + { + if (!vistest) + vistest = GlobalVisTestFor(relation); + + if (!HeapTupleIsSurelyDead(heapTuple, vistest)) + *all_dead = false; + } + + /* + * Check to see if HOT chain continues past this tuple; if so fetch + * the next offnum and loop around. + */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) == + blkno); + offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid); + at_chain_start = false; + prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); + } + else + break; /* end of chain */ + } + + return false; +} + +/* + * tdeheap_get_latest_tid - get the latest tid of a specified tuple + * + * Actually, this gets the latest version that is visible according to the + * scan's snapshot. Create a scan using SnapshotDirty to get the very latest, + * possibly uncommitted version. + * + * *tid is both an input and an output parameter: it is updated to + * show the latest version of the row. Note that it will not be changed + * if no version of the row passes the snapshot test. + */ +void +tdeheap_get_latest_tid(TableScanDesc sscan, + ItemPointer tid) +{ + Relation relation = sscan->rs_rd; + Snapshot snapshot = sscan->rs_snapshot; + ItemPointerData ctid; + TransactionId priorXmax; + + /* + * table_tuple_get_latest_tid() verified that the passed in tid is valid. + * Assume that t_ctid links are valid however - there shouldn't be invalid + * ones in the table. + */ + Assert(ItemPointerIsValid(tid)); + + /* + * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we + * need to examine, and *tid is the TID we will return if ctid turns out + * to be bogus. + * + * Note that we will loop until we reach the end of the t_ctid chain. + * Depending on the snapshot passed, there might be at most one visible + * version of the row, but we don't try to optimize for that. + */ + ctid = *tid; + priorXmax = InvalidTransactionId; /* cannot check first XMIN */ + for (;;) + { + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp; + HeapTupleData tp; + bool valid; + + /* + * Read, pin, and lock the page. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); + + /* + * Check for bogus item number. This is not treated as an error + * condition because it can happen while following a t_ctid link. We + * just assume that the prior tid is OK and return it unchanged. + */ + offnum = ItemPointerGetOffsetNumber(&ctid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) + { + UnlockReleaseBuffer(buffer); + break; + } + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) + { + UnlockReleaseBuffer(buffer); + break; + } + + /* OK to access the tuple */ + tp.t_self = ctid; + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_tableOid = RelationGetRelid(relation); + + /* + * After following a t_ctid link, we might arrive at an unrelated + * tuple. Check for XMIN match. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + { + UnlockReleaseBuffer(buffer); + break; + } + + /* + * Check tuple visibility; if visible, set it as the new result + * candidate. 
+ */ + valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); + HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); + if (valid) + *tid = ctid; + + /* + * If there's a valid t_ctid link, follow it, else we're done. + */ + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsOnlyLocked(tp.t_data) || + HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) || + ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + { + UnlockReleaseBuffer(buffer); + break; + } + + ctid = tp.t_data->t_ctid; + priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); + UnlockReleaseBuffer(buffer); + } /* end of loop */ +} + + +/* + * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends + * + * This is called after we have waited for the XMAX transaction to terminate. + * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will + * be set on exit. If the transaction committed, we set the XMAX_COMMITTED + * hint bit if possible --- but beware that that may not yet be possible, + * if the transaction committed asynchronously. + * + * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID + * even if it commits. + * + * Hence callers should look only at XMAX_INVALID. + * + * Note this is not allowed for tuples whose xmax is a multixact. + */ +static void +UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid) +{ + Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid)); + Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + + if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))) + { + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) && + TransactionIdDidCommit(xid)) + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + xid); + else + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + } +} + + +/* + * GetBulkInsertState - prepare status object for a bulk insert + */ +BulkInsertState +GetBulkInsertState(void) +{ + BulkInsertState bistate; + + bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData)); + bistate->strategy = GetAccessStrategy(BAS_BULKWRITE); + bistate->current_buf = InvalidBuffer; + bistate->next_free = InvalidBlockNumber; + bistate->last_free = InvalidBlockNumber; + bistate->already_extended_by = 0; + return bistate; +} + +/* + * FreeBulkInsertState - clean up after finishing a bulk insert + */ +void +FreeBulkInsertState(BulkInsertState bistate) +{ + if (bistate->current_buf != InvalidBuffer) + ReleaseBuffer(bistate->current_buf); + FreeAccessStrategy(bistate->strategy); + pfree(bistate); +} + +/* + * ReleaseBulkInsertStatePin - release a buffer currently held in bistate + */ +void +ReleaseBulkInsertStatePin(BulkInsertState bistate) +{ + if (bistate->current_buf != InvalidBuffer) + ReleaseBuffer(bistate->current_buf); + bistate->current_buf = InvalidBuffer; + + /* + * Despite the name, we also reset bulk relation extension state. + * Otherwise we can end up erroring out due to looking for free space in + * ->next_free of one partition, even though ->next_free was set when + * extending another partition. It could obviously also be bad for + * efficiency to look at existing blocks at offsets from another + * partition, even if we don't error out. + */ + bistate->next_free = InvalidBlockNumber; + bistate->last_free = InvalidBlockNumber; +} + + +/* + * tdeheap_insert - insert tuple into a heap + * + * The new tuple is stamped with current transaction ID and the specified + * command ID. 
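+ *
+ * Illustrative caller sketch (not part of this patch; rel, values and
+ * isnull are assumed to be in scope):
+ *
+ *     HeapTuple tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);
+ *
+ *     tdeheap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);
+ *
+ * after which tup->t_self carries the assigned TID; simple_tdeheap_insert()
+ * further down wraps this same calling pattern.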
+ * + * See table_tuple_insert for comments about most of the input flags, except + * that this routine directly takes a tuple rather than a slot. + * + * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_ + * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to + * implement table_tuple_insert_speculative(). + * + * On return the header fields of *tup are updated to match the stored tuple; + * in particular tup->t_self receives the actual TID where the tuple was + * stored. But note that any toasting of fields within the tuple data is NOT + * reflected into *tup. + */ +void +tdeheap_insert(Relation relation, HeapTuple tup, CommandId cid, + int options, BulkInsertState bistate) +{ + TransactionId xid = GetCurrentTransactionId(); + HeapTuple heaptup; + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + bool all_visible_cleared = false; + + /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ + Assert(HeapTupleHeaderGetNatts(tup->t_data) <= + RelationGetNumberOfAttributes(relation)); + + /* + * Fill in tuple header fields and toast the tuple if necessary. + * + * Note: below this point, heaptup is the data we actually intend to store + * into the relation; tup is the caller's original untoasted data. + */ + heaptup = tdeheap_prepare_insert(relation, tup, xid, cid, options); + + /* + * Find buffer to insert this tuple into. If the page is all visible, + * this will also pin the requisite visibility map page. + */ + buffer = tdeheap_RelationGetBufferForTuple(relation, heaptup->t_len, + InvalidBuffer, options, bistate, + &vmbuffer, NULL, + 0); + + /* + * We're about to do the actual insert -- but check for conflict first, to + * avoid possibly having to roll back work we've just done. + * + * This is safe without a recheck as long as there is no possibility of + * another process scanning the page between this check and the insert + * being visible to the scan (i.e., an exclusive buffer content lock is + * continuously held from this point until the tuple insert is visible). + * + * For a heap insert, we only need to check for table-level SSI locks. Our + * new tuple can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call, which makes for a faster check. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, + (options & HEAP_INSERT_SPECULATIVE) != 0); + + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + tdeheap_visibilitymap_clear(relation, + ItemPointerGetBlockNumber(&(heaptup->t_self)), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + + /* + * XXX Should we set PageSetPrunable on this page ? + * + * The inserting transaction may eventually abort thus making this tuple + * DEAD and hence available for pruning. Though we don't want to optimize + * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the + * aborted tuple will never be pruned until next vacuum is triggered. + * + * If you do add PageSetPrunable here, add it in tdeheap_xlog_insert too. 
+ */ + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_insert xlrec; + xl_tdeheap_header xlhdr; + XLogRecPtr recptr; + Page page = BufferGetPage(buffer); + uint8 info = XLOG_HEAP_INSERT; + int bufflags = 0; + + /* + * If this is a catalog, we need to transmit combo CIDs to properly + * decode, so log that as well. + */ + if (RelationIsAccessibleInLogicalDecoding(relation)) + log_tdeheap_new_cid(relation, heaptup); + + /* + * If this is the single and first tuple on page, we can reinit the + * page instead of restoring the whole thing. Set flag, and hide + * buffer references from XLogInsert. + */ + if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber && + PageGetMaxOffsetNumber(page) == FirstOffsetNumber) + { + info |= XLOG_HEAP_INIT_PAGE; + bufflags |= REGBUF_WILL_INIT; + } + + xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); + xlrec.flags = 0; + if (all_visible_cleared) + xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED; + if (options & HEAP_INSERT_SPECULATIVE) + xlrec.flags |= XLH_INSERT_IS_SPECULATIVE; + Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer)); + + /* + * For logical decoding, we need the tuple even if we're doing a full + * page write, so make sure it's included even if we take a full-page + * image. (XXX We could alternatively store a pointer into the FPW). + */ + if (RelationIsLogicallyLogged(relation) && + !(options & HEAP_INSERT_NO_LOGICAL)) + { + xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; + bufflags |= REGBUF_KEEP_DATA; + + if (IsToastRelation(relation)) + xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; + } + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); + + xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; + xlhdr.t_infomask = heaptup->t_data->t_infomask; + xlhdr.t_hoff = heaptup->t_data->t_hoff; + + /* + * note we mark xlhdr as belonging to buffer; if XLogInsert decides to + * write the whole page to the xlog, we don't need to store + * xl_tdeheap_header in the xlog. + */ + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ + XLogRegisterBufData(0, + (char *) heaptup->t_data + SizeofHeapTupleHeader, + heaptup->t_len - SizeofHeapTupleHeader); + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP_ID, info); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* + * If tuple is cachable, mark it for invalidation from the caches in case + * we abort. Note it is OK to do this after releasing the buffer, because + * the heaptup data structure is all in local memory, not in the shared + * buffer. + */ + CacheInvalidateHeapTuple(relation, heaptup, NULL); + + /* Note: speculative insertions are counted too, even if aborted later */ + pgstat_count_tdeheap_insert(relation, 1); + + /* + * If heaptup is a private copy, release it. Don't forget to copy t_self + * back to the caller's image, too. + */ + if (heaptup != tup) + { + tup->t_self = heaptup->t_self; + tdeheap_freetuple(heaptup); + } +} + +/* + * Subroutine for tdeheap_insert(). Prepares a tuple for insertion. This sets the + * tuple header fields and toasts the tuple if necessary. 
Returns a toasted + * version of the tuple if it was toasted, or the original tuple if not. Note + * that in any case, the header fields are also set in the original tuple. + */ +static HeapTuple +tdeheap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, + CommandId cid, int options) +{ + /* + * To allow parallel inserts, we need to ensure that they are safe to be + * performed in workers. We have the infrastructure to allow parallel + * inserts in general except for the cases where inserts generate a new + * CommandId (eg. inserts into a table having a foreign key column). + */ + if (IsParallelWorker()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot insert tuples in a parallel worker"))); + + tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); + tup->t_data->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderSetXmin(tup->t_data, xid); + if (options & HEAP_INSERT_FROZEN) + HeapTupleHeaderSetXminFrozen(tup->t_data); + + HeapTupleHeaderSetCmin(tup->t_data, cid); + HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */ + tup->t_tableOid = RelationGetRelid(relation); + + /* + * If the new tuple is too big for storage or contains already toasted + * out-of-line attributes from some other relation, invoke the toaster. + */ + if (relation->rd_rel->relkind != RELKIND_RELATION && + relation->rd_rel->relkind != RELKIND_MATVIEW) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(tup)); + return tup; + } + else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) + return tdeheap_toast_insert_or_update(relation, tup, NULL, options); + else + return tup; +} + +/* + * Helper for tdeheap_multi_insert() that computes the number of entire pages + * that inserting the remaining heaptuples requires. Used to determine how + * much the relation needs to be extended by. + */ +static int +tdeheap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace) +{ + size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; + int npages = 1; + + for (int i = done; i < ntuples; i++) + { + size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len); + + if (page_avail < tup_sz) + { + npages++; + page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; + } + page_avail -= tup_sz; + } + + return npages; +} + +/* + * tdeheap_multi_insert - insert multiple tuples into a heap + * + * This is like tdeheap_insert(), but inserts multiple tuples in one operation. + * That's faster than calling tdeheap_insert() in a loop, because when multiple + * tuples can be inserted on a single page, we can write just a single WAL + * record covering all of them, and only need to lock/unlock the page once. + * + * Note: this leaks memory into the current memory context. You can create a + * temporary context before calling this, if that's a problem. 
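+ *
+ * A minimal sketch of that workaround (illustrative only; rel, slots,
+ * ntuples, cid and bistate are assumed to be in scope):
+ *
+ *     MemoryContext tmp = AllocSetContextCreate(CurrentMemoryContext,
+ *                                               "multi-insert work",
+ *                                               ALLOCSET_DEFAULT_SIZES);
+ *     MemoryContext old = MemoryContextSwitchTo(tmp);
+ *
+ *     tdeheap_multi_insert(rel, slots, ntuples, cid, 0, bistate);
+ *
+ *     MemoryContextSwitchTo(old);
+ *     MemoryContextDelete(tmp);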
+ */ +void +tdeheap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, + CommandId cid, int options, BulkInsertState bistate) +{ + TransactionId xid = GetCurrentTransactionId(); + HeapTuple *heaptuples; + int i; + int ndone; + PGAlignedBlock scratch; + Page page; + Buffer vmbuffer = InvalidBuffer; + bool needwal; + Size saveFreeSpace; + bool need_tuple_data = RelationIsLogicallyLogged(relation); + bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); + bool starting_with_empty_page = false; + int npages = 0; + int npages_used = 0; + + /* currently not needed (thus unsupported) for tdeheap_multi_insert() */ + Assert(!(options & HEAP_INSERT_NO_LOGICAL)); + + needwal = RelationNeedsWAL(relation); + saveFreeSpace = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + + /* Toast and set header data in all the slots */ + heaptuples = palloc(ntuples * sizeof(HeapTuple)); + for (i = 0; i < ntuples; i++) + { + HeapTuple tuple; + + tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL); + slots[i]->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slots[i]->tts_tableOid; + heaptuples[i] = tdeheap_prepare_insert(relation, tuple, xid, cid, + options); + } + + /* + * We're about to do the actual inserts -- but check for conflict first, + * to minimize the possibility of having to roll back work we've just + * done. + * + * A check here does not definitively prevent a serialization anomaly; + * that check MUST be done at least past the point of acquiring an + * exclusive buffer content lock on every buffer that will be affected, + * and MAY be done after all inserts are reflected in the buffers and + * those locks are released; otherwise there is a race condition. Since + * multiple buffers can be locked and unlocked in the loop below, and it + * would not be feasible to identify and lock all of those buffers before + * the loop, we must do a final check at the end. + * + * The check here could be omitted with no loss of correctness; it is + * present strictly as an optimization. + * + * For heap inserts, we only need to check for table-level SSI locks. Our + * new tuples can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call, which makes for a faster check. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + ndone = 0; + while (ndone < ntuples) + { + Buffer buffer; + bool all_visible_cleared = false; + bool all_frozen_set = false; + int nthispage; + + CHECK_FOR_INTERRUPTS(); + + /* + * Compute number of pages needed to fit the to-be-inserted tuples in + * the worst case. This will be used to determine how much to extend + * the relation by in tdeheap_RelationGetBufferForTuple(), if needed. If we + * filled a prior page from scratch, we can just update our last + * computation, but if we started with a partially filled page, + * recompute from scratch, the number of potentially required pages + * can vary due to tuples needing to fit onto the page, page headers + * etc. + */ + if (ndone == 0 || !starting_with_empty_page) + { + npages = tdeheap_multi_insert_pages(heaptuples, ndone, ntuples, + saveFreeSpace); + npages_used = 0; + } + else + npages_used++; + + /* + * Find buffer where at least the next tuple will fit. If the page is + * all-visible, this will also pin the requisite visibility map page. 
+ * + * Also pin visibility map page if COPY FREEZE inserts tuples into an + * empty page. See all_frozen_set below. + */ + buffer = tdeheap_RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len, + InvalidBuffer, options, bistate, + &vmbuffer, NULL, + npages - npages_used); + page = BufferGetPage(buffer); + + starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0; + + if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN)) + all_frozen_set = true; + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + /* + * tdeheap_RelationGetBufferForTuple has ensured that the first tuple fits. + * Put that on the page, and then as many other tuples as fit. + */ + tdeheap_RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); + + /* + * For logical decoding we need combo CIDs to properly decode the + * catalog. + */ + if (needwal && need_cids) + log_tdeheap_new_cid(relation, heaptuples[ndone]); + + for (nthispage = 1; ndone + nthispage < ntuples; nthispage++) + { + HeapTuple heaptup = heaptuples[ndone + nthispage]; + + if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) + break; + + tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, false); + + /* + * For logical decoding we need combo CIDs to properly decode the + * catalog. + */ + if (needwal && need_cids) + log_tdeheap_new_cid(relation, heaptup); + } + + /* + * If the page is all visible, need to clear that, unless we're only + * going to add further frozen rows to it. + * + * If we're only adding already frozen rows to a previously empty + * page, mark it as all-visible. + */ + if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + tdeheap_visibilitymap_clear(relation, + BufferGetBlockNumber(buffer), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + else if (all_frozen_set) + PageSetAllVisible(page); + + /* + * XXX Should we set PageSetPrunable on this page ? See tdeheap_insert() + */ + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (needwal) + { + XLogRecPtr recptr; + xl_tdeheap_multi_insert *xlrec; + uint8 info = XLOG_HEAP2_MULTI_INSERT; + char *tupledata; + int totaldatalen; + char *scratchptr = scratch.data; + bool init; + int bufflags = 0; + + /* + * If the page was previously empty, we can reinit the page + * instead of restoring the whole thing. + */ + init = starting_with_empty_page; + + /* allocate xl_tdeheap_multi_insert struct from the scratch area */ + xlrec = (xl_tdeheap_multi_insert *) scratchptr; + scratchptr += SizeOfHeapMultiInsert; + + /* + * Allocate offsets array. Unless we're reinitializing the page, + * in that case the tuples are stored in order starting at + * FirstOffsetNumber and we don't need to store the offsets + * explicitly. + */ + if (!init) + scratchptr += nthispage * sizeof(OffsetNumber); + + /* the rest of the scratch space is used for tuple data */ + tupledata = scratchptr; + + /* check that the mutually exclusive flags are not both set */ + Assert(!(all_visible_cleared && all_frozen_set)); + + xlrec->flags = 0; + if (all_visible_cleared) + xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED; + if (all_frozen_set) + xlrec->flags = XLH_INSERT_ALL_FROZEN_SET; + + xlrec->ntuples = nthispage; + + /* + * Write out an xl_multi_insert_tuple and the tuple data itself + * for each tuple. 
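+			 *
+			 * Resulting scratch-area layout, for reference:
+			 *
+			 *     xl_tdeheap_multi_insert
+			 *     OffsetNumber offsets[nthispage]      (omitted when init)
+			 *     xl_multi_insert_tuple + tuple data   (repeated nthispage times)
+			 *
+			 * The header portion is registered below with XLogRegisterData()
+			 * and the per-tuple portion with XLogRegisterBufData().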
+ */ + for (i = 0; i < nthispage; i++) + { + HeapTuple heaptup = heaptuples[ndone + i]; + xl_multi_insert_tuple *tuphdr; + int datalen; + + if (!init) + xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); + /* xl_multi_insert_tuple needs two-byte alignment. */ + tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); + scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; + + tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; + tuphdr->t_infomask = heaptup->t_data->t_infomask; + tuphdr->t_hoff = heaptup->t_data->t_hoff; + + /* write bitmap [+ padding] [+ oid] + data */ + datalen = heaptup->t_len - SizeofHeapTupleHeader; + memcpy(scratchptr, + (char *) heaptup->t_data + SizeofHeapTupleHeader, + datalen); + tuphdr->datalen = datalen; + scratchptr += datalen; + } + totaldatalen = scratchptr - tupledata; + Assert((scratchptr - scratch.data) < BLCKSZ); + + if (need_tuple_data) + xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; + + /* + * Signal that this is the last xl_tdeheap_multi_insert record + * emitted by this call to tdeheap_multi_insert(). Needed for logical + * decoding so it knows when to cleanup temporary data. + */ + if (ndone + nthispage == ntuples) + xlrec->flags |= XLH_INSERT_LAST_IN_MULTI; + + if (init) + { + info |= XLOG_HEAP_INIT_PAGE; + bufflags |= REGBUF_WILL_INIT; + } + + /* + * If we're doing logical decoding, include the new tuple data + * even if we take a full-page image of the page. + */ + if (need_tuple_data) + bufflags |= REGBUF_KEEP_DATA; + + XLogBeginInsert(); + XLogRegisterData((char *) xlrec, tupledata - scratch.data); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + + XLogRegisterBufData(0, tupledata, totaldatalen); + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP2_ID, info); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* + * If we've frozen everything on the page, update the visibilitymap. + * We're already holding pin on the vmbuffer. + */ + if (all_frozen_set) + { + Assert(PageIsAllVisible(page)); + Assert(tdeheap_visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); + + /* + * It's fine to use InvalidTransactionId here - this is only used + * when HEAP_INSERT_FROZEN is specified, which intentionally + * violates visibility rules. + */ + tdeheap_visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer, + InvalidXLogRecPtr, vmbuffer, + InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + } + + UnlockReleaseBuffer(buffer); + ndone += nthispage; + + /* + * NB: Only release vmbuffer after inserting all tuples - it's fairly + * likely that we'll insert into subsequent heap pages that are likely + * to use the same vm page. + */ + } + + /* We're done with inserting all tuples, so release the last vmbuffer. */ + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* + * We're done with the actual inserts. Check for conflicts again, to + * ensure that all rw-conflicts in to these inserts are detected. Without + * this final check, a sequential scan of the heap may have locked the + * table after the "before" check, missing one opportunity to detect the + * conflict, and then scanned the table before the new tuples were there, + * missing the other chance to detect the conflict. + * + * For heap inserts, we only need to check for table-level SSI locks. 
Our + * new tuples can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + /* + * If tuples are cachable, mark them for invalidation from the caches in + * case we abort. Note it is OK to do this after releasing the buffer, + * because the heaptuples data structure is all in local memory, not in + * the shared buffer. + */ + if (IsCatalogRelation(relation)) + { + for (i = 0; i < ntuples; i++) + CacheInvalidateHeapTuple(relation, heaptuples[i], NULL); + } + + /* copy t_self fields back to the caller's slots */ + for (i = 0; i < ntuples; i++) + slots[i]->tts_tid = heaptuples[i]->t_self; + + pgstat_count_tdeheap_insert(relation, ntuples); +} + +/* + * simple_tdeheap_insert - insert a tuple + * + * Currently, this routine differs from tdeheap_insert only in supplying + * a default command ID and not allowing access to the speedup options. + * + * This should be used rather than using tdeheap_insert directly in most places + * where we are modifying system catalogs. + */ +void +simple_tdeheap_insert(Relation relation, HeapTuple tup) +{ + tdeheap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); +} + +/* + * Given infomask/infomask2, compute the bits that must be saved in the + * "infobits" field of xl_tdeheap_delete, xl_tdeheap_update, xl_tdeheap_lock, + * xl_tdeheap_lock_updated WAL records. + * + * See fix_infomask_from_infobits. + */ +static uint8 +compute_infobits(uint16 infomask, uint16 infomask2) +{ + return + ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | + ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | + ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | + /* note we ignore HEAP_XMAX_SHR_LOCK here */ + ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | + ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? + XLHL_KEYS_UPDATED : 0); +} + +/* + * Given two versions of the same t_infomask for a tuple, compare them and + * return whether the relevant status for a tuple Xmax has changed. This is + * used after a buffer lock has been released and reacquired: we want to ensure + * that the tuple state continues to be the same it was when we previously + * examined it. + * + * Note the Xmax field itself must be compared separately. + */ +static inline bool +xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) +{ + const uint16 interesting = + HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK; + + if ((new_infomask & interesting) != (old_infomask & interesting)) + return true; + + return false; +} + +/* + * tdeheap_delete - delete a tuple + * + * See table_tuple_delete() for an explanation of the parameters, except that + * this routine directly takes a tuple rather than a slot. + * + * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last + * only for TM_SelfModified, since we cannot obtain cmax from a combo CID + * generated by another transaction). 
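+ *
+ * Illustrative caller sketch (not part of this patch), mirroring upstream
+ * simple_heap_delete(); a real caller would distinguish the individual
+ * TM_* failure codes:
+ *
+ *     TM_FailureData tmfd;
+ *     TM_Result      result;
+ *
+ *     result = tdeheap_delete(rel, tid, GetCurrentCommandId(true),
+ *                             InvalidSnapshot, true, &tmfd, false);
+ *     if (result != TM_Ok)
+ *         elog(ERROR, "tuple concurrently updated");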
+ */ +TM_Result +tdeheap_delete(Relation relation, ItemPointer tid, + CommandId cid, Snapshot crosscheck, bool wait, + TM_FailureData *tmfd, bool changingPart) +{ + TM_Result result; + TransactionId xid = GetCurrentTransactionId(); + ItemId lp; + HeapTupleData tp; + Page page; + BlockNumber block; + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + TransactionId new_xmax; + uint16 new_infomask, + new_infomask2; + bool have_tuple_lock = false; + bool iscombo; + bool all_visible_cleared = false; + HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ + bool old_key_copied = false; + + Assert(ItemPointerIsValid(tid)); + + /* + * Forbid this during a parallel operation, lest it allocate a combo CID. + * Other workers might need that combo CID for visibility checks, and we + * have no provision for broadcasting it to them. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot delete tuples during a parallel operation"))); + + block = ItemPointerGetBlockNumber(tid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. + */ + if (PageIsAllVisible(page)) + tdeheap_visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tp.t_tableOid = RelationGetRelid(relation); + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_self = *tid; + +l1: + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, we'll have to unlock and + * re-lock, to avoid holding the buffer lock across an I/O. That's a bit + * unfortunate, but hopefully shouldn't happen often. + */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + tdeheap_visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + + result = HeapTupleSatisfiesUpdate(&tp, cid, buffer); + + if (result == TM_Invisible) + { + UnlockReleaseBuffer(buffer); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to delete invisible tuple"))); + } + else if (result == TM_BeingModified && wait) + { + TransactionId xwait; + uint16 infomask; + + /* must copy state data before unlocking buffer */ + xwait = HeapTupleHeaderGetRawXmax(tp.t_data); + infomask = tp.t_data->t_infomask; + + /* + * Sleep until concurrent transaction ends -- except when there's a + * single locker and it's our own transaction. Note we don't care + * which lock mode the locker has, because we need the strongest one. + * + * Before sleeping, we need to acquire tuple lock to establish our + * priority for the tuple (see tdeheap_lock_tuple). LockTuple will + * release us when we are next-in-line for the tuple. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while rechecking + * tuple state. 
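+ * Note the heavyweight tuple lock is taken only when we actually have to
+ * sleep; an uncontended delete never locks the tuple in the lock manager.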
+ */
+ if (infomask & HEAP_XMAX_IS_MULTI)
+ {
+ bool current_is_member = false;
+
+ if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
+ LockTupleExclusive, &current_is_member))
+ {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Acquire the lock, if necessary (but skip it when we're
+ * requesting a lock and already have one; avoids deadlock).
+ */
+ if (!current_is_member)
+ tdeheap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
+ LockWaitBlock, &have_tuple_lock);
+
+ /* wait for multixact */
+ MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
+ relation, &(tp.t_self), XLTW_Delete,
+ NULL);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * If xwait had just locked the tuple then some other xact
+ * could update this tuple before we get to this point. Check
+ * for xmax change, and start over if so.
+ *
+ * We also must start over if we didn't pin the VM page, and
+ * the page has become all visible.
+ */
+ if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
+ xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
+ xwait))
+ goto l1;
+ }
+
+ /*
+ * You might think the multixact is necessarily done here, but not
+ * so: it could have surviving members, namely our own xact or
+ * other subxacts of this backend. It is legal for us to delete
+ * the tuple in either case, however (the latter case is
+ * essentially a situation of upgrading our former shared lock to
+ * exclusive). We don't bother changing the on-disk hint bits
+ * since we are about to overwrite the xmax altogether.
+ */
+ }
+ else if (!TransactionIdIsCurrentTransactionId(xwait))
+ {
+ /*
+ * Wait for regular transaction to end; but first, acquire tuple
+ * lock.
+ */
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ tdeheap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
+ LockWaitBlock, &have_tuple_lock);
+ XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * xwait is done, but if xwait had just locked the tuple then some
+ * other xact could update this tuple before we get to this point.
+ * Check for xmax change, and start over if so.
+ *
+ * We also must start over if we didn't pin the VM page, and the
+ * page has become all visible.
+ */
+ if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
+ xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
+ xwait))
+ goto l1;
+
+ /* Otherwise check if it committed or aborted */
+ UpdateXmaxHintBits(tp.t_data, buffer, xwait);
+ }
+
+ /*
+ * We may overwrite if previous xmax aborted, or if it committed but
+ * only locked the tuple without updating it.
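+ * (An aborted updater is treated the same way as a pure locker by the
+ * checks below.)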
+ */ + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tp.t_data)) + result = TM_Ok; + else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + result = TM_Updated; + else + result = TM_Deleted; + } + + /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */ + if (result != TM_Ok) + { + Assert(result == TM_SelfModified || + result == TM_Updated || + result == TM_Deleted || + result == TM_BeingModified); + Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); + Assert(result != TM_Updated || + !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)); + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer)) + result = TM_Updated; + } + + if (result != TM_Ok) + { + tmfd->ctid = tp.t_data->t_ctid; + tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data); + if (result == TM_SelfModified) + tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); + else + tmfd->cmax = InvalidCommandId; + UnlockReleaseBuffer(buffer); + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + return result; + } + + /* + * We're about to do the actual delete -- check for conflict first, to + * avoid possibly having to roll back work we've just done. + * + * This is safe without a recheck as long as there is no possibility of + * another process scanning the page between this check and the delete + * being visible to the scan (i.e., an exclusive buffer content lock is + * continuously held from this point until the tuple delete is visible). + */ + CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer)); + + /* replace cid with a combo CID if necessary */ + HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); + + /* + * Compute replica identity tuple before entering the critical section so + * we don't PANIC upon a memory allocation failure. + */ + old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied); + + /* + * If this is the first possibly-multixact-able operation in the current + * transaction, set my per-backend OldestMemberMXactId setting. We can be + * certain that the transaction will never become a member of any older + * MultiXactIds than that. (We have to do this even if we end up just + * using our own TransactionId below, since some other backend could + * incorporate our XID into a MultiXact immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data), + tp.t_data->t_infomask, tp.t_data->t_infomask2, + xid, LockTupleExclusive, true, + &new_xmax, &new_infomask, &new_infomask2); + + START_CRIT_SECTION(); + + /* + * If this transaction commits, the tuple will become DEAD sooner or + * later. Set flag that this page is a candidate for pruning once our xid + * falls below the OldestXmin horizon. If the transaction finally aborts, + * the subsequent page pruning will be a no-op and the hint will be + * cleared. 
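+ * (The hint is just pd_prune_xid in the page header, so setting it here
+ * is cheap.)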
+ */ + PageSetPrunable(page, xid); + + if (PageIsAllVisible(page)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + tdeheap_visibilitymap_clear(relation, BufferGetBlockNumber(buffer), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + + /* store transaction information of xact deleting the tuple */ + tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + tp.t_data->t_infomask |= new_infomask; + tp.t_data->t_infomask2 |= new_infomask2; + HeapTupleHeaderClearHotUpdated(tp.t_data); + HeapTupleHeaderSetXmax(tp.t_data, new_xmax); + HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); + /* Make sure there is no forward chain link in t_ctid */ + tp.t_data->t_ctid = tp.t_self; + + /* Signal that this is actually a move into another partition */ + if (changingPart) + HeapTupleHeaderSetMovedPartitions(tp.t_data); + + MarkBufferDirty(buffer); + + /* + * XLOG stuff + * + * NB: tdeheap_abort_speculative() uses the same xlog record and replay + * routines. + */ + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_delete xlrec; + xl_tdeheap_header xlhdr; + XLogRecPtr recptr; + + /* + * For logical decode we need combo CIDs to properly decode the + * catalog + */ + if (RelationIsAccessibleInLogicalDecoding(relation)) + log_tdeheap_new_cid(relation, &tp); + + xlrec.flags = 0; + if (all_visible_cleared) + xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED; + if (changingPart) + xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE; + xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, + tp.t_data->t_infomask2); + xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); + xlrec.xmax = new_xmax; + + if (old_key_tuple != NULL) + { + if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE; + else + xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY; + } + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* + * Log replica identity of the deleted tuple if there is one + */ + if (old_key_tuple != NULL) + { + xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; + xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; + + XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterData((char *) old_key_tuple->t_data + + SizeofHeapTupleHeader, + old_key_tuple->t_len + - SizeofHeapTupleHeader); + } + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* + * If the tuple has toasted out-of-line attributes, we need to delete + * those items too. We have to do this before releasing the buffer + * because we need to look at the contents of the tuple, but it's OK to + * release the content lock on the buffer first. + */ + if (relation->rd_rel->relkind != RELKIND_RELATION && + relation->rd_rel->relkind != RELKIND_MATVIEW) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(&tp)); + } + else if (HeapTupleHasExternal(&tp)) + tdeheap_toast_delete(relation, &tp, false); + + /* + * Mark tuple for invalidation from system caches at next command + * boundary. We have to do this before releasing the buffer because we + * need to look at the contents of the tuple. 
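+ * (For plain user tables this ends up being a quick no-op; only system
+ * catalog tuples generate sinval messages.)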
+ */ + CacheInvalidateHeapTuple(relation, &tp, NULL); + + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + + /* + * Release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); + + pgstat_count_tdeheap_delete(relation); + + if (old_key_tuple != NULL && old_key_copied) + tdeheap_freetuple(old_key_tuple); + + return TM_Ok; +} + +/* + * simple_tdeheap_delete - delete a tuple + * + * This routine may be used to delete a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). + */ +void +simple_tdeheap_delete(Relation relation, ItemPointer tid) +{ + TM_Result result; + TM_FailureData tmfd; + + result = tdeheap_delete(relation, tid, + GetCurrentCommandId(true), InvalidSnapshot, + true /* wait for commit */ , + &tmfd, false /* changingPart */ ); + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized tdeheap_delete status: %u", result); + break; + } +} + +/* + * tdeheap_update - replace a tuple + * + * See table_tuple_update() for an explanation of the parameters, except that + * this routine directly takes a tuple rather than a slot. + * + * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last + * only for TM_SelfModified, since we cannot obtain cmax from a combo CID + * generated by another transaction). + */ +TM_Result +tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, + CommandId cid, Snapshot crosscheck, bool wait, + TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes) +{ + TM_Result result; + TransactionId xid = GetCurrentTransactionId(); + Bitmapset *hot_attrs; + Bitmapset *sum_attrs; + Bitmapset *key_attrs; + Bitmapset *id_attrs; + Bitmapset *interesting_attrs; + Bitmapset *modified_attrs; + ItemId lp; + HeapTupleData oldtup; + HeapTuple heaptup; + HeapTuple old_key_tuple = NULL; + bool old_key_copied = false; + Page page; + BlockNumber block; + MultiXactStatus mxact_status; + Buffer buffer, + newbuf, + vmbuffer = InvalidBuffer, + vmbuffer_new = InvalidBuffer; + bool need_toast; + Size newtupsize, + pagefree; + bool have_tuple_lock = false; + bool iscombo; + bool use_hot_update = false; + bool summarized_update = false; + bool key_intact; + bool all_visible_cleared = false; + bool all_visible_cleared_new = false; + bool checked_lockers; + bool locker_remains; + bool id_has_external = false; + TransactionId xmax_new_tuple, + xmax_old_tuple; + uint16 infomask_old_tuple, + infomask2_old_tuple, + infomask_new_tuple, + infomask2_new_tuple; + + Assert(ItemPointerIsValid(otid)); + + /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ + Assert(HeapTupleHeaderGetNatts(newtup->t_data) <= + RelationGetNumberOfAttributes(relation)); + + /* + * Forbid this during a parallel operation, lest it allocate a combo CID. + * Other workers might need that combo CID for visibility checks, and we + * have no provision for broadcasting it to them. 
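+ * (A combo CID is only required when the tuple being modified was created
+ * earlier in the same transaction, so that cmin and cmax have to share
+ * the tuple header's single command-id field.)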
+ */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot update tuples during a parallel operation"))); + + /* + * Fetch the list of attributes to be checked for various operations. + * + * For HOT considerations, this is wasted effort if we fail to update or + * have to put the new tuple on a different page. But we must compute the + * list before obtaining buffer lock --- in the worst case, if we are + * doing an update on one of the relevant system catalogs, we could + * deadlock if we try to fetch the list later. In any case, the relcache + * caches the data so this is usually pretty cheap. + * + * We also need columns used by the replica identity and columns that are + * considered the "key" of rows in the table. + * + * Note that we get copies of each bitmap, so we need not worry about + * relcache flush happening midway through. + */ + hot_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_HOT_BLOCKING); + sum_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_SUMMARIZED); + key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); + id_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + interesting_attrs = NULL; + interesting_attrs = bms_add_members(interesting_attrs, hot_attrs); + interesting_attrs = bms_add_members(interesting_attrs, sum_attrs); + interesting_attrs = bms_add_members(interesting_attrs, key_attrs); + interesting_attrs = bms_add_members(interesting_attrs, id_attrs); + + block = ItemPointerGetBlockNumber(otid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. + */ + if (PageIsAllVisible(page)) + tdeheap_visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid)); + Assert(ItemIdIsNormal(lp)); + + /* + * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work + * properly. + */ + oldtup.t_tableOid = RelationGetRelid(relation); + oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + oldtup.t_len = ItemIdGetLength(lp); + oldtup.t_self = *otid; + + /* the new tuple is ready, except for this: */ + newtup->t_tableOid = RelationGetRelid(relation); + + /* + * Determine columns modified by the update. Additionally, identify + * whether any of the unmodified replica identity key attributes in the + * old tuple is externally stored or not. This is required because for + * such attributes the flattened value won't be WAL logged as part of the + * new tuple so we must include it as part of the old_key_tuple. See + * ExtractReplicaIdentity. + */ + modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs, + id_attrs, &oldtup, + newtup, &id_has_external); + + /* + * If we're not updating any "key" column, we can grab a weaker lock type. + * This allows for more concurrency when we are running simultaneously + * with foreign key checks. + * + * Note that if a column gets detoasted while executing the update, but + * the value ends up being the same, this test will fail and we will use + * the stronger lock. 
This is acceptable; the important case to optimize + * is updates that don't manipulate key columns, not those that + * serendipitously arrive at the same key values. + */ + if (!bms_overlap(modified_attrs, key_attrs)) + { + *lockmode = LockTupleNoKeyExclusive; + mxact_status = MultiXactStatusNoKeyUpdate; + key_intact = true; + + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId + * setting. We can be certain that the transaction will never become a + * member of any older MultiXactIds than that. (We have to do this + * even if we end up just using our own TransactionId below, since + * some other backend could incorporate our XID into a MultiXact + * immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + } + else + { + *lockmode = LockTupleExclusive; + mxact_status = MultiXactStatusUpdate; + key_intact = false; + } + + /* + * Note: beyond this point, use oldtup not otid to refer to old tuple. + * otid may very well point at newtup->t_self, which we will overwrite + * with the new tuple's location, so there's great risk of confusion if we + * use otid anymore. + */ + +l2: + checked_lockers = false; + locker_remains = false; + result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); + + /* see below about the "no wait" case */ + Assert(result != TM_BeingModified || wait); + + if (result == TM_Invisible) + { + UnlockReleaseBuffer(buffer); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to update invisible tuple"))); + } + else if (result == TM_BeingModified && wait) + { + TransactionId xwait; + uint16 infomask; + bool can_continue = false; + + /* + * XXX note that we don't consider the "no wait" case here. This + * isn't a problem currently because no caller uses that case, but it + * should be fixed if such a caller is introduced. It wasn't a + * problem previously because this code would always wait, but now + * that some tuple locks do not conflict with one of the lock modes we + * use, it is possible that this case is interesting to handle + * specially. + * + * This may cause failures with third-party code that calls + * tdeheap_update directly. + */ + + /* must copy state data before unlocking buffer */ + xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data); + infomask = oldtup.t_data->t_infomask; + + /* + * Now we have to do something about the existing locker. If it's a + * multi, sleep on it; we might be awakened before it is completely + * gone (or even not sleep at all in some cases); we need to preserve + * it as locker, unless it is gone completely. + * + * If it's not a multi, we need to check for sleeping conditions + * before actually going to sleep. If the update doesn't conflict + * with the locks, we just continue without sleeping (but making sure + * it is preserved). + * + * Before sleeping, we need to acquire tuple lock to establish our + * priority for the tuple (see tdeheap_lock_tuple). LockTuple will + * release us when we are next-in-line for the tuple. Note we must + * not acquire the tuple lock until we're sure we're going to sleep; + * otherwise we're open for race conditions with other transactions + * holding the tuple lock which sleep on us. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while rechecking + * tuple state. 
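+ * (tdeheap_delete and tdeheap_lock_tuple take the same tuple lock, so
+ * waiters of all three operations queue with each other.)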
+ */
+ if (infomask & HEAP_XMAX_IS_MULTI)
+ {
+ TransactionId update_xact;
+ int remain;
+ bool current_is_member = false;
+
+ if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
+ *lockmode, &current_is_member))
+ {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Acquire the lock, if necessary (but skip it when we're
+ * requesting a lock and already have one; avoids deadlock).
+ */
+ if (!current_is_member)
+ tdeheap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
+ LockWaitBlock, &have_tuple_lock);
+
+ /* wait for multixact */
+ MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
+ relation, &oldtup.t_self, XLTW_Update,
+ &remain);
+ checked_lockers = true;
+ locker_remains = remain != 0;
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * If xwait had just locked the tuple then some other xact
+ * could update this tuple before we get to this point. Check
+ * for xmax change, and start over if so.
+ */
+ if (xmax_infomask_changed(oldtup.t_data->t_infomask,
+ infomask) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
+ xwait))
+ goto l2;
+ }
+
+ /*
+ * Note that the multixact may not be done by now. It could have
+ * surviving members; our own xact or other subxacts of this
+ * backend, and also any other concurrent transaction that locked
+ * the tuple with LockTupleKeyShare if we only got
+ * LockTupleNoKeyExclusive. If this is the case, we have to be
+ * careful to mark the updated tuple with the surviving members in
+ * Xmax.
+ *
+ * Note that there could have been another update in the
+ * MultiXact. In that case, we need to check whether it committed
+ * or aborted. If it aborted we are safe to update it again;
+ * otherwise there is an update conflict, and we have to return
+ * TableTuple{Deleted, Updated} below.
+ *
+ * In the LockTupleExclusive case, we still need to preserve the
+ * surviving members: those would include the tuple locks we had
+ * before this one, which are important to keep in case this
+ * subxact aborts.
+ */
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
+ update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
+ else
+ update_xact = InvalidTransactionId;
+
+ /*
+ * There was no UPDATE in the MultiXact; or it aborted. No
+ * TransactionIdIsInProgress() call needed here, since we called
+ * MultiXactIdWait() above.
+ */
+ if (!TransactionIdIsValid(update_xact) ||
+ TransactionIdDidAbort(update_xact))
+ can_continue = true;
+ }
+ else if (TransactionIdIsCurrentTransactionId(xwait))
+ {
+ /*
+ * The only locker is ourselves; we can avoid grabbing the tuple
+ * lock here, but must preserve our locking information.
+ */
+ checked_lockers = true;
+ locker_remains = true;
+ can_continue = true;
+ }
+ else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
+ {
+ /*
+ * If it's just a key-share locker, and we're not changing the key
+ * columns, we don't need to wait for it to end; but we need to
+ * preserve it as locker.
+ */
+ checked_lockers = true;
+ locker_remains = true;
+ can_continue = true;
+ }
+ else
+ {
+ /*
+ * Wait for regular transaction to end; but first, acquire tuple
+ * lock.
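+ * The buffer lock must be released before sleeping, or we would block
+ * every other backend that needs this page.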
+ */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + tdeheap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode, + LockWaitBlock, &have_tuple_lock); + XactLockTableWait(xwait, relation, &oldtup.t_self, + XLTW_Update); + checked_lockers = true; + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point. + * Check for xmax change, and start over if so. + */ + if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) || + !TransactionIdEquals(xwait, + HeapTupleHeaderGetRawXmax(oldtup.t_data))) + goto l2; + + /* Otherwise check if it committed or aborted */ + UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); + if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) + can_continue = true; + } + + if (can_continue) + result = TM_Ok; + else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)) + result = TM_Updated; + else + result = TM_Deleted; + } + + /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */ + if (result != TM_Ok) + { + Assert(result == TM_SelfModified || + result == TM_Updated || + result == TM_Deleted || + result == TM_BeingModified); + Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); + Assert(result != TM_Updated || + !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)); + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer)) + result = TM_Updated; + } + + if (result != TM_Ok) + { + tmfd->ctid = oldtup.t_data->t_ctid; + tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); + if (result == TM_SelfModified) + tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); + else + tmfd->cmax = InvalidCommandId; + UnlockReleaseBuffer(buffer); + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + *update_indexes = TU_None; + + bms_free(hot_attrs); + bms_free(sum_attrs); + bms_free(key_attrs); + bms_free(id_attrs); + bms_free(modified_attrs); + bms_free(interesting_attrs); + return result; + } + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, or during some + * subsequent window during which we had it unlocked, we'll have to unlock + * and re-lock, to avoid holding the buffer lock across an I/O. That's a + * bit unfortunate, especially since we'll now have to recheck whether the + * tuple has been locked or updated under us, but hopefully it won't + * happen very often. + */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + tdeheap_visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + goto l2; + } + + /* Fill in transaction status data */ + + /* + * If the tuple we're updating is locked, we need to preserve the locking + * info in the old tuple's Xmax. Prepare a new Xmax value for this. + */ + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), + oldtup.t_data->t_infomask, + oldtup.t_data->t_infomask2, + xid, *lockmode, true, + &xmax_old_tuple, &infomask_old_tuple, + &infomask2_old_tuple); + + /* + * And also prepare an Xmax value for the new copy of the tuple. 
If there + * was no xmax previously, or there was one but all lockers are now gone, + * then use InvalidTransactionId; otherwise, get the xmax from the old + * tuple. (In rare cases that might also be InvalidTransactionId and yet + * not have the HEAP_XMAX_INVALID bit set; that's fine.) + */ + if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) || + (checked_lockers && !locker_remains)) + xmax_new_tuple = InvalidTransactionId; + else + xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data); + + if (!TransactionIdIsValid(xmax_new_tuple)) + { + infomask_new_tuple = HEAP_XMAX_INVALID; + infomask2_new_tuple = 0; + } + else + { + /* + * If we found a valid Xmax for the new tuple, then the infomask bits + * to use on the new tuple depend on what was there on the old one. + * Note that since we're doing an update, the only possibility is that + * the lockers had FOR KEY SHARE lock. + */ + if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) + { + GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple, + &infomask2_new_tuple); + } + else + { + infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY; + infomask2_new_tuple = 0; + } + } + + /* + * Prepare the new tuple with the appropriate initial values of Xmin and + * Xmax, as well as initial infomask bits as computed above. + */ + newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); + HeapTupleHeaderSetXmin(newtup->t_data, xid); + HeapTupleHeaderSetCmin(newtup->t_data, cid); + newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple; + newtup->t_data->t_infomask2 |= infomask2_new_tuple; + HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple); + + /* + * Replace cid with a combo CID if necessary. Note that we already put + * the plain cid into the new tuple. + */ + HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo); + + /* + * If the toaster needs to be activated, OR if the new tuple will not fit + * on the same page as the old, then we need to release the content lock + * (but not the pin!) on the old tuple's buffer while we are off doing + * TOAST and/or table-file-extension work. We must mark the old tuple to + * show that it's locked, else other processes may try to update it + * themselves. + * + * We need to invoke the toaster if there are already any out-of-line + * toasted values present, or if the new tuple is over-threshold. + */ + if (relation->rd_rel->relkind != RELKIND_RELATION && + relation->rd_rel->relkind != RELKIND_MATVIEW) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(&oldtup)); + Assert(!HeapTupleHasExternal(newtup)); + need_toast = false; + } + else + need_toast = (HeapTupleHasExternal(&oldtup) || + HeapTupleHasExternal(newtup) || + newtup->t_len > TOAST_TUPLE_THRESHOLD); + + pagefree = PageGetHeapFreeSpace(page); + + newtupsize = MAXALIGN(newtup->t_len); + + if (need_toast || newtupsize > pagefree) + { + TransactionId xmax_lock_old_tuple; + uint16 infomask_lock_old_tuple, + infomask2_lock_old_tuple; + bool cleared_all_frozen = false; + + /* + * To prevent concurrent sessions from updating the tuple, we have to + * temporarily mark it locked, while we release the page-level lock. + * + * To satisfy the rule that any xid potentially appearing in a buffer + * written out to disk, we unfortunately have to WAL log this + * temporary modification. We can reuse xl_tdeheap_lock for this + * purpose. 
If we crash/error before following through with the + * actual update, xmax will be of an aborted transaction, allowing + * other sessions to proceed. + */ + + /* + * Compute xmax / infomask appropriate for locking the tuple. This has + * to be done separately from the combo that's going to be used for + * updating, because the potentially created multixact would otherwise + * be wrong. + */ + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), + oldtup.t_data->t_infomask, + oldtup.t_data->t_infomask2, + xid, *lockmode, false, + &xmax_lock_old_tuple, &infomask_lock_old_tuple, + &infomask2_lock_old_tuple); + + Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple)); + + START_CRIT_SECTION(); + + /* Clear obsolete visibility flags ... */ + oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + HeapTupleClearHotUpdated(&oldtup); + /* ... and store info about transaction updating this tuple */ + Assert(TransactionIdIsValid(xmax_lock_old_tuple)); + HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple); + oldtup.t_data->t_infomask |= infomask_lock_old_tuple; + oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple; + HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); + + /* temporarily make it look not-updated, but locked */ + oldtup.t_data->t_ctid = oldtup.t_self; + + /* + * Clear all-frozen bit on visibility map if needed. We could + * immediately reset ALL_VISIBLE, but given that the WAL logging + * overhead would be unchanged, that doesn't seem necessarily + * worthwhile. + */ + if (PageIsAllVisible(page) && + tdeheap_visibilitymap_clear(relation, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_lock xlrec; + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self); + xlrec.xmax = xmax_lock_old_tuple; + xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask, + oldtup.t_data->t_infomask2); + xlrec.flags = + cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * Let the toaster do its thing, if needed. + * + * Note: below this point, heaptup is the data we actually intend to + * store into the relation; newtup is the caller's original untoasted + * data. + */ + if (need_toast) + { + /* Note we always use WAL and FSM during updates */ + heaptup = tdeheap_toast_insert_or_update(relation, newtup, &oldtup, 0); + newtupsize = MAXALIGN(heaptup->t_len); + } + else + heaptup = newtup; + + /* + * Now, do we need a new page for the tuple, or not? This is a bit + * tricky since someone else could have added tuples to the page while + * we weren't looking. We have to recheck the available space after + * reacquiring the buffer lock. But don't bother to do that if the + * former amount of free space is still not enough; it's unlikely + * there's more free now than before. + * + * What's more, if we need to get a new page, we will need to acquire + * buffer locks on both old and new pages. To avoid deadlock against + * some other backend trying to get the same two locks in the other + * order, we must be consistent about the order we get the locks in. 
+ * We use the rule "lock the lower-numbered page of the relation + * first". To implement this, we must do tdeheap_RelationGetBufferForTuple + * while not holding the lock on the old page, and we must rely on it + * to get the locks on both pages in the correct order. + * + * Another consideration is that we need visibility map page pin(s) if + * we will have to clear the all-visible flag on either page. If we + * call tdeheap_RelationGetBufferForTuple, we rely on it to acquire any such + * pins; but if we don't, we have to handle that here. Hence we need + * a loop. + */ + for (;;) + { + if (newtupsize > pagefree) + { + /* It doesn't fit, must use tdeheap_RelationGetBufferForTuple. */ + newbuf = tdeheap_RelationGetBufferForTuple(relation, heaptup->t_len, + buffer, 0, NULL, + &vmbuffer_new, &vmbuffer, + 0); + /* We're all done. */ + break; + } + /* Acquire VM page pin if needed and we don't have it. */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + tdeheap_visibilitymap_pin(relation, block, &vmbuffer); + /* Re-acquire the lock on the old tuple's page. */ + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + /* Re-check using the up-to-date free space */ + pagefree = PageGetHeapFreeSpace(page); + if (newtupsize > pagefree || + (vmbuffer == InvalidBuffer && PageIsAllVisible(page))) + { + /* + * Rats, it doesn't fit anymore, or somebody just now set the + * all-visible flag. We must now unlock and loop to avoid + * deadlock. Fortunately, this path should seldom be taken. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + { + /* We're all done. */ + newbuf = buffer; + break; + } + } + } + else + { + /* No TOAST work needed, and it'll fit on same page */ + newbuf = buffer; + heaptup = newtup; + } + + /* + * We're about to do the actual update -- check for conflict first, to + * avoid possibly having to roll back work we've just done. + * + * This is safe without a recheck as long as there is no possibility of + * another process scanning the pages between this check and the update + * being visible to the scan (i.e., exclusive buffer content lock(s) are + * continuously held from this point until the tuple update is visible). + * + * For the new tuple the only check needed is at the relation level, but + * since both tuples are in the same relation and the check for oldtup + * will include checking the relation level, there is no benefit to a + * separate check for the new tuple. + */ + CheckForSerializableConflictIn(relation, &oldtup.t_self, + BufferGetBlockNumber(buffer)); + + /* + * At this point newbuf and buffer are both pinned and locked, and newbuf + * has enough space for the new tuple. If they are the same buffer, only + * one pin is held. + */ + + if (newbuf == buffer) + { + /* + * Since the new tuple is going into the same page, we might be able + * to do a HOT update. Check if any of the index columns have been + * changed. + */ + if (!bms_overlap(modified_attrs, hot_attrs)) + { + use_hot_update = true; + + /* + * If none of the columns that are used in hot-blocking indexes + * were updated, we can apply HOT, but we do still need to check + * if we need to update the summarizing indexes, and update those + * indexes if the columns were updated, or we may fail to detect + * e.g. value bound changes in BRIN minmax indexes. 
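+ * (Summarizing indexes such as BRIN do not store per-tuple TIDs, which
+ * is why a HOT update cannot break them; at most their block-range
+ * summary has to be refreshed.)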
+ */ + if (bms_overlap(modified_attrs, sum_attrs)) + summarized_update = true; + } + } + else + { + /* Set a hint that the old page could use prune/defrag */ + PageSetFull(page); + } + + /* + * Compute replica identity tuple before entering the critical section so + * we don't PANIC upon a memory allocation failure. + * ExtractReplicaIdentity() will return NULL if nothing needs to be + * logged. Pass old key required as true only if the replica identity key + * columns are modified or it has external data. + */ + old_key_tuple = ExtractReplicaIdentity(relation, &oldtup, + bms_overlap(modified_attrs, id_attrs) || + id_has_external, + &old_key_copied); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + /* + * If this transaction commits, the old tuple will become DEAD sooner or + * later. Set flag that this page is a candidate for pruning once our xid + * falls below the OldestXmin horizon. If the transaction finally aborts, + * the subsequent page pruning will be a no-op and the hint will be + * cleared. + * + * XXX Should we set hint on newbuf as well? If the transaction aborts, + * there would be a prunable tuple in the newbuf; but for now we choose + * not to optimize for aborts. Note that tdeheap_xlog_update must be kept in + * sync if this decision changes. + */ + PageSetPrunable(page, xid); + + if (use_hot_update) + { + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + } + else + { + /* Make sure tuples are correctly marked as not-HOT */ + HeapTupleClearHotUpdated(&oldtup); + HeapTupleClearHeapOnly(heaptup); + HeapTupleClearHeapOnly(newtup); + } + + tdeheap_RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */ + + + /* Clear obsolete visibility flags, possibly set by ourselves above... */ + oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + /* ... and store info about transaction updating this tuple */ + Assert(TransactionIdIsValid(xmax_old_tuple)); + HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple); + oldtup.t_data->t_infomask |= infomask_old_tuple; + oldtup.t_data->t_infomask2 |= infomask2_old_tuple; + HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); + + /* record address of new tuple in t_ctid of old one */ + oldtup.t_data->t_ctid = heaptup->t_self; + + /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */ + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + tdeheap_visibilitymap_clear(relation, BufferGetBlockNumber(buffer), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf))) + { + all_visible_cleared_new = true; + PageClearAllVisible(BufferGetPage(newbuf)); + tdeheap_visibilitymap_clear(relation, BufferGetBlockNumber(newbuf), + vmbuffer_new, VISIBILITYMAP_VALID_BITS); + } + + if (newbuf != buffer) + MarkBufferDirty(newbuf); + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + XLogRecPtr recptr; + + /* + * For logical decoding we need combo CIDs to properly decode the + * catalog. 
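+ * (log_tdeheap_new_cid is only reached for catalog, or user-catalog,
+ * tables and only when wal_level = logical.)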
+ */ + if (RelationIsAccessibleInLogicalDecoding(relation)) + { + log_tdeheap_new_cid(relation, &oldtup); + log_tdeheap_new_cid(relation, heaptup); + } + + recptr = log_tdeheap_update(relation, buffer, + newbuf, &oldtup, heaptup, + old_key_tuple, + all_visible_cleared, + all_visible_cleared_new); + if (newbuf != buffer) + { + PageSetLSN(BufferGetPage(newbuf), recptr); + } + PageSetLSN(BufferGetPage(buffer), recptr); + } + + END_CRIT_SECTION(); + + if (newbuf != buffer) + LockBuffer(newbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * Mark old tuple for invalidation from system caches at next command + * boundary, and mark the new tuple for invalidation in case we abort. We + * have to do this before releasing the buffer because oldtup is in the + * buffer. (heaptup is all in local memory, but it's necessary to process + * both tuple versions in one call to inval.c so we can avoid redundant + * sinval messages.) + */ + CacheInvalidateHeapTuple(relation, &oldtup, heaptup); + + /* Now we can release the buffer(s) */ + if (newbuf != buffer) + ReleaseBuffer(newbuf); + ReleaseBuffer(buffer); + if (BufferIsValid(vmbuffer_new)) + ReleaseBuffer(vmbuffer_new); + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + + /* + * Release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); + + pgstat_count_tdeheap_update(relation, use_hot_update, newbuf != buffer); + + /* + * If heaptup is a private copy, release it. Don't forget to copy t_self + * back to the caller's image, too. + */ + if (heaptup != newtup) + { + newtup->t_self = heaptup->t_self; + tdeheap_freetuple(heaptup); + } + + /* + * If it is a HOT update, the update may still need to update summarized + * indexes, lest we fail to update those summaries and get incorrect + * results (for example, minmax bounds of the block may change with this + * update). + */ + if (use_hot_update) + { + if (summarized_update) + *update_indexes = TU_Summarizing; + else + *update_indexes = TU_None; + } + else + *update_indexes = TU_All; + + if (old_key_tuple != NULL && old_key_copied) + tdeheap_freetuple(old_key_tuple); + + bms_free(hot_attrs); + bms_free(sum_attrs); + bms_free(key_attrs); + bms_free(id_attrs); + bms_free(modified_attrs); + bms_free(interesting_attrs); + + return TM_Ok; +} + +/* + * Check if the specified attribute's values are the same. Subroutine for + * HeapDetermineColumnsInfo. + */ +static bool +tdeheap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, + bool isnull1, bool isnull2) +{ + Form_pg_attribute att; + + /* + * If one value is NULL and other is not, then they are certainly not + * equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. 
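+ * A false negative here only costs us a missed HOT opportunity or an
+ * unneeded index update; it can never cause wrong results.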
+ */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = TupleDescAttr(tupdesc, attrnum - 1); + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } +} + +/* + * Check which columns are being updated. + * + * Given an updated tuple, determine (and return into the output bitmapset), + * from those listed as interesting, the set of columns that changed. + * + * has_external indicates if any of the unmodified attributes (from those + * listed as interesting) of the old tuple is a member of external_cols and is + * stored externally. + */ +static Bitmapset * +HeapDetermineColumnsInfo(Relation relation, + Bitmapset *interesting_cols, + Bitmapset *external_cols, + HeapTuple oldtup, HeapTuple newtup, + bool *has_external) +{ + int attidx; + Bitmapset *modified = NULL; + TupleDesc tupdesc = RelationGetDescr(relation); + + attidx = -1; + while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0) + { + /* attidx is zero-based, attrnum is the normal attribute number */ + AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber; + Datum value1, + value2; + bool isnull1, + isnull2; + + /* + * If it's a whole-tuple reference, say "not equal". It's not really + * worth supporting this case, since it could only succeed after a + * no-op update, which is hardly a case worth optimizing for. + */ + if (attrnum == 0) + { + modified = bms_add_member(modified, attidx); + continue; + } + + /* + * Likewise, automatically say "not equal" for any system attribute + * other than tableOID; we cannot expect these to be consistent in a + * HOT chain, or even to be set correctly yet in the new tuple. + */ + if (attrnum < 0) + { + if (attrnum != TableOidAttributeNumber) + { + modified = bms_add_member(modified, attidx); + continue; + } + } + + /* + * Extract the corresponding values. XXX this is pretty inefficient + * if there are many indexed columns. Should we do a single + * tdeheap_deform_tuple call on each tuple, instead? But that doesn't + * work for system columns ... + */ + value1 = tdeheap_getattr(oldtup, attrnum, tupdesc, &isnull1); + value2 = tdeheap_getattr(newtup, attrnum, tupdesc, &isnull2); + + if (!tdeheap_attr_equals(tupdesc, attrnum, value1, + value2, isnull1, isnull2)) + { + modified = bms_add_member(modified, attidx); + continue; + } + + /* + * No need to check attributes that can't be stored externally. Note + * that system attributes can't be stored externally. + */ + if (attrnum < 0 || isnull1 || + TupleDescAttr(tupdesc, attrnum - 1)->attlen != -1) + continue; + + /* + * Check if the old tuple's attribute is stored externally and is a + * member of external_cols. + */ + if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) && + bms_is_member(attidx, external_cols)) + *has_external = true; + } + + return modified; +} + +/* + * simple_tdeheap_update - replace a tuple + * + * This routine may be used to update a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). 
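+ * On success, *update_indexes tells the caller which index entries still
+ * have to be inserted for the new tuple version.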
+ */ +void +simple_tdeheap_update(Relation relation, ItemPointer otid, HeapTuple tup, + TU_UpdateIndexes *update_indexes) +{ + TM_Result result; + TM_FailureData tmfd; + LockTupleMode lockmode; + + result = tdeheap_update(relation, otid, tup, + GetCurrentCommandId(true), InvalidSnapshot, + true /* wait for commit */ , + &tmfd, &lockmode, update_indexes); + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized tdeheap_update status: %u", result); + break; + } +} + + +/* + * Return the MultiXactStatus corresponding to the given tuple lock mode. + */ +static MultiXactStatus +get_mxact_status_for_lock(LockTupleMode mode, bool is_update) +{ + int retval; + + if (is_update) + retval = tupleLockExtraInfo[mode].updstatus; + else + retval = tupleLockExtraInfo[mode].lockstatus; + + if (retval == -1) + elog(ERROR, "invalid lock tuple mode %d/%s", mode, + is_update ? "true" : "false"); + + return (MultiXactStatus) retval; +} + +/* + * tdeheap_lock_tuple - lock a tuple in shared or exclusive mode + * + * Note that this acquires a buffer pin, which the caller must release. + * + * Input parameters: + * relation: relation containing tuple (caller must hold suitable lock) + * tid: TID of tuple to lock + * cid: current command ID (used for visibility test, and stored into + * tuple's cmax if lock is successful) + * mode: indicates if shared or exclusive tuple lock is desired + * wait_policy: what to do if tuple lock is not available + * follow_updates: if true, follow the update chain to also lock descendant + * tuples. + * + * Output parameters: + * *tuple: all fields filled in + * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *tmfd: filled in failure cases (see below) + * + * Function results are the same as the ones for table_tuple_lock(). + * + * In the failure cases other than TM_Invisible, the routine fills + * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, + * if necessary), and t_cmax (the last only for TM_SelfModified, + * since we cannot obtain cmax from a combo CID generated by another + * transaction). + * See comments for struct TM_FailureData for additional info. + * + * See README.tuplock for a thorough explanation of this mechanism. + */ +TM_Result +tdeheap_lock_tuple(Relation relation, HeapTuple tuple, + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_updates, + Buffer *buffer, TM_FailureData *tmfd) +{ + TM_Result result; + ItemPointer tid = &(tuple->t_self); + ItemId lp; + Page page; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + TransactionId xid, + xmax; + uint16 old_infomask, + new_infomask, + new_infomask2; + bool first_time = true; + bool skip_tuple_lock = false; + bool have_tuple_lock = false; + bool cleared_all_frozen = false; + + *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + block = ItemPointerGetBlockNumber(tid); + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. 
+ */ + if (PageIsAllVisible(BufferGetPage(*buffer))) + tdeheap_visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(*buffer); + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); + tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(relation); + +l3: + result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); + + if (result == TM_Invisible) + { + /* + * This is possible, but only when locking a tuple for ON CONFLICT + * UPDATE. We return this value here rather than throwing an error in + * order to give that case the opportunity to throw a more specific + * error. + */ + result = TM_Invisible; + goto out_locked; + } + else if (result == TM_BeingModified || + result == TM_Updated || + result == TM_Deleted) + { + TransactionId xwait; + uint16 infomask; + uint16 infomask2; + bool require_sleep; + ItemPointerData t_ctid; + + /* must copy state data before unlocking buffer */ + xwait = HeapTupleHeaderGetRawXmax(tuple->t_data); + infomask = tuple->t_data->t_infomask; + infomask2 = tuple->t_data->t_infomask2; + ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); + + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + + /* + * If any subtransaction of the current top transaction already holds + * a lock as strong as or stronger than what we're requesting, we + * effectively hold the desired lock already. We *must* succeed + * without trying to take the tuple lock, else we will deadlock + * against anyone wanting to acquire a stronger lock. + * + * Note we only do this the first time we loop on the HTSU result; + * there is no point in testing in subsequent passes, because + * evidently our own transaction cannot have acquired a new lock after + * the first time we checked. + */ + if (first_time) + { + first_time = false; + + if (infomask & HEAP_XMAX_IS_MULTI) + { + int i; + int nmembers; + MultiXactMember *members; + + /* + * We don't need to allow old multixacts here; if that had + * been the case, HeapTupleSatisfiesUpdate would have returned + * MayBeUpdated and we wouldn't be here. + */ + nmembers = + GetMultiXactIdMembers(xwait, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(infomask)); + + for (i = 0; i < nmembers; i++) + { + /* only consider members of our own transaction */ + if (!TransactionIdIsCurrentTransactionId(members[i].xid)) + continue; + + if (TUPLOCK_from_mxstatus(members[i].status) >= mode) + { + pfree(members); + result = TM_Ok; + goto out_unlocked; + } + else + { + /* + * Disable acquisition of the heavyweight tuple lock. + * Otherwise, when promoting a weaker lock, we might + * deadlock with another locker that has acquired the + * heavyweight tuple lock and is waiting for our + * transaction to finish. + * + * Note that in this case we still need to wait for + * the multixact if required, to avoid acquiring + * conflicting locks. 
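+ * (skip_tuple_lock remains set for the rest of this call, including any
+ * later passes through l3.)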
+ */ + skip_tuple_lock = true; + } + } + + if (members) + pfree(members); + } + else if (TransactionIdIsCurrentTransactionId(xwait)) + { + switch (mode) + { + case LockTupleKeyShare: + Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) || + HEAP_XMAX_IS_SHR_LOCKED(infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(infomask)); + result = TM_Ok; + goto out_unlocked; + case LockTupleShare: + if (HEAP_XMAX_IS_SHR_LOCKED(infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + result = TM_Ok; + goto out_unlocked; + } + break; + case LockTupleNoKeyExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + result = TM_Ok; + goto out_unlocked; + } + break; + case LockTupleExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) && + infomask2 & HEAP_KEYS_UPDATED) + { + result = TM_Ok; + goto out_unlocked; + } + break; + } + } + } + + /* + * Initially assume that we will have to wait for the locking + * transaction(s) to finish. We check various cases below in which + * this can be turned off. + */ + require_sleep = true; + if (mode == LockTupleKeyShare) + { + /* + * If we're requesting KeyShare, and there's no update present, we + * don't need to wait. Even if there is an update, we can still + * continue if the key hasn't been modified. + * + * However, if there are updates, we need to walk the update chain + * to mark future versions of the row as locked, too. That way, + * if somebody deletes that future version, we're protected + * against the key going away. This locking of future versions + * could block momentarily, if a concurrent transaction is + * deleting a key; or it could return a value to the effect that + * the transaction deleting the key has already committed. So we + * do this before re-locking the buffer; otherwise this would be + * prone to deadlocks. + * + * Note that the TID we're locking was grabbed before we unlocked + * the buffer. For it to change while we're not looking, the + * other properties we're testing for below after re-locking the + * buffer would also change, in which case we would restart this + * loop above. + */ + if (!(infomask2 & HEAP_KEYS_UPDATED)) + { + bool updated; + + updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask); + + /* + * If there are updates, follow the update chain; bail out if + * that cannot be done. + */ + if (follow_updates && updated) + { + TM_Result res; + + res = tdeheap_lock_updated_tuple(relation, tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != TM_Ok) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Make sure it's still an appropriate lock, else start over. + * Also, if it wasn't updated before we released the lock, but + * is updated now, we start over too; the reason is that we + * now need to follow the update chain to lock the new + * versions. + */ + if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) && + ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) || + !updated)) + goto l3; + + /* Things look okay, so we can skip sleeping */ + require_sleep = false; + + /* + * Note we allow Xmax to change here; other updaters/lockers + * could have modified it before we grabbed the buffer lock. + * However, this is not a problem, because with the recheck we + * just did we ensure that they still don't conflict with the + * lock we want. 
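+ * (KEY SHARE only conflicts with deletes and key-changing updates, and
+ * those always set HEAP_KEYS_UPDATED, which the recheck above looks at.)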
+ */ + } + } + else if (mode == LockTupleShare) + { + /* + * If we're requesting Share, we can similarly avoid sleeping if + * there's no update and no exclusive lock present. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && + !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Make sure it's still an appropriate lock, else start over. + * See above about allowing xmax to change. + */ + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask)) + goto l3; + require_sleep = false; + } + } + else if (mode == LockTupleNoKeyExclusive) + { + /* + * If we're requesting NoKeyExclusive, we might also be able to + * avoid sleeping; just ensure that there no conflicting lock + * already acquired. + */ + if (infomask & HEAP_XMAX_IS_MULTI) + { + if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask, + mode, NULL)) + { + /* + * No conflict, but if the xmax changed under us in the + * meantime, start over. + */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + + /* otherwise, we're good */ + require_sleep = false; + } + } + else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) + { + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* if the xmax changed in the meantime, start over */ + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + /* otherwise, we're good */ + require_sleep = false; + } + } + + /* + * As a check independent from those above, we can also avoid sleeping + * if the current transaction is the sole locker of the tuple. Note + * that the strength of the lock already held is irrelevant; this is + * not about recording the lock in Xmax (which will be done regardless + * of this optimization, below). Also, note that the cases where we + * hold a lock stronger than we are requesting are already handled + * above by not doing anything. + * + * Note we only deal with the non-multixact case here; MultiXactIdWait + * is well equipped to deal with this situation on its own. + */ + if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) && + TransactionIdIsCurrentTransactionId(xwait)) + { + /* ... but if the xmax changed in the meantime, start over */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask)); + require_sleep = false; + } + + /* + * Time to sleep on the other transaction/multixact, if necessary. + * + * If the other transaction is an update/delete that's already + * committed, then sleeping cannot possibly do any good: if we're + * required to sleep, get out to raise an error instead. + * + * By here, we either have already acquired the buffer exclusive lock, + * or we must wait for the locking transaction or multixact; so below + * we ensure that we grab buffer lock after the sleep. + */ + if (require_sleep && (result == TM_Updated || result == TM_Deleted)) + { + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + else if (require_sleep) + { + /* + * Acquire tuple lock to establish our priority for the tuple, or + * die trying. LockTuple will release us when we are next-in-line + * for the tuple. 
We must do this even if we are share-locking, + * but not if we already have a weaker lock on the tuple. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while + * rechecking tuple state. + */ + if (!skip_tuple_lock && + !tdeheap_acquire_tuplock(relation, tid, mode, wait_policy, + &have_tuple_lock)) + { + /* + * This can only happen if wait_policy is Skip and the lock + * couldn't be obtained. + */ + result = TM_WouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + + if (infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus status = get_mxact_status_for_lock(mode, false); + + /* We only ever lock tuples, never update them */ + if (status >= MultiXactStatusNoKeyUpdate) + elog(ERROR, "invalid lock mode in tdeheap_lock_tuple"); + + /* wait for multixact to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + MultiXactIdWait((MultiXactId) xwait, status, infomask, + relation, &tuple->t_self, XLTW_Lock, NULL); + break; + case LockWaitSkip: + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, + status, infomask, relation, + NULL)) + { + result = TM_WouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + break; + case LockWaitError: + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, + status, infomask, relation, + NULL)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + + break; + } + + /* + * Of course, the multixact might not be done here: if we're + * requesting a light lock mode, other transactions with light + * locks could still be alive, as well as locks owned by our + * own xact or other subxacts of this backend. We need to + * preserve the surviving MultiXact members. Note that it + * isn't absolutely necessary in the latter case, but doing so + * is simpler. + */ + } + else + { + /* wait for regular transaction to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + XactLockTableWait(xwait, relation, &tuple->t_self, + XLTW_Lock); + break; + case LockWaitSkip: + if (!ConditionalXactLockTableWait(xwait)) + { + result = TM_WouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + break; + case LockWaitError: + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + } + + /* if there are updates, follow the update chain */ + if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) + { + TM_Result res; + + res = tdeheap_lock_updated_tuple(relation, tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != TM_Ok) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point. + * Check for xmax change, and start over if so. 
+ */ + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + + if (!(infomask & HEAP_XMAX_IS_MULTI)) + { + /* + * Otherwise check if it committed or aborted. Note we cannot + * be here if the tuple was only locked by somebody who didn't + * conflict with us; that would have been handled above. So + * that transaction must necessarily be gone by now. But + * don't check for this in the multixact case, because some + * locker transactions might still be running. + */ + UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + } + } + + /* By here, we're certain that we hold buffer exclusive lock again */ + + /* + * We may lock if previous xmax aborted, or if it committed but only + * locked the tuple without updating it; or if we didn't have to wait + * at all for whatever reason. + */ + if (!require_sleep || + (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tuple->t_data)) + result = TM_Ok; + else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) + result = TM_Updated; + else + result = TM_Deleted; + } + +failed: + if (result != TM_Ok) + { + Assert(result == TM_SelfModified || result == TM_Updated || + result == TM_Deleted || result == TM_WouldBlock); + + /* + * When locking a tuple under LockWaitSkip semantics and we fail with + * TM_WouldBlock above, it's possible for concurrent transactions to + * release the lock and set HEAP_XMAX_INVALID in the meantime. So + * this assert is slightly different from the equivalent one in + * tdeheap_delete and tdeheap_update. + */ + Assert((result == TM_WouldBlock) || + !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); + Assert(result != TM_Updated || + !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)); + tmfd->ctid = tuple->t_data->t_ctid; + tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + if (result == TM_SelfModified) + tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); + else + tmfd->cmax = InvalidCommandId; + goto out_locked; + } + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, or during some + * subsequent window during which we had it unlocked, we'll have to unlock + * and re-lock, to avoid holding the buffer lock across I/O. That's a bit + * unfortunate, especially since we'll now have to recheck whether the + * tuple has been locked or updated under us, but hopefully it won't + * happen very often. + */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + tdeheap_visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto l3; + } + + xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); + old_infomask = tuple->t_data->t_infomask; + + /* + * If this is the first possibly-multixact-able operation in the current + * transaction, set my per-backend OldestMemberMXactId setting. We can be + * certain that the transaction will never become a member of any older + * MultiXactIds than that. (We have to do this even if we end up just + * using our own TransactionId below, since some other backend could + * incorporate our XID into a MultiXact immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + /* + * Compute the new xmax and infomask to store into the tuple. 
Note we do + * not modify the tuple just yet, because that would leave it in the wrong + * state if multixact.c elogs. + */ + compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2, + GetCurrentTransactionId(), mode, false, + &xid, &new_infomask, &new_infomask2); + + START_CRIT_SECTION(); + + /* + * Store transaction information of xact locking the tuple. + * + * Note: Cmax is meaningless in this context, so don't set it; this avoids + * possibly generating a useless combo CID. Moreover, if we're locking a + * previously updated tuple, it's important to preserve the Cmax. + * + * Also reset the HOT UPDATE bit, but only if there's no update; otherwise + * we would break the HOT chain. + */ + tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS; + tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + tuple->t_data->t_infomask |= new_infomask; + tuple->t_data->t_infomask2 |= new_infomask2; + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + HeapTupleHeaderClearHotUpdated(tuple->t_data); + HeapTupleHeaderSetXmax(tuple->t_data, xid); + + /* + * Make sure there is no forward chain link in t_ctid. Note that in the + * cases where the tuple has been updated, we must not overwrite t_ctid, + * because it was set by the updater. Moreover, if the tuple has been + * updated, we need to follow the update chain to lock the new versions of + * the tuple as well. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + tuple->t_data->t_ctid = *tid; + + /* Clear only the all-frozen bit on visibility map if needed */ + if (PageIsAllVisible(page) && + tdeheap_visibilitymap_clear(relation, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + + MarkBufferDirty(*buffer); + + /* + * XLOG stuff. You might think that we don't need an XLOG record because + * there is no state change worth restoring after a crash. You would be + * wrong however: we have just written either a TransactionId or a + * MultiXactId that may never have been seen on disk before, and we need + * to make sure that there are XLOG entries covering those ID numbers. + * Else the same IDs might be re-used after a crash, which would be + * disastrous if this page made it to disk before the crash. Essentially + * we have to enforce the WAL log-before-data rule even in this case. + * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG + * entries for everything anyway.) + */ + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_lock xlrec; + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); + xlrec.xmax = xid; + xlrec.infobits_set = compute_infobits(new_infomask, + tuple->t_data->t_infomask2); + xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); + + /* we don't decode row locks atm, so no need to log the origin */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + result = TM_Ok; + +out_locked: + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + +out_unlocked: + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + + /* + * Don't update the visibility map here. Locking a tuple doesn't change + * visibility info. + */ + + /* + * Now that we have successfully marked the tuple as locked, we can + * release the lmgr tuple lock, if we had it. 
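+ *
+ * (The lock itself is recorded in xmax and the infomask at this
+ * point; the heavyweight lock only served to establish our place
+ * in the queue of waiters.)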
+ */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, tid, mode); + + return result; +} + +/* + * Acquire heavyweight lock on the given tuple, in preparation for acquiring + * its normal, Xmax-based tuple lock. + * + * have_tuple_lock is an input and output parameter: on input, it indicates + * whether the lock has previously been acquired (and this function does + * nothing in that case). If this function returns success, have_tuple_lock + * has been flipped to true. + * + * Returns false if it was unable to obtain the lock; this can only happen if + * wait_policy is Skip. + */ +static bool +tdeheap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool *have_tuple_lock) +{ + if (*have_tuple_lock) + return true; + + switch (wait_policy) + { + case LockWaitBlock: + LockTupleTuplock(relation, tid, mode); + break; + + case LockWaitSkip: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + return false; + break; + + case LockWaitError: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + *have_tuple_lock = true; + + return true; +} + +/* + * Given an original set of Xmax and infomask, and a transaction (identified by + * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and + * corresponding infomasks to use on the tuple. + * + * Note that this might have side effects such as creating a new MultiXactId. + * + * Most callers will have called HeapTupleSatisfiesUpdate before this function; + * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId + * but it was not running anymore. There is a race condition, which is that the + * MultiXactId may have finished since then, but that uncommon case is handled + * either here, or within MultiXactIdExpand. + * + * There is a similar race condition possible when the old xmax was a regular + * TransactionId. We test TransactionIdIsInProgress again just to narrow the + * window, but it's still possible to end up creating an unnecessary + * MultiXactId. Fortunately this is harmless. + */ +static void +compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, + uint16 old_infomask2, TransactionId add_to_xmax, + LockTupleMode mode, bool is_update, + TransactionId *result_xmax, uint16 *result_infomask, + uint16 *result_infomask2) +{ + TransactionId new_xmax; + uint16 new_infomask, + new_infomask2; + + Assert(TransactionIdIsCurrentTransactionId(add_to_xmax)); + +l5: + new_infomask = 0; + new_infomask2 = 0; + if (old_infomask & HEAP_XMAX_INVALID) + { + /* + * No previous locker; we just insert our own TransactionId. + * + * Note that it's critical that this case be the first one checked, + * because there are several blocks below that come back to this one + * to implement certain optimizations; old_infomask might contain + * other dirty bits in those cases, but we don't really care. 
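+ *
+ * As a simple illustration, a LockTupleShare request against a
+ * tuple with an invalid xmax lands here and just stores
+ * add_to_xmax with HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_SHR_LOCK set.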
+ */ + if (is_update) + { + new_xmax = add_to_xmax; + if (mode == LockTupleExclusive) + new_infomask2 |= HEAP_KEYS_UPDATED; + } + else + { + new_infomask |= HEAP_XMAX_LOCK_ONLY; + switch (mode) + { + case LockTupleKeyShare: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_KEYSHR_LOCK; + break; + case LockTupleShare: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_SHR_LOCK; + break; + case LockTupleNoKeyExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + case LockTupleExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + new_infomask2 |= HEAP_KEYS_UPDATED; + break; + default: + new_xmax = InvalidTransactionId; /* silence compiler */ + elog(ERROR, "invalid lock mode"); + } + } + } + else if (old_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus new_status; + + /* + * Currently we don't allow XMAX_COMMITTED to be set for multis, so + * cross-check. + */ + Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); + + /* + * A multixact together with LOCK_ONLY set but neither lock bit set + * (i.e. a pg_upgraded share locked tuple) cannot possibly be running + * anymore. This check is critical for databases upgraded by + * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume + * that such multis are never passed. + */ + if (HEAP_LOCKED_UPGRADED(old_infomask)) + { + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + /* + * If the XMAX is already a MultiXactId, then we need to expand it to + * include add_to_xmax; but if all the members were lockers and are + * all gone, we can do away with the IS_MULTI bit and just set + * add_to_xmax as the only locker/updater. If all lockers are gone + * and we have an updater that aborted, we can also do without a + * multi. + * + * The cost of doing GetMultiXactIdMembers would be paid by + * MultiXactIdExpand if we weren't to do this, so this check is not + * incurring extra work anyhow. + */ + if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) || + !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax, + old_infomask))) + { + /* + * Reset these bits and restart; otherwise fall through to + * create a new multi below. + */ + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + } + + new_status = get_mxact_status_for_lock(mode, is_update); + + new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax, + new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (old_infomask & HEAP_XMAX_COMMITTED) + { + /* + * It's a committed update, so we need to preserve him as updater of + * the tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + + new_status = get_mxact_status_for_lock(mode, is_update); + + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (TransactionIdIsInProgress(xmax)) + { + /* + * If the XMAX is a valid, in-progress TransactionId, then we need to + * create a new MultiXactId that includes both the old locker or + * updater and our own TransactionId. 
+ */ + MultiXactStatus new_status; + MultiXactStatus old_status; + LockTupleMode old_mode; + + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForShare; + else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) + { + if (old_infomask2 & HEAP_KEYS_UPDATED) + old_status = MultiXactStatusForUpdate; + else + old_status = MultiXactStatusForNoKeyUpdate; + } + else + { + /* + * LOCK_ONLY can be present alone only when a page has been + * upgraded by pg_upgrade. But in that case, + * TransactionIdIsInProgress() should have returned false. We + * assume it's no longer locked in this case. + */ + elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); + old_infomask |= HEAP_XMAX_INVALID; + old_infomask &= ~HEAP_XMAX_LOCK_ONLY; + goto l5; + } + } + else + { + /* it's an update, but which kind? */ + if (old_infomask2 & HEAP_KEYS_UPDATED) + old_status = MultiXactStatusUpdate; + else + old_status = MultiXactStatusNoKeyUpdate; + } + + old_mode = TUPLOCK_from_mxstatus(old_status); + + /* + * If the lock to be acquired is for the same TransactionId as the + * existing lock, there's an optimization possible: consider only the + * strongest of both locks as the only one present, and restart. + */ + if (xmax == add_to_xmax) + { + /* + * Note that it's not possible for the original tuple to be + * updated: we wouldn't be here because the tuple would have been + * invisible and we wouldn't try to update it. As a subtlety, + * this code can also run when traversing an update chain to lock + * future versions of a tuple. But we wouldn't be here either, + * because the add_to_xmax would be different from the original + * updater. + */ + Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); + + /* acquire the strongest of both */ + if (mode < old_mode) + mode = old_mode; + /* mustn't touch is_update */ + + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + /* otherwise, just fall back to creating a new multixact */ + new_status = get_mxact_status_for_lock(mode, is_update); + new_xmax = MultiXactIdCreate(xmax, old_status, + add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && + TransactionIdDidCommit(xmax)) + { + /* + * It's a committed update, so we gotta preserve him as updater of the + * tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + + new_status = get_mxact_status_for_lock(mode, is_update); + + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else + { + /* + * Can get here iff the locking/updating transaction was running when + * the infomask was extracted from the tuple, but finished before + * TransactionIdIsInProgress got to run. Deal with it as if there was + * no locker at all in the first place. + */ + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + *result_infomask = new_infomask; + *result_infomask2 = new_infomask2; + *result_xmax = new_xmax; +} + +/* + * Subroutine for tdeheap_lock_updated_tuple_rec. 
+ * + * Given a hypothetical multixact status held by the transaction identified + * with the given xid, does the current transaction need to wait, fail, or can + * it continue if it wanted to acquire a lock of the given mode? "needwait" + * is set to true if waiting is necessary; if it can continue, then TM_Ok is + * returned. If the lock is already held by the current transaction, return + * TM_SelfModified. In case of a conflict with another transaction, a + * different HeapTupleSatisfiesUpdate return code is returned. + * + * The held status is said to be hypothetical because it might correspond to a + * lock held by a single Xid, i.e. not a real MultiXactId; we express it this + * way for simplicity of API. + */ +static TM_Result +test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, + LockTupleMode mode, HeapTuple tup, + bool *needwait) +{ + MultiXactStatus wantedstatus; + + *needwait = false; + wantedstatus = get_mxact_status_for_lock(mode, false); + + /* + * Note: we *must* check TransactionIdIsInProgress before + * TransactionIdDidAbort/Commit; see comment at top of pg_tdeam_visibility.c + * for an explanation. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + { + /* + * The tuple has already been locked by our own transaction. This is + * very rare but can happen if multiple transactions are trying to + * lock an ancient version of the same tuple. + */ + return TM_SelfModified; + } + else if (TransactionIdIsInProgress(xid)) + { + /* + * If the locking transaction is running, what we do depends on + * whether the lock modes conflict: if they do, then we must wait for + * it to finish; otherwise we can fall through to lock this tuple + * version without waiting. + */ + if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), + LOCKMODE_from_mxstatus(wantedstatus))) + { + *needwait = true; + } + + /* + * If we set needwait above, then this value doesn't matter; + * otherwise, this value signals to caller that it's okay to proceed. + */ + return TM_Ok; + } + else if (TransactionIdDidAbort(xid)) + return TM_Ok; + else if (TransactionIdDidCommit(xid)) + { + /* + * The other transaction committed. If it was only a locker, then the + * lock is completely gone now and we can return success; but if it + * was an update, then what we do depends on whether the two lock + * modes conflict. If they conflict, then we must report error to + * caller. But if they don't, we can fall through to allow the current + * transaction to lock the tuple. + * + * Note: the reason we worry about ISUPDATE here is because as soon as + * a transaction ends, all its locks are gone and meaningless, and + * thus we can ignore them; whereas its updates persist. In the + * TransactionIdIsInProgress case, above, we don't need to check + * because we know the lock is still "alive" and thus a conflict needs + * always be checked. + */ + if (!ISUPDATE_from_mxstatus(status)) + return TM_Ok; + + if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), + LOCKMODE_from_mxstatus(wantedstatus))) + { + /* bummer */ + if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid)) + return TM_Updated; + else + return TM_Deleted; + } + + return TM_Ok; + } + + /* Not in progress, not aborted, not committed -- must have crashed */ + return TM_Ok; +} + + +/* + * Recursive part of tdeheap_lock_updated_tuple + * + * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given + * xid with the given mode; if this tuple is updated, recurse to lock the new + * version as well. 
+ */ +static TM_Result +tdeheap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, + LockTupleMode mode) +{ + TM_Result result; + ItemPointerData tupid; + HeapTupleData mytup; + Buffer buf; + uint16 new_infomask, + new_infomask2, + old_infomask, + old_infomask2; + TransactionId xmax, + new_xmax; + TransactionId priorXmax = InvalidTransactionId; + bool cleared_all_frozen = false; + bool pinned_desired_page; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + + ItemPointerCopy(tid, &tupid); + + for (;;) + { + new_infomask = 0; + new_xmax = InvalidTransactionId; + block = ItemPointerGetBlockNumber(&tupid); + ItemPointerCopy(&tupid, &(mytup.t_self)); + + if (!tdeheap_fetch(rel, SnapshotAny, &mytup, &buf, false)) + { + /* + * if we fail to find the updated version of the tuple, it's + * because it was vacuumed/pruned away after its creator + * transaction aborted. So behave as if we got to the end of the + * chain, and there's no further tuple to lock: return success to + * caller. + */ + result = TM_Ok; + goto out_unlocked; + } + +l4: + CHECK_FOR_INTERRUPTS(); + + /* + * Before locking the buffer, pin the visibility map page if it + * appears to be necessary. Since we haven't got the lock yet, + * someone else might be in the middle of changing this, so we'll need + * to recheck after we have the lock. + */ + if (PageIsAllVisible(BufferGetPage(buf))) + { + tdeheap_visibilitymap_pin(rel, block, &vmbuffer); + pinned_desired_page = true; + } + else + pinned_desired_page = false; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * If we didn't pin the visibility map page and the page has become + * all visible while we were busy locking the buffer, we'll have to + * unlock and re-lock, to avoid holding the buffer lock across I/O. + * That's a bit unfortunate, but hopefully shouldn't happen often. + * + * Note: in some paths through this function, we will reach here + * holding a pin on a vm page that may or may not be the one matching + * this page. If this page isn't all-visible, we won't use the vm + * page, but we hold onto such a pin till the end of the function. + */ + if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf))) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + tdeheap_visibilitymap_pin(rel, block, &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * Check the tuple XMIN against prior XMAX, if any. If we reached the + * end of the chain, we're done, so return success. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), + priorXmax)) + { + result = TM_Ok; + goto out_locked; + } + + /* + * Also check Xmin: if this tuple was created by an aborted + * (sub)transaction, then we already locked the last live one in the + * chain, thus we're done, so return success. + */ + if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) + { + result = TM_Ok; + goto out_locked; + } + + old_infomask = mytup.t_data->t_infomask; + old_infomask2 = mytup.t_data->t_infomask2; + xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + + /* + * If this tuple version has been updated or locked by some concurrent + * transaction(s), what we do depends on whether our lock mode + * conflicts with what those other transactions hold, and also on the + * status of them. 
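+ *
+ * (For instance, an in-progress FOR KEY SHARE locker conflicts
+ * with a requested LockTupleExclusive but not with
+ * LockTupleNoKeyExclusive, so only the former needs to wait.)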
+ */ + if (!(old_infomask & HEAP_XMAX_INVALID)) + { + TransactionId rawxmax; + bool needwait; + + rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + if (old_infomask & HEAP_XMAX_IS_MULTI) + { + int nmembers; + int i; + MultiXactMember *members; + + /* + * We don't need a test for pg_upgrade'd tuples: this is only + * applied to tuples after the first in an update chain. Said + * first tuple in the chain may well be locked-in-9.2-and- + * pg_upgraded, but that one was already locked by our caller, + * not us; and any subsequent ones cannot be because our + * caller must necessarily have obtained a snapshot later than + * the pg_upgrade itself. + */ + Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask)); + + nmembers = GetMultiXactIdMembers(rawxmax, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); + for (i = 0; i < nmembers; i++) + { + result = test_lockmode_for_conflict(members[i].status, + members[i].xid, + mode, + &mytup, + &needwait); + + /* + * If the tuple was already locked by ourselves in a + * previous iteration of this (say tdeheap_lock_tuple was + * forced to restart the locking loop because of a change + * in xmax), then we hold the lock already on this tuple + * version and we don't need to do anything; and this is + * not an error condition either. We just need to skip + * this tuple and continue locking the next version in the + * update chain. + */ + if (result == TM_SelfModified) + { + pfree(members); + goto next; + } + + if (needwait) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(members[i].xid, rel, + &mytup.t_self, + XLTW_LockUpdated); + pfree(members); + goto l4; + } + if (result != TM_Ok) + { + pfree(members); + goto out_locked; + } + } + if (members) + pfree(members); + } + else + { + MultiXactStatus status; + + /* + * For a non-multi Xmax, we first need to compute the + * corresponding MultiXactStatus by using the infomask bits. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + status = MultiXactStatusForShare; + else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) + { + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusForUpdate; + else + status = MultiXactStatusForNoKeyUpdate; + } + else + { + /* + * LOCK_ONLY present alone (a pg_upgraded tuple marked + * as share-locked in the old cluster) shouldn't be + * seen in the middle of an update chain. + */ + elog(ERROR, "invalid lock status in tuple"); + } + } + else + { + /* it's an update, but which kind? */ + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + } + + result = test_lockmode_for_conflict(status, rawxmax, mode, + &mytup, &needwait); + + /* + * If the tuple was already locked by ourselves in a previous + * iteration of this (say tdeheap_lock_tuple was forced to + * restart the locking loop because of a change in xmax), then + * we hold the lock already on this tuple version and we don't + * need to do anything; and this is not an error condition + * either. We just need to skip this tuple and continue + * locking the next version in the update chain. 
+ */ + if (result == TM_SelfModified) + goto next; + + if (needwait) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(rawxmax, rel, &mytup.t_self, + XLTW_LockUpdated); + goto l4; + } + if (result != TM_Ok) + { + goto out_locked; + } + } + } + + /* compute the new Xmax and infomask values for the tuple ... */ + compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2, + xid, mode, false, + &new_xmax, &new_infomask, &new_infomask2); + + if (PageIsAllVisible(BufferGetPage(buf)) && + tdeheap_visibilitymap_clear(rel, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + START_CRIT_SECTION(); + + /* ... and set them */ + HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); + mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; + mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + mytup.t_data->t_infomask |= new_infomask; + mytup.t_data->t_infomask2 |= new_infomask2; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_tdeheap_lock_updated xlrec; + XLogRecPtr recptr; + Page page = BufferGetPage(buf); + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self); + xlrec.xmax = new_xmax; + xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); + xlrec.flags = + cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + + XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + +next: + /* if we find the end of update chain, we're done. */ + if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || + HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) || + ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || + HeapTupleHeaderIsOnlyLocked(mytup.t_data)) + { + result = TM_Ok; + goto out_locked; + } + + /* tail recursion */ + priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); + ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); + UnlockReleaseBuffer(buf); + } + + result = TM_Ok; + +out_locked: + UnlockReleaseBuffer(buf); + +out_unlocked: + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + return result; +} + +/* + * tdeheap_lock_updated_tuple + * Follow update chain when locking an updated tuple, acquiring locks (row + * marks) on the updated versions. + * + * The initial tuple is assumed to be already locked. + * + * This function doesn't check visibility, it just unconditionally marks the + * tuple(s) as locked. If any tuple in the updated chain is being deleted + * concurrently (or updated with the key being modified), sleep until the + * transaction doing it is finished. + * + * Note that we don't acquire heavyweight tuple locks on the tuples we walk + * when we have to wait for other transactions to release them, as opposed to + * what tdeheap_lock_tuple does. The reason is that having more than one + * transaction walking the chain is probably uncommon enough that risk of + * starvation is not likely: one of the preconditions for being here is that + * the snapshot in use predates the update that created this tuple (because we + * started at an earlier version of the tuple), but at the same time such a + * transaction cannot be using repeatable read or serializable isolation + * levels, because that would lead to a serializability failure. 
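+ *
+ * As a concrete case of why the chain is followed at all: if the version
+ * we originally locked FOR KEY SHARE has since been updated, the newer
+ * version(s) are marked with the same lock mode here, so that a concurrent
+ * DELETE of the latest version still has to wait for us.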
+ */ +static TM_Result +tdeheap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, + TransactionId xid, LockTupleMode mode) +{ + /* + * If the tuple has not been updated, or has moved into another partition + * (effectively a delete) stop here. + */ + if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) && + !ItemPointerEquals(&tuple->t_self, ctid)) + { + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId + * setting. We can be certain that the transaction will never become a + * member of any older MultiXactIds than that. (We have to do this + * even if we end up just using our own TransactionId below, since + * some other backend could incorporate our XID into a MultiXact + * immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + return tdeheap_lock_updated_tuple_rec(rel, ctid, xid, mode); + } + + /* nothing to lock */ + return TM_Ok; +} + +/* + * tdeheap_finish_speculative - mark speculative insertion as successful + * + * To successfully finish a speculative insertion we have to clear speculative + * token from tuple. To do so the t_ctid field, which will contain a + * speculative token value, is modified in place to point to the tuple itself, + * which is characteristic of a newly inserted ordinary tuple. + * + * NB: It is not ok to commit without either finishing or aborting a + * speculative insertion. We could treat speculative tuples of committed + * transactions implicitly as completed, but then we would have to be prepared + * to deal with speculative tokens on committed tuples. That wouldn't be + * difficult - no-one looks at the ctid field of a tuple with invalid xmax - + * but clearing the token at completion isn't very expensive either. + * An explicit confirmation WAL record also makes logical decoding simpler. + */ +void +tdeheap_finish_speculative(Relation relation, ItemPointer tid) +{ + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(tid); + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(ERROR, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + Assert(HeapTupleHeaderIsSpeculative(htup)); + + MarkBufferDirty(buffer); + + /* + * Replace the speculative insertion token with a real t_ctid, pointing to + * itself like it does on regular tuples. + */ + htup->t_ctid = *tid; + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_confirm xlrec; + XLogRecPtr recptr; + + xlrec.offnum = ItemPointerGetOffsetNumber(tid); + + XLogBeginInsert(); + + /* We want the same filtering on this as on a plain insert */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); +} + +/* + * tdeheap_abort_speculative - kill a speculatively inserted tuple + * + * Marks a tuple that was speculatively inserted in the same command as dead, + * by setting its xmin as invalid. 
That makes it immediately appear as dead + * to all transactions, including our own. In particular, it makes + * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend + * inserting a duplicate key value won't unnecessarily wait for our whole + * transaction to finish (it'll just wait for our speculative insertion to + * finish). + * + * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks + * that arise due to a mutual dependency that is not user visible. By + * definition, unprincipled deadlocks cannot be prevented by the user + * reordering lock acquisition in client code, because the implementation level + * lock acquisitions are not under the user's direct control. If speculative + * inserters did not take this precaution, then under high concurrency they + * could deadlock with each other, which would not be acceptable. + * + * This is somewhat redundant with tdeheap_delete, but we prefer to have a + * dedicated routine with stripped down requirements. Note that this is also + * used to delete the TOAST tuples created during speculative insertion. + * + * This routine does not affect logical decoding as it only looks at + * confirmation records. + */ +void +tdeheap_abort_speculative(Relation relation, ItemPointer tid) +{ + TransactionId xid = GetCurrentTransactionId(); + ItemId lp; + HeapTupleData tp; + Page page; + BlockNumber block; + Buffer buffer; + TransactionId prune_xid; + + Assert(ItemPointerIsValid(tid)); + + block = ItemPointerGetBlockNumber(tid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Page can't be all visible, we just inserted into it, and are still + * running. + */ + Assert(!PageIsAllVisible(page)); + + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tp.t_tableOid = RelationGetRelid(relation); + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_self = *tid; + + /* + * Sanity check that the tuple really is a speculatively inserted tuple, + * inserted by us. + */ + if (tp.t_data->t_choice.t_heap.t_xmin != xid) + elog(ERROR, "attempted to kill a tuple inserted by another transaction"); + if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data))) + elog(ERROR, "attempted to kill a non-speculative tuple"); + Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + + /* + * No need to check for serializable conflicts here. There is never a + * need for a combo CID, either. No need to extract replica identity, or + * do anything special with infomask bits. + */ + + START_CRIT_SECTION(); + + /* + * The tuple will become DEAD immediately. Flag that this page is a + * candidate for pruning by setting xmin to TransactionXmin. While not + * immediately prunable, it is the oldest xid we can cheaply determine + * that's safe against wraparound / being older than the table's + * relfrozenxid. To defend against the unlikely case of a new relation + * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid + * if so (vacuum can't subsequently move relfrozenxid to beyond + * TransactionXmin, so there's no race here). 
+ */ + Assert(TransactionIdIsValid(TransactionXmin)); + if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid)) + prune_xid = relation->rd_rel->relfrozenxid; + else + prune_xid = TransactionXmin; + PageSetPrunable(page, prune_xid); + + /* store transaction information of xact deleting the tuple */ + tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + + /* + * Set the tuple header xmin to InvalidTransactionId. This makes the + * tuple immediately invisible everyone. (In particular, to any + * transactions waiting on the speculative token, woken up later.) + */ + HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); + + /* Clear the speculative insertion token too */ + tp.t_data->t_ctid = tp.t_self; + + MarkBufferDirty(buffer); + + /* + * XLOG stuff + * + * The WAL records generated here match tdeheap_delete(). The same recovery + * routines are used. + */ + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_delete xlrec; + XLogRecPtr recptr; + + xlrec.flags = XLH_DELETE_IS_SUPER; + xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, + tp.t_data->t_infomask2); + xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); + xlrec.xmax = xid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* No replica identity & replication origin logged */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (HeapTupleHasExternal(&tp)) + { + Assert(!IsToastRelation(relation)); + tdeheap_toast_delete(relation, &tp, true); + } + + /* + * Never need to mark tuple for invalidation, since catalogs don't support + * speculative insertion + */ + + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + + /* count deletion, as we counted the insertion too */ + pgstat_count_tdeheap_delete(relation); +} + +/* + * tdeheap_inplace_update - update a tuple "in place" (ie, overwrite it) + * + * Overwriting violates both MVCC and transactional safety, so the uses + * of this function in Postgres are extremely limited. Nonetheless we + * find some places to use it. + * + * The tuple cannot change size, and therefore it's reasonable to assume + * that its null bitmap (if any) doesn't change either. So we just + * overwrite the data portion of the tuple without touching the null + * bitmap or any of the header fields. + * + * tuple is an in-memory tuple structure containing the data to be written + * over the target tuple. Also, tuple->t_self identifies the target tuple. + * + * Note that the tuple updated here had better not come directly from the + * syscache if the relation has a toast relation as this tuple could + * include toast values that have been expanded, causing a failure here. + */ +void +tdeheap_inplace_update(Relation relation, HeapTuple tuple) +{ + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + uint32 oldlen; + uint32 newlen; + + /* + * For now, we don't allow parallel updates. Unlike a regular update, + * this should never create a combo CID, so it might be possible to relax + * this restriction, but not without more thought and testing. It's not + * clear that it would be useful, anyway. 
+ */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot update tuples during a parallel operation"))); + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self))); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(&(tuple->t_self)); + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(ERROR, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldlen = ItemIdGetLength(lp) - htup->t_hoff; + newlen = tuple->t_len - tuple->t_data->t_hoff; + if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff) + elog(ERROR, "wrong tuple length"); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + memcpy((char *) htup + htup->t_hoff, + (char *) tuple->t_data + tuple->t_data->t_hoff, + newlen); + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_tdeheap_inplace xlrec; + XLogRecPtr recptr; + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapInplace); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen); + + /* inplace updates aren't decoded atm, don't log the origin */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); + + /* + * Send out shared cache inval if necessary. Note that because we only + * pass the new version of the tuple, this mustn't be used for any + * operations that could change catcache lookup keys. But we aren't + * bothering with index updates either, so that's true a fortiori. + */ + if (!IsBootstrapProcessingMode()) + CacheInvalidateHeapTuple(relation, tuple, NULL); +} + +#define FRM_NOOP 0x0001 +#define FRM_INVALIDATE_XMAX 0x0002 +#define FRM_RETURN_IS_XID 0x0004 +#define FRM_RETURN_IS_MULTI 0x0008 +#define FRM_MARK_COMMITTED 0x0010 + +/* + * FreezeMultiXactId + * Determine what to do during freezing when a tuple is marked by a + * MultiXactId. + * + * "flags" is an output value; it's used to tell caller what to do on return. + * "pagefrz" is an input/output value, used to manage page level freezing. + * + * Possible values that we can set in "flags": + * FRM_NOOP + * don't do anything -- keep existing Xmax + * FRM_INVALIDATE_XMAX + * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag. + * FRM_RETURN_IS_XID + * The Xid return value is a single update Xid to set as xmax. + * FRM_MARK_COMMITTED + * Xmax can be marked as HEAP_XMAX_COMMITTED + * FRM_RETURN_IS_MULTI + * The return value is a new MultiXactId to set as new Xmax. + * (caller must obtain proper infomask bits using GetMultiXactIdHintBits) + * + * Caller delegates control of page freezing to us. In practice we always + * force freezing of caller's page unless FRM_NOOP processing is indicated. + * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff + * can never be left behind. We freely choose when and how to process each + * Multi, without ever violating the cutoff postconditions for freezing. + * + * It's useful to remove Multis on a proactive timeline (relative to freezing + * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. 
It can also + * be cheaper in the short run, for us, since we too can avoid SLRU buffer + * misses through eager processing. + * + * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only + * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice. + * This can usually be put off, which is usually enough to avoid it altogether. + * Allocating new multis during VACUUM should be avoided on general principle; + * only VACUUM can advance relminmxid, so allocating new Multis here comes with + * its own special risks. + * + * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers + * using tdeheap_tuple_should_freeze when we haven't forced page-level freezing. + * + * NB: Caller should avoid needlessly calling tdeheap_tuple_should_freeze when we + * have already forced page-level freezing, since that might incur the same + * SLRU buffer misses that we specifically intended to avoid by freezing. + */ +static TransactionId +FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, + const struct VacuumCutoffs *cutoffs, uint16 *flags, + HeapPageFreeze *pagefrz) +{ + TransactionId newxmax; + MultiXactMember *members; + int nmembers; + bool need_replace; + int nnewmembers; + MultiXactMember *newmembers; + bool has_lockers; + TransactionId update_xid; + bool update_committed; + TransactionId FreezePageRelfrozenXid; + + *flags = 0; + + /* We should only be called in Multis */ + Assert(t_infomask & HEAP_XMAX_IS_MULTI); + + if (!MultiXactIdIsValid(multi) || + HEAP_LOCKED_UPGRADED(t_infomask)) + { + *flags |= FRM_INVALIDATE_XMAX; + pagefrz->freeze_required = true; + return InvalidTransactionId; + } + else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found multixact %u from before relminmxid %u", + multi, cutoffs->relminmxid))); + else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact)) + { + TransactionId update_xact; + + /* + * This old multi cannot possibly have members still running, but + * verify just in case. If it was a locker only, it can be removed + * without any further consideration; but if it contained an update, + * we might need to preserve it. + */ + if (MultiXactIdIsRunning(multi, + HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running", + multi, cutoffs->OldestMxact))); + + if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)) + { + *flags |= FRM_INVALIDATE_XMAX; + pagefrz->freeze_required = true; + return InvalidTransactionId; + } + + /* replace multi with single XID for its updater? */ + update_xact = MultiXactIdGetUpdateXid(multi, t_infomask); + if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u", + multi, update_xact, + cutoffs->relfrozenxid))); + else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin)) + { + /* + * Updater XID has to have aborted (otherwise the tuple would have + * been pruned away instead, since updater XID is < OldestXmin). + * Just remove xmax. 
+ */ + if (TransactionIdDidCommit(update_xact)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u", + multi, update_xact, + cutoffs->OldestXmin))); + *flags |= FRM_INVALIDATE_XMAX; + pagefrz->freeze_required = true; + return InvalidTransactionId; + } + + /* Have to keep updater XID as new xmax */ + *flags |= FRM_RETURN_IS_XID; + pagefrz->freeze_required = true; + return update_xact; + } + + /* + * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we + * need to walk the whole members array to figure out what to do, if + * anything. + */ + nmembers = + GetMultiXactIdMembers(multi, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)); + if (nmembers <= 0) + { + /* Nothing worth keeping */ + *flags |= FRM_INVALIDATE_XMAX; + pagefrz->freeze_required = true; + return InvalidTransactionId; + } + + /* + * The FRM_NOOP case is the only case where we might need to ratchet back + * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only + * case where our caller might ratchet back its NoFreezePageRelfrozenXid + * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi. + * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid + * trackers managed by VACUUM being ratcheting back by xmax to the degree + * required to make it safe to leave xmax undisturbed, independent of + * whether or not page freezing is triggered somewhere else. + * + * Our policy is to force freezing in every case other than FRM_NOOP, + * which obviates the need to maintain either set of trackers, anywhere. + * Every other case will reliably execute a freeze plan for xmax that + * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or + * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen. + * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with + * OldestXmin/OldestMxact, so later values never need to be tracked here.) + */ + need_replace = false; + FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid; + for (int i = 0; i < nmembers; i++) + { + TransactionId xid = members[i].xid; + + Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid)); + + if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) + { + /* Can't violate the FreezeLimit postcondition */ + need_replace = true; + break; + } + if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid)) + FreezePageRelfrozenXid = xid; + } + + /* Can't violate the MultiXactCutoff postcondition, either */ + if (!need_replace) + need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff); + + if (!need_replace) + { + /* + * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or + * both together to make it safe to retain this particular multi after + * freezing its page + */ + *flags |= FRM_NOOP; + pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid; + if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid)) + pagefrz->FreezePageRelminMxid = multi; + pfree(members); + return multi; + } + + /* + * Do a more thorough second pass over the multi to figure out which + * member XIDs actually need to be kept. Checking the precise status of + * individual members might even show that we don't need to keep anything. + * That is quite possible even though the Multi must be >= OldestMxact, + * since our second pass only keeps member XIDs when it's truly necessary; + * even member XIDs >= OldestXmin often won't be kept by second pass. 
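+ *
+ * (For example, a locker XID that is no longer running is dropped here
+ * even when it is >= OldestXmin, whereas an in-progress or committed
+ * updater XID is always carried over into the replacement xmax.)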
+ */ + nnewmembers = 0; + newmembers = palloc(sizeof(MultiXactMember) * nmembers); + has_lockers = false; + update_xid = InvalidTransactionId; + update_committed = false; + + /* + * Determine whether to keep each member xid, or to ignore it instead + */ + for (int i = 0; i < nmembers; i++) + { + TransactionId xid = members[i].xid; + MultiXactStatus mstatus = members[i].status; + + Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid)); + + if (!ISUPDATE_from_mxstatus(mstatus)) + { + /* + * Locker XID (not updater XID). We only keep lockers that are + * still running. + */ + if (TransactionIdIsCurrentTransactionId(xid) || + TransactionIdIsInProgress(xid)) + { + if (TransactionIdPrecedes(xid, cutoffs->OldestXmin)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u", + multi, xid, + cutoffs->OldestXmin))); + newmembers[nnewmembers++] = members[i]; + has_lockers = true; + } + + continue; + } + + /* + * Updater XID (not locker XID). Should we keep it? + * + * Since the tuple wasn't totally removed when vacuum pruned, the + * update Xid cannot possibly be older than OldestXmin cutoff unless + * the updater XID aborted. If the updater transaction is known + * aborted or crashed then it's okay to ignore it, otherwise not. + * + * In any case the Multi should never contain two updaters, whatever + * their individual commit status. Check for that first, in passing. + */ + if (TransactionIdIsValid(update_xid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("multixact %u has two or more updating members", + multi), + errdetail_internal("First updater XID=%u second updater XID=%u.", + update_xid, xid))); + + /* + * As with all tuple visibility routines, it's critical to test + * TransactionIdIsInProgress before TransactionIdDidCommit, because of + * race conditions explained in detail in pg_tdeam_visibility.c. + */ + if (TransactionIdIsCurrentTransactionId(xid) || + TransactionIdIsInProgress(xid)) + update_xid = xid; + else if (TransactionIdDidCommit(xid)) + { + /* + * The transaction committed, so we can tell caller to set + * HEAP_XMAX_COMMITTED. (We can only do this because we know the + * transaction is not running.) + */ + update_committed = true; + update_xid = xid; + } + else + { + /* + * Not in progress, not committed -- must be aborted or crashed; + * we can ignore it. + */ + continue; + } + + /* + * We determined that updater must be kept -- add it to pending new + * members list + */ + if (TransactionIdPrecedes(xid, cutoffs->OldestXmin)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u", + multi, xid, cutoffs->OldestXmin))); + newmembers[nnewmembers++] = members[i]; + } + + pfree(members); + + /* + * Determine what to do with caller's multi based on information gathered + * during our second pass + */ + if (nnewmembers == 0) + { + /* Nothing worth keeping */ + *flags |= FRM_INVALIDATE_XMAX; + newxmax = InvalidTransactionId; + } + else if (TransactionIdIsValid(update_xid) && !has_lockers) + { + /* + * If there's a single member and it's an update, pass it back alone + * without creating a new Multi. (XXX we could do this when there's a + * single remaining locker, too, but that would complicate the API too + * much; moreover, the case with the single updater is more + * interesting, because those are longer-lived.) 
+ */ + Assert(nnewmembers == 1); + *flags |= FRM_RETURN_IS_XID; + if (update_committed) + *flags |= FRM_MARK_COMMITTED; + newxmax = update_xid; + } + else + { + /* + * Create a new multixact with the surviving members of the previous + * one, to set as new Xmax in the tuple + */ + newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers); + *flags |= FRM_RETURN_IS_MULTI; + } + + pfree(newmembers); + + pagefrz->freeze_required = true; + return newxmax; +} + +/* + * tdeheap_prepare_freeze_tuple + * + * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) + * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so, + * setup enough state (in the *frz output argument) to enable caller to + * process this tuple as part of freezing its page, and return true. Return + * false if nothing can be changed about the tuple right now. + * + * Also sets *totally_frozen to true if the tuple will be totally frozen once + * caller executes returned freeze plan (or if the tuple was already totally + * frozen by an earlier VACUUM). This indicates that there are no remaining + * XIDs or MultiXactIds that will need to be processed by a future VACUUM. + * + * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every + * tuple that we returned true for, and call tdeheap_freeze_execute_prepared to + * execute freezing. Caller must initialize pagefrz fields for page as a + * whole before first call here for each heap page. + * + * VACUUM caller decides on whether or not to freeze the page as a whole. + * We'll often prepare freeze plans for a page that caller just discards. + * However, VACUUM doesn't always get to make a choice; it must freeze when + * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and + * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure + * that VACUUM always follows that rule. + * + * We sometimes force freezing of xmax MultiXactId values long before it is + * strictly necessary to do so just to ensure the FreezeLimit postcondition. + * It's worth processing MultiXactIds proactively when it is cheap to do so, + * and it's convenient to make that happen by piggy-backing it on the "force + * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds + * because it is expensive right now (though only when it's still possible to + * do so without violating the FreezeLimit/MultiXactCutoff postcondition). + * + * It is assumed that the caller has checked the tuple with + * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD + * (else we should be removing the tuple, not freezing it). + * + * NB: This function has side effects: it might allocate a new MultiXactId. + * It will be set as tuple's new xmax when our *frz output is processed within + * tdeheap_execute_freeze_tuple later on. If the tuple is in a shared buffer + * then caller had better have an exclusive lock on it already. 
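+ *
+ * As a rough sketch of the intended calling pattern (caller-side names such
+ * as "frzs", "nfrz", "htup" and "offnum" are illustrative only, not part of
+ * this API):
+ *
+ *     if (tdeheap_prepare_freeze_tuple(htup, cutoffs, &pagefrz,
+ *                                      &frzs[nfrz], &totally_frozen))
+ *         frzs[nfrz++].offset = offnum;
+ *
+ * and then, once the whole page has been examined and caller decides to
+ * freeze it (or pagefrz.freeze_required forces the issue):
+ *
+ *     tdeheap_freeze_execute_prepared(rel, buf, snapshotConflictHorizon,
+ *                                     frzs, nfrz);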
+ */ +bool +tdeheap_prepare_freeze_tuple(HeapTupleHeader tuple, + const struct VacuumCutoffs *cutoffs, + HeapPageFreeze *pagefrz, + HeapTupleFreeze *frz, bool *totally_frozen) +{ + bool xmin_already_frozen = false, + xmax_already_frozen = false; + bool freeze_xmin = false, + replace_xvac = false, + replace_xmax = false, + freeze_xmax = false; + TransactionId xid; + + frz->xmax = HeapTupleHeaderGetRawXmax(tuple); + frz->t_infomask2 = tuple->t_infomask2; + frz->t_infomask = tuple->t_infomask; + frz->frzflags = 0; + frz->checkflags = 0; + + /* + * Process xmin, while keeping track of whether it's already frozen, or + * will become frozen iff our freeze plan is executed by caller (could be + * neither). + */ + xid = HeapTupleHeaderGetXmin(tuple); + if (!TransactionIdIsNormal(xid)) + xmin_already_frozen = true; + else + { + if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found xmin %u from before relfrozenxid %u", + xid, cutoffs->relfrozenxid))); + + /* Will set freeze_xmin flags in freeze plan below */ + freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin); + + /* Verify that xmin committed if and when freeze plan is executed */ + if (freeze_xmin) + frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED; + } + + /* + * Old-style VACUUM FULL is gone, but we have to process xvac for as long + * as we support having MOVED_OFF/MOVED_IN tuples in the database + */ + xid = HeapTupleHeaderGetXvac(tuple); + if (TransactionIdIsNormal(xid)) + { + Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); + Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin)); + + /* + * For Xvac, we always freeze proactively. This allows totally_frozen + * tracking to ignore xvac. + */ + replace_xvac = pagefrz->freeze_required = true; + + /* Will set replace_xvac flags in freeze plan below */ + } + + /* Now process xmax */ + xid = frz->xmax; + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + /* Raw xmax is a MultiXactId */ + TransactionId newxmax; + uint16 flags; + + /* + * We will either remove xmax completely (in the "freeze_xmax" path), + * process xmax by replacing it (in the "replace_xmax" path), or + * perform no-op xmax processing. The only constraint is that the + * FreezeLimit/MultiXactCutoff postcondition must never be violated. + */ + newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs, + &flags, pagefrz); + + if (flags & FRM_NOOP) + { + /* + * xmax is a MultiXactId, and nothing about it changes for now. + * This is the only case where 'freeze_required' won't have been + * set for us by FreezeMultiXactId, as well as the only case where + * neither freeze_xmax nor replace_xmax are set (given a multi). + * + * This is a no-op, but the call to FreezeMultiXactId might have + * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers + * for us (the "freeze page" variants, specifically). That'll + * make it safe for our caller to freeze the page later on, while + * leaving this particular xmax undisturbed. + * + * FreezeMultiXactId is _not_ responsible for the "no freeze" + * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our + * job. A call to tdeheap_tuple_should_freeze for this same tuple + * will take place below if 'freeze_required' isn't set already. + * (This repeats work from FreezeMultiXactId, but allows "no + * freeze" tracker maintenance to happen in only one place.) 
+ */ + Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff)); + Assert(MultiXactIdIsValid(newxmax) && xid == newxmax); + } + else if (flags & FRM_RETURN_IS_XID) + { + /* + * xmax will become an updater Xid (original MultiXact's updater + * member Xid will be carried forward as a simple Xid in Xmax). + */ + Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin)); + + /* + * NB -- some of these transformations are only valid because we + * know the return Xid is a tuple updater (i.e. not merely a + * locker.) Also note that the only reason we don't explicitly + * worry about HEAP_KEYS_UPDATED is because it lives in + * t_infomask2 rather than t_infomask. + */ + frz->t_infomask &= ~HEAP_XMAX_BITS; + frz->xmax = newxmax; + if (flags & FRM_MARK_COMMITTED) + frz->t_infomask |= HEAP_XMAX_COMMITTED; + replace_xmax = true; + } + else if (flags & FRM_RETURN_IS_MULTI) + { + uint16 newbits; + uint16 newbits2; + + /* + * xmax is an old MultiXactId that we have to replace with a new + * MultiXactId, to carry forward two or more original member XIDs. + */ + Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact)); + + /* + * We can't use GetMultiXactIdHintBits directly on the new multi + * here; that routine initializes the masks to all zeroes, which + * would lose other bits we need. Doing it this way ensures all + * unrelated bits remain untouched. + */ + frz->t_infomask &= ~HEAP_XMAX_BITS; + frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; + GetMultiXactIdHintBits(newxmax, &newbits, &newbits2); + frz->t_infomask |= newbits; + frz->t_infomask2 |= newbits2; + frz->xmax = newxmax; + replace_xmax = true; + } + else + { + /* + * Freeze plan for tuple "freezes xmax" in the strictest sense: + * it'll leave nothing in xmax (neither an Xid nor a MultiXactId). + */ + Assert(flags & FRM_INVALIDATE_XMAX); + Assert(!TransactionIdIsValid(newxmax)); + + /* Will set freeze_xmax flags in freeze plan below */ + freeze_xmax = true; + } + + /* MultiXactId processing forces freezing (barring FRM_NOOP case) */ + Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax)); + } + else if (TransactionIdIsNormal(xid)) + { + /* Raw xmax is normal XID */ + if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found xmax %u from before relfrozenxid %u", + xid, cutoffs->relfrozenxid))); + + /* Will set freeze_xmax flags in freeze plan below */ + freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin); + + /* + * Verify that xmax aborted if and when freeze plan is executed, + * provided it's from an update. (A lock-only xmax can be removed + * independent of this, since the lock is released at xact end.) + */ + if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED; + } + else if (!TransactionIdIsValid(xid)) + { + /* Raw xmax is InvalidTransactionId XID */ + Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0); + xmax_already_frozen = true; + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi", + xid, tuple->t_infomask))); + + if (freeze_xmin) + { + Assert(!xmin_already_frozen); + + frz->t_infomask |= HEAP_XMIN_FROZEN; + } + if (replace_xvac) + { + /* + * If a MOVED_OFF tuple is not dead, the xvac transaction must have + * failed; whereas a non-dead MOVED_IN tuple must mean the xvac + * transaction succeeded. 
+ */ + Assert(pagefrz->freeze_required); + if (tuple->t_infomask & HEAP_MOVED_OFF) + frz->frzflags |= XLH_INVALID_XVAC; + else + frz->frzflags |= XLH_FREEZE_XVAC; + } + if (replace_xmax) + { + Assert(!xmax_already_frozen && !freeze_xmax); + Assert(pagefrz->freeze_required); + + /* Already set replace_xmax flags in freeze plan earlier */ + } + if (freeze_xmax) + { + Assert(!xmax_already_frozen && !replace_xmax); + + frz->xmax = InvalidTransactionId; + + /* + * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED + + * LOCKED. Normalize to INVALID just to be sure no one gets confused. + * Also get rid of the HEAP_KEYS_UPDATED bit. + */ + frz->t_infomask &= ~HEAP_XMAX_BITS; + frz->t_infomask |= HEAP_XMAX_INVALID; + frz->t_infomask2 &= ~HEAP_HOT_UPDATED; + frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; + } + + /* + * Determine if this tuple is already totally frozen, or will become + * totally frozen (provided caller executes freeze plans for the page) + */ + *totally_frozen = ((freeze_xmin || xmin_already_frozen) && + (freeze_xmax || xmax_already_frozen)); + + if (!pagefrz->freeze_required && !(xmin_already_frozen && + xmax_already_frozen)) + { + /* + * So far no previous tuple from the page made freezing mandatory. + * Does this tuple force caller to freeze the entire page? + */ + pagefrz->freeze_required = + tdeheap_tuple_should_freeze(tuple, cutoffs, + &pagefrz->NoFreezePageRelfrozenXid, + &pagefrz->NoFreezePageRelminMxid); + } + + /* Tell caller if this tuple has a usable freeze plan set in *frz */ + return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax; +} + +/* + * tdeheap_execute_freeze_tuple + * Execute the prepared freezing of a tuple with caller's freeze plan. + * + * Caller is responsible for ensuring that no other backend can access the + * storage underlying this tuple, either by holding an exclusive lock on the + * buffer containing it (which is what lazy VACUUM does), or by having it be + * in private storage (which is what CLUSTER and friends do). + */ +static inline void +tdeheap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz) +{ + HeapTupleHeaderSetXmax(tuple, frz->xmax); + + if (frz->frzflags & XLH_FREEZE_XVAC) + HeapTupleHeaderSetXvac(tuple, FrozenTransactionId); + + if (frz->frzflags & XLH_INVALID_XVAC) + HeapTupleHeaderSetXvac(tuple, InvalidTransactionId); + + tuple->t_infomask = frz->t_infomask; + tuple->t_infomask2 = frz->t_infomask2; +} + +/* + * tdeheap_freeze_execute_prepared + * + * Executes freezing of one or more heap tuples on a page on behalf of caller. + * Caller passes an array of tuple plans from tdeheap_prepare_freeze_tuple. + * Caller must set 'offset' in each plan for us. Note that we destructively + * sort caller's tuples array in-place, so caller had better be done with it. + * + * WAL-logs the changes so that VACUUM can advance the rel's relfrozenxid + * later on without any risk of unsafe pg_xact lookups, even following a hard + * crash (or when querying from a standby). We represent freezing by setting + * infomask bits in tuple headers, but this shouldn't be thought of as a hint. + * See section on buffer access rules in src/backend/storage/buffer/README. + */ +void +tdeheap_freeze_execute_prepared(Relation rel, Buffer buffer, + TransactionId snapshotConflictHorizon, + HeapTupleFreeze *tuples, int ntuples) +{ + Page page = BufferGetPage(buffer); + + Assert(ntuples > 0); + + /* + * Perform xmin/xmax XID status sanity checks before critical section. 
+ * + * tdeheap_prepare_freeze_tuple doesn't perform these checks directly because + * pg_xact lookups are relatively expensive. They shouldn't be repeated + * by successive VACUUMs that each decide against freezing the same page. + */ + for (int i = 0; i < ntuples; i++) + { + HeapTupleFreeze *frz = tuples + i; + ItemId itemid = PageGetItemId(page, frz->offset); + HeapTupleHeader htup; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + /* Deliberately avoid relying on tuple hint bits here */ + if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED) + { + TransactionId xmin = HeapTupleHeaderGetRawXmin(htup); + + Assert(!HeapTupleHeaderXminFrozen(htup)); + if (unlikely(!TransactionIdDidCommit(xmin))) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("uncommitted xmin %u needs to be frozen", + xmin))); + } + + /* + * TransactionIdDidAbort won't work reliably in the presence of XIDs + * left behind by transactions that were in progress during a crash, + * so we can only check that xmax didn't commit + */ + if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED) + { + TransactionId xmax = HeapTupleHeaderGetRawXmax(htup); + + Assert(TransactionIdIsNormal(xmax)); + if (unlikely(TransactionIdDidCommit(xmax))) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("cannot freeze committed xmax %u", + xmax))); + } + } + + START_CRIT_SECTION(); + + for (int i = 0; i < ntuples; i++) + { + HeapTupleFreeze *frz = tuples + i; + ItemId itemid = PageGetItemId(page, frz->offset); + HeapTupleHeader htup; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + tdeheap_execute_freeze_tuple(htup, frz); + } + + MarkBufferDirty(buffer); + + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(rel)) + { + xl_tdeheap_freeze_plan plans[MaxHeapTuplesPerPage]; + OffsetNumber offsets[MaxHeapTuplesPerPage]; + int nplans; + xl_tdeheap_freeze_page xlrec; + XLogRecPtr recptr; + + /* Prepare deduplicated representation for use in WAL record */ + nplans = tdeheap_log_freeze_plan(tuples, ntuples, plans, offsets); + + xlrec.snapshotConflictHorizon = snapshotConflictHorizon; + xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(rel); + xlrec.nplans = nplans; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage); + + /* + * The freeze plan array and offset array are not actually in the + * buffer, but pretend that they are. When XLogInsert stores the + * whole buffer, the arrays need not be stored too. 
+ */ + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) plans, + nplans * sizeof(xl_tdeheap_freeze_plan)); + XLogRegisterBufData(0, (char *) offsets, + ntuples * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); +} + +/* + * Comparator used to deduplicate XLOG_HEAP2_FREEZE_PAGE freeze plans + */ +static int +tdeheap_log_freeze_cmp(const void *arg1, const void *arg2) +{ + HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1; + HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2; + + if (frz1->xmax < frz2->xmax) + return -1; + else if (frz1->xmax > frz2->xmax) + return 1; + + if (frz1->t_infomask2 < frz2->t_infomask2) + return -1; + else if (frz1->t_infomask2 > frz2->t_infomask2) + return 1; + + if (frz1->t_infomask < frz2->t_infomask) + return -1; + else if (frz1->t_infomask > frz2->t_infomask) + return 1; + + if (frz1->frzflags < frz2->frzflags) + return -1; + else if (frz1->frzflags > frz2->frzflags) + return 1; + + /* + * tdeheap_log_freeze_eq would consider these tuple-wise plans to be equal. + * (So the tuples will share a single canonical freeze plan.) + * + * We tiebreak on page offset number to keep each freeze plan's page + * offset number array individually sorted. (Unnecessary, but be tidy.) + */ + if (frz1->offset < frz2->offset) + return -1; + else if (frz1->offset > frz2->offset) + return 1; + + Assert(false); + return 0; +} + +/* + * Compare fields that describe actions required to freeze tuple with caller's + * open plan. If everything matches then the frz tuple plan is equivalent to + * caller's plan. + */ +static inline bool +tdeheap_log_freeze_eq(xl_tdeheap_freeze_plan *plan, HeapTupleFreeze *frz) +{ + if (plan->xmax == frz->xmax && + plan->t_infomask2 == frz->t_infomask2 && + plan->t_infomask == frz->t_infomask && + plan->frzflags == frz->frzflags) + return true; + + /* Caller must call tdeheap_log_freeze_new_plan again for frz */ + return false; +} + +/* + * Start new plan initialized using tuple-level actions. At least one tuple + * will have steps required to freeze described by caller's plan during REDO. + */ +static inline void +tdeheap_log_freeze_new_plan(xl_tdeheap_freeze_plan *plan, HeapTupleFreeze *frz) +{ + plan->xmax = frz->xmax; + plan->t_infomask2 = frz->t_infomask2; + plan->t_infomask = frz->t_infomask; + plan->frzflags = frz->frzflags; + plan->ntuples = 1; /* for now */ +} + +/* + * Deduplicate tuple-based freeze plans so that each distinct set of + * processing steps is only stored once in XLOG_HEAP2_FREEZE_PAGE records. + * Called during original execution of freezing (for logged relations). + * + * Return value is number of plans set in *plans_out for caller. Also writes + * an array of offset numbers into *offsets_out output argument for caller + * (actually there is one array per freeze plan, but that's not of immediate + * concern to our caller). 
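+ *
+ * For example (offsets and plan contents are hypothetical): given freeze
+ * plans for offsets 2, 5 and 9, where the plans for offsets 2 and 9 are
+ * identical (same xmax, infomask bits and frzflags) and the plan for
+ * offset 5 differs, we return 2 -- one canonical plan with ntuples = 2 and
+ * one with ntuples = 1 -- and *offsets_out ends up holding all three
+ * offsets grouped by plan, each group in ascending offset order.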
+ */ +static int +tdeheap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, + xl_tdeheap_freeze_plan *plans_out, + OffsetNumber *offsets_out) +{ + int nplans = 0; + + /* Sort tuple-based freeze plans in the order required to deduplicate */ + qsort(tuples, ntuples, sizeof(HeapTupleFreeze), tdeheap_log_freeze_cmp); + + for (int i = 0; i < ntuples; i++) + { + HeapTupleFreeze *frz = tuples + i; + + if (i == 0) + { + /* New canonical freeze plan starting with first tup */ + tdeheap_log_freeze_new_plan(plans_out, frz); + nplans++; + } + else if (tdeheap_log_freeze_eq(plans_out, frz)) + { + /* tup matches open canonical plan -- include tup in it */ + Assert(offsets_out[i - 1] < frz->offset); + plans_out->ntuples++; + } + else + { + /* Tup doesn't match current plan -- done with it now */ + plans_out++; + + /* New canonical freeze plan starting with this tup */ + tdeheap_log_freeze_new_plan(plans_out, frz); + nplans++; + } + + /* + * Save page offset number in dedicated buffer in passing. + * + * REDO routine relies on the record's offset numbers array grouping + * offset numbers by freeze plan. The sort order within each grouping + * is ascending offset number order, just to keep things tidy. + */ + offsets_out[i] = frz->offset; + } + + Assert(nplans > 0 && nplans <= ntuples); + + return nplans; +} + +/* + * tdeheap_freeze_tuple + * Freeze tuple in place, without WAL logging. + * + * Useful for callers like CLUSTER that perform their own WAL logging. + */ +bool +tdeheap_freeze_tuple(HeapTupleHeader tuple, + TransactionId relfrozenxid, TransactionId relminmxid, + TransactionId FreezeLimit, TransactionId MultiXactCutoff) +{ + HeapTupleFreeze frz; + bool do_freeze; + bool totally_frozen; + struct VacuumCutoffs cutoffs; + HeapPageFreeze pagefrz; + + cutoffs.relfrozenxid = relfrozenxid; + cutoffs.relminmxid = relminmxid; + cutoffs.OldestXmin = FreezeLimit; + cutoffs.OldestMxact = MultiXactCutoff; + cutoffs.FreezeLimit = FreezeLimit; + cutoffs.MultiXactCutoff = MultiXactCutoff; + + pagefrz.freeze_required = true; + pagefrz.FreezePageRelfrozenXid = FreezeLimit; + pagefrz.FreezePageRelminMxid = MultiXactCutoff; + pagefrz.NoFreezePageRelfrozenXid = FreezeLimit; + pagefrz.NoFreezePageRelminMxid = MultiXactCutoff; + + do_freeze = tdeheap_prepare_freeze_tuple(tuple, &cutoffs, + &pagefrz, &frz, &totally_frozen); + + /* + * Note that because this is not a WAL-logged operation, we don't need to + * fill in the offset in the freeze record. + */ + + if (do_freeze) + tdeheap_execute_freeze_tuple(tuple, &frz); + return do_freeze; +} + +/* + * For a given MultiXactId, return the hint bits that should be set in the + * tuple's infomask. + * + * Normally this should be called for a multixact that was just created, and + * so is on our local cache, so the GetMembers call is fast. + */ +static void +GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, + uint16 *new_infomask2) +{ + int nmembers; + MultiXactMember *members; + int i; + uint16 bits = HEAP_XMAX_IS_MULTI; + uint16 bits2 = 0; + bool has_update = false; + LockTupleMode strongest = LockTupleKeyShare; + + /* + * We only use this in multis we just created, so they cannot be values + * pre-pg_upgrade. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, false); + + for (i = 0; i < nmembers; i++) + { + LockTupleMode mode; + + /* + * Remember the strongest lock mode held by any member of the + * multixact. 
+ */ + mode = TUPLOCK_from_mxstatus(members[i].status); + if (mode > strongest) + strongest = mode; + + /* See what other bits we need */ + switch (members[i].status) + { + case MultiXactStatusForKeyShare: + case MultiXactStatusForShare: + case MultiXactStatusForNoKeyUpdate: + break; + + case MultiXactStatusForUpdate: + bits2 |= HEAP_KEYS_UPDATED; + break; + + case MultiXactStatusNoKeyUpdate: + has_update = true; + break; + + case MultiXactStatusUpdate: + bits2 |= HEAP_KEYS_UPDATED; + has_update = true; + break; + } + } + + if (strongest == LockTupleExclusive || + strongest == LockTupleNoKeyExclusive) + bits |= HEAP_XMAX_EXCL_LOCK; + else if (strongest == LockTupleShare) + bits |= HEAP_XMAX_SHR_LOCK; + else if (strongest == LockTupleKeyShare) + bits |= HEAP_XMAX_KEYSHR_LOCK; + + if (!has_update) + bits |= HEAP_XMAX_LOCK_ONLY; + + if (nmembers > 0) + pfree(members); + + *new_infomask = bits; + *new_infomask2 = bits2; +} + +/* + * MultiXactIdGetUpdateXid + * + * Given a multixact Xmax and corresponding infomask, which does not have the + * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating + * transaction. + * + * Caller is expected to check the status of the updating transaction, if + * necessary. + */ +static TransactionId +MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask) +{ + TransactionId update_xact = InvalidTransactionId; + MultiXactMember *members; + int nmembers; + + Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY)); + Assert(t_infomask & HEAP_XMAX_IS_MULTI); + + /* + * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from + * pre-pg_upgrade. + */ + nmembers = GetMultiXactIdMembers(xmax, &members, false, false); + + if (nmembers > 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + /* Ignore lockers */ + if (!ISUPDATE_from_mxstatus(members[i].status)) + continue; + + /* there can be at most one updater */ + Assert(update_xact == InvalidTransactionId); + update_xact = members[i].xid; +#ifndef USE_ASSERT_CHECKING + + /* + * in an assert-enabled build, walk the whole array to ensure + * there's no other updater. + */ + break; +#endif + } + + pfree(members); + } + + return update_xact; +} + +/* + * HeapTupleGetUpdateXid + * As above, but use a HeapTupleHeader + * + * See also HeapTupleHeaderGetUpdateXid, which can be used without previously + * checking the hint bits. + */ +TransactionId +HeapTupleGetUpdateXid(HeapTupleHeader tuple) +{ + return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple), + tuple->t_infomask); +} + +/* + * Does the given multixact conflict with the current transaction grabbing a + * tuple lock of the given strength? + * + * The passed infomask pairs up with the given multixact in the tuple header. + * + * If current_is_member is not NULL, it is set to 'true' if the current + * transaction is a member of the given multixact. 
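+ *
+ * For instance (a hypothetical scenario): if the multi contains a ForShare
+ * locker belonging to another backend that is still in progress and we want
+ * LockTupleExclusive, we return true.  If the only member that would
+ * conflict is our own transaction, we return false but still report it via
+ * *current_is_member.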
+ */ +static bool +DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, + LockTupleMode lockmode, bool *current_is_member) +{ + int nmembers; + MultiXactMember *members; + bool result = false; + LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock; + + if (HEAP_LOCKED_UPGRADED(infomask)) + return false; + + nmembers = GetMultiXactIdMembers(multi, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(infomask)); + if (nmembers >= 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + TransactionId memxid; + LOCKMODE memlockmode; + + if (result && (current_is_member == NULL || *current_is_member)) + break; + + memlockmode = LOCKMODE_from_mxstatus(members[i].status); + + /* ignore members from current xact (but track their presence) */ + memxid = members[i].xid; + if (TransactionIdIsCurrentTransactionId(memxid)) + { + if (current_is_member != NULL) + *current_is_member = true; + continue; + } + else if (result) + continue; + + /* ignore members that don't conflict with the lock we want */ + if (!DoLockModesConflict(memlockmode, wanted)) + continue; + + if (ISUPDATE_from_mxstatus(members[i].status)) + { + /* ignore aborted updaters */ + if (TransactionIdDidAbort(memxid)) + continue; + } + else + { + /* ignore lockers-only that are no longer in progress */ + if (!TransactionIdIsInProgress(memxid)) + continue; + } + + /* + * Whatever remains are either live lockers that conflict with our + * wanted lock, and updaters that are not aborted. Those conflict + * with what we want. Set up to return true, but keep going to + * look for the current transaction among the multixact members, + * if needed. + */ + result = true; + } + pfree(members); + } + + return result; +} + +/* + * Do_MultiXactIdWait + * Actual implementation for the two functions below. + * + * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is + * needed to ensure we only sleep on conflicting members, and the infomask is + * used to optimize multixact access in case it's a lock-only multi); 'nowait' + * indicates whether to use conditional lock acquisition, to allow callers to + * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up + * context information for error messages. 'remaining', if not NULL, receives + * the number of members that are still running, including any (non-aborted) + * subtransactions of our own transaction. + * + * We do this by sleeping on each member using XactLockTableWait. Any + * members that belong to the current backend are *not* waited for, however; + * this would not merely be useless but would lead to Assert failure inside + * XactLockTableWait. By the time this returns, it is certain that all + * transactions *of other backends* that were members of the MultiXactId + * that conflict with the requested status are dead (and no new ones can have + * been added, since it is not legal to add members to an existing + * MultiXactId). + * + * But by the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * Note that in case we return false, the number of remaining members is + * not to be trusted. + */ +static bool +Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, + uint16 infomask, bool nowait, + Relation rel, ItemPointer ctid, XLTW_Oper oper, + int *remaining) +{ + bool result = true; + MultiXactMember *members; + int nmembers; + int remain = 0; + + /* for pre-pg_upgrade tuples, no need to sleep at all */ + nmembers = HEAP_LOCKED_UPGRADED(infomask) ? 
-1 : + GetMultiXactIdMembers(multi, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(infomask)); + + if (nmembers >= 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + TransactionId memxid = members[i].xid; + MultiXactStatus memstatus = members[i].status; + + if (TransactionIdIsCurrentTransactionId(memxid)) + { + remain++; + continue; + } + + if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus), + LOCKMODE_from_mxstatus(status))) + { + if (remaining && TransactionIdIsInProgress(memxid)) + remain++; + continue; + } + + /* + * This member conflicts with our multi, so we have to sleep (or + * return failure, if asked to avoid waiting.) + * + * Note that we don't set up an error context callback ourselves, + * but instead we pass the info down to XactLockTableWait. This + * might seem a bit wasteful because the context is set up and + * tore down for each member of the multixact, but in reality it + * should be barely noticeable, and it avoids duplicate code. + */ + if (nowait) + { + result = ConditionalXactLockTableWait(memxid); + if (!result) + break; + } + else + XactLockTableWait(memxid, rel, ctid, oper); + } + + pfree(members); + } + + if (remaining) + *remaining = remain; + + return result; +} + +/* + * MultiXactIdWait + * Sleep on a MultiXactId. + * + * By the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * We return (in *remaining, if not NULL) the number of members that are still + * running, including any (non-aborted) subtransactions of our own transaction. + */ +static void +MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, + Relation rel, ItemPointer ctid, XLTW_Oper oper, + int *remaining) +{ + (void) Do_MultiXactIdWait(multi, status, infomask, false, + rel, ctid, oper, remaining); +} + +/* + * ConditionalMultiXactIdWait + * As above, but only lock if we can get the lock without blocking. + * + * By the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * If the multixact is now all gone, return true. Returns false if some + * transactions might still be running. + * + * We return (in *remaining, if not NULL) the number of members that are still + * running, including any (non-aborted) subtransactions of our own transaction. + */ +static bool +ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, + uint16 infomask, Relation rel, int *remaining) +{ + return Do_MultiXactIdWait(multi, status, infomask, true, + rel, NULL, XLTW_None, remaining); +} + +/* + * tdeheap_tuple_needs_eventual_freeze + * + * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) + * will eventually require freezing (if tuple isn't removed by pruning first). + */ +bool +tdeheap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) +{ + TransactionId xid; + + /* + * If xmin is a normal transaction ID, this tuple is definitely not + * frozen. + */ + xid = HeapTupleHeaderGetXmin(tuple); + if (TransactionIdIsNormal(xid)) + return true; + + /* + * If xmax is a valid xact or multixact, this tuple is also not frozen. 
+ */ + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactId multi; + + multi = HeapTupleHeaderGetRawXmax(tuple); + if (MultiXactIdIsValid(multi)) + return true; + } + else + { + xid = HeapTupleHeaderGetRawXmax(tuple); + if (TransactionIdIsNormal(xid)) + return true; + } + + if (tuple->t_infomask & HEAP_MOVED) + { + xid = HeapTupleHeaderGetXvac(tuple); + if (TransactionIdIsNormal(xid)) + return true; + } + + return false; +} + +/* + * tdeheap_tuple_should_freeze + * + * Return value indicates if tdeheap_prepare_freeze_tuple sibling function would + * (or should) force freezing of the heap page that contains caller's tuple. + * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing. + * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs. + * + * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output + * arguments help VACUUM track the oldest extant XID/MXID remaining in rel. + * Our working assumption is that caller won't decide to freeze this tuple. + * It's up to caller to only ratchet back its own top-level trackers after the + * point that it fully commits to not freezing the tuple/page in question. + */ +bool +tdeheap_tuple_should_freeze(HeapTupleHeader tuple, + const struct VacuumCutoffs *cutoffs, + TransactionId *NoFreezePageRelfrozenXid, + MultiXactId *NoFreezePageRelminMxid) +{ + TransactionId xid; + MultiXactId multi; + bool freeze = false; + + /* First deal with xmin */ + xid = HeapTupleHeaderGetXmin(tuple); + if (TransactionIdIsNormal(xid)) + { + Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); + if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) + *NoFreezePageRelfrozenXid = xid; + if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) + freeze = true; + } + + /* Now deal with xmax */ + xid = InvalidTransactionId; + multi = InvalidMultiXactId; + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + multi = HeapTupleHeaderGetRawXmax(tuple); + else + xid = HeapTupleHeaderGetRawXmax(tuple); + + if (TransactionIdIsNormal(xid)) + { + Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); + /* xmax is a non-permanent XID */ + if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) + *NoFreezePageRelfrozenXid = xid; + if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) + freeze = true; + } + else if (!MultiXactIdIsValid(multi)) + { + /* xmax is a permanent XID or invalid MultiXactId/XID */ + } + else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask)) + { + /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */ + if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid)) + *NoFreezePageRelminMxid = multi; + /* tdeheap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */ + freeze = true; + } + else + { + /* xmax is a MultiXactId that may have an updater XID */ + MultiXactMember *members; + int nmembers; + + Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi)); + if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid)) + *NoFreezePageRelminMxid = multi; + if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff)) + freeze = true; + + /* need to check whether any member of the mxact is old */ + nmembers = GetMultiXactIdMembers(multi, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + for (int i = 0; i < nmembers; i++) + { + xid = members[i].xid; + Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); + if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) + *NoFreezePageRelfrozenXid = xid; + if (TransactionIdPrecedes(xid, 
cutoffs->FreezeLimit)) + freeze = true; + } + if (nmembers > 0) + pfree(members); + } + + if (tuple->t_infomask & HEAP_MOVED) + { + xid = HeapTupleHeaderGetXvac(tuple); + if (TransactionIdIsNormal(xid)) + { + Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); + if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) + *NoFreezePageRelfrozenXid = xid; + /* tdeheap_prepare_freeze_tuple forces xvac freezing */ + freeze = true; + } + } + + return freeze; +} + +/* + * Maintain snapshotConflictHorizon for caller by ratcheting forward its value + * using any committed XIDs contained in 'tuple', an obsolescent heap tuple + * that caller is in the process of physically removing, e.g. via HOT pruning + * or index deletion. + * + * Caller must initialize its value to InvalidTransactionId, which is + * generally interpreted as "definitely no need for a recovery conflict". + * Final value must reflect all heap tuples that caller will physically remove + * (or remove TID references to) via its ongoing pruning/deletion operation. + * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from + * caller's WAL record) by REDO routine when it replays caller's operation. + */ +void +HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, + TransactionId *snapshotConflictHorizon) +{ + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (tuple->t_infomask & HEAP_MOVED) + { + if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac)) + *snapshotConflictHorizon = xvac; + } + + /* + * Ignore tuples inserted by an aborted transaction or if the tuple was + * updated/deleted by the inserting transaction. + * + * Look for a committed hint bit, or if no xmin bit is set, check clog. + */ + if (HeapTupleHeaderXminCommitted(tuple) || + (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin))) + { + if (xmax != xmin && + TransactionIdFollows(xmax, *snapshotConflictHorizon)) + *snapshotConflictHorizon = xmax; + } +} + +#ifdef USE_PREFETCH +/* + * Helper function for tdeheap_index_delete_tuples. Issues prefetch requests for + * prefetch_count buffers. The prefetch_state keeps track of all the buffers + * we can prefetch, and which have already been prefetched; each call to this + * function picks up where the previous call left off. + * + * Note: we expect the deltids array to be sorted in an order that groups TIDs + * by heap block, with all TIDs for each block appearing together in exactly + * one group. + */ +static void +index_delete_prefetch_buffer(Relation rel, + IndexDeletePrefetchState *prefetch_state, + int prefetch_count) +{ + BlockNumber cur_hblkno = prefetch_state->cur_hblkno; + int count = 0; + int i; + int ndeltids = prefetch_state->ndeltids; + TM_IndexDelete *deltids = prefetch_state->deltids; + + for (i = prefetch_state->next_item; + i < ndeltids && count < prefetch_count; + i++) + { + ItemPointer htid = &deltids[i].tid; + + if (cur_hblkno == InvalidBlockNumber || + ItemPointerGetBlockNumber(htid) != cur_hblkno) + { + cur_hblkno = ItemPointerGetBlockNumber(htid); + PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno); + count++; + } + } + + /* + * Save the prefetch position so that next time we can continue from that + * position. + */ + prefetch_state->next_item = i; + prefetch_state->cur_hblkno = cur_hblkno; +} +#endif + +/* + * Helper function for tdeheap_index_delete_tuples. 
Checks for index corruption + * involving an invalid TID in index AM caller's index page. + * + * This is an ideal place for these checks. The index AM must hold a buffer + * lock on the index page containing the TIDs we examine here, so we don't + * have to worry about concurrent VACUUMs at all. We can be sure that the + * index is corrupt when htid points directly to an LP_UNUSED item or + * heap-only tuple, which is not the case during standard index scans. + */ +static inline void +index_delete_check_htid(TM_IndexDeleteOp *delstate, + Page page, OffsetNumber maxoff, + ItemPointer htid, TM_IndexStatus *istatus) +{ + OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid); + ItemId iid; + + Assert(OffsetNumberIsValid(istatus->idxoffnum)); + + if (unlikely(indexpagehoffnum > maxoff)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(htid), + indexpagehoffnum, + istatus->idxoffnum, delstate->iblknum, + RelationGetRelationName(delstate->irel)))); + + iid = PageGetItemId(page, indexpagehoffnum); + if (unlikely(!ItemIdIsUsed(iid))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(htid), + indexpagehoffnum, + istatus->idxoffnum, delstate->iblknum, + RelationGetRelationName(delstate->irel)))); + + if (ItemIdHasStorage(iid)) + { + HeapTupleHeader htup; + + Assert(ItemIdIsNormal(iid)); + htup = (HeapTupleHeader) PageGetItem(page, iid); + + if (unlikely(HeapTupleHeaderIsHeapOnly(htup))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(htid), + indexpagehoffnum, + istatus->idxoffnum, delstate->iblknum, + RelationGetRelationName(delstate->irel)))); + } +} + +/* + * heapam implementation of tableam's index_delete_tuples interface. + * + * This helper function is called by index AMs during index tuple deletion. + * See tableam header comments for an explanation of the interface implemented + * here and a general theory of operation. Note that each call here is either + * a simple index deletion call, or a bottom-up index deletion call. + * + * It's possible for this to generate a fair amount of I/O, since we may be + * deleting hundreds of tuples from a single index block. To amortize that + * cost to some degree, this uses prefetching and combines repeat accesses to + * the same heap block. 
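+ *
+ * Very roughly, an index AM caller fills in delstate before calling and
+ * consumes both the updated status[] entries and the return value
+ * afterwards, along these lines (simple-deletion case; the exact fields a
+ * real caller sets may differ):
+ *
+ *     delstate.irel = <index relation the TIDs came from>;
+ *     delstate.iblknum = <index block holding those TIDs>;
+ *     delstate.bottomup = false;
+ *     delstate.bottomupfreespace = 0;
+ *     delstate.ndeltids = <number of candidate index tuples>;
+ *     ...fill delstate.deltids[] and delstate.status[]...
+ *     conflict_xid = tdeheap_index_delete_tuples(heapRel, &delstate);
+ *
+ * Entries whose status is left with knowndeletable set may then be removed
+ * from the index page, and conflict_xid is what the index AM uses as the
+ * snapshotConflictHorizon of its own deletion WAL record.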
+ */ +TransactionId +tdeheap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) +{ + /* Initial assumption is that earlier pruning took care of conflict */ + TransactionId snapshotConflictHorizon = InvalidTransactionId; + BlockNumber blkno = InvalidBlockNumber; + Buffer buf = InvalidBuffer; + Page page = NULL; + OffsetNumber maxoff = InvalidOffsetNumber; + TransactionId priorXmax; +#ifdef USE_PREFETCH + IndexDeletePrefetchState prefetch_state; + int prefetch_distance; +#endif + SnapshotData SnapshotNonVacuumable; + int finalndeltids = 0, + nblocksaccessed = 0; + + /* State that's only used in bottom-up index deletion case */ + int nblocksfavorable = 0; + int curtargetfreespace = delstate->bottomupfreespace, + lastfreespace = 0, + actualfreespace = 0; + bool bottomup_final_block = false; + + InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel)); + + /* Sort caller's deltids array by TID for further processing */ + index_delete_sort(delstate); + + /* + * Bottom-up case: resort deltids array in an order attuned to where the + * greatest number of promising TIDs are to be found, and determine how + * many blocks from the start of sorted array should be considered + * favorable. This will also shrink the deltids array in order to + * eliminate completely unfavorable blocks up front. + */ + if (delstate->bottomup) + nblocksfavorable = bottomup_sort_and_shrink(delstate); + +#ifdef USE_PREFETCH + /* Initialize prefetch state. */ + prefetch_state.cur_hblkno = InvalidBlockNumber; + prefetch_state.next_item = 0; + prefetch_state.ndeltids = delstate->ndeltids; + prefetch_state.deltids = delstate->deltids; + + /* + * Determine the prefetch distance that we will attempt to maintain. + * + * Since the caller holds a buffer lock somewhere in rel, we'd better make + * sure that isn't a catalog relation before we call code that does + * syscache lookups, to avoid risk of deadlock. + */ + if (IsCatalogRelation(rel)) + prefetch_distance = maintenance_io_concurrency; + else + prefetch_distance = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + + /* Cap initial prefetch distance for bottom-up deletion caller */ + if (delstate->bottomup) + { + Assert(nblocksfavorable >= 1); + Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS); + prefetch_distance = Min(prefetch_distance, nblocksfavorable); + } + + /* Start prefetching. */ + index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance); +#endif + + /* Iterate over deltids, determine which to delete, check their horizon */ + Assert(delstate->ndeltids > 0); + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexDelete *ideltid = &delstate->deltids[i]; + TM_IndexStatus *istatus = delstate->status + ideltid->id; + ItemPointer htid = &ideltid->tid; + OffsetNumber offnum; + + /* + * Read buffer, and perform required extra steps each time a new block + * is encountered. Avoid refetching if it's the same block as the one + * from the last htid. + */ + if (blkno == InvalidBlockNumber || + ItemPointerGetBlockNumber(htid) != blkno) + { + /* + * Consider giving up early for bottom-up index deletion caller + * first. (Only prefetch next-next block afterwards, when it + * becomes clear that we're at least going to access the next + * block in line.) + * + * Sometimes the first block frees so much space for bottom-up + * caller that the deletion process can end without accessing any + * more blocks. It is usually necessary to access 2 or 3 blocks + * per bottom-up deletion operation, though. 
+ */ + if (delstate->bottomup) + { + /* + * We often allow caller to delete a few additional items + * whose entries we reached after the point that space target + * from caller was satisfied. The cost of accessing the page + * was already paid at that point, so it made sense to finish + * it off. When that happened, we finalize everything here + * (by finishing off the whole bottom-up deletion operation + * without needlessly paying the cost of accessing any more + * blocks). + */ + if (bottomup_final_block) + break; + + /* + * Give up when we didn't enable our caller to free any + * additional space as a result of processing the page that we + * just finished up with. This rule is the main way in which + * we keep the cost of bottom-up deletion under control. + */ + if (nblocksaccessed >= 1 && actualfreespace == lastfreespace) + break; + lastfreespace = actualfreespace; /* for next time */ + + /* + * Deletion operation (which is bottom-up) will definitely + * access the next block in line. Prepare for that now. + * + * Decay target free space so that we don't hang on for too + * long with a marginal case. (Space target is only truly + * helpful when it allows us to recognize that we don't need + * to access more than 1 or 2 blocks to satisfy caller due to + * agreeable workload characteristics.) + * + * We are a bit more patient when we encounter contiguous + * blocks, though: these are treated as favorable blocks. The + * decay process is only applied when the next block in line + * is not a favorable/contiguous block. This is not an + * exception to the general rule; we still insist on finding + * at least one deletable item per block accessed. See + * bottomup_nblocksfavorable() for full details of the theory + * behind favorable blocks and heap block locality in general. + * + * Note: The first block in line is always treated as a + * favorable block, so the earliest possible point that the + * decay can be applied is just before we access the second + * block in line. The Assert() verifies this for us. + */ + Assert(nblocksaccessed > 0 || nblocksfavorable > 0); + if (nblocksfavorable > 0) + nblocksfavorable--; + else + curtargetfreespace /= 2; + } + + /* release old buffer */ + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); + + blkno = ItemPointerGetBlockNumber(htid); + buf = ReadBuffer(rel, blkno); + nblocksaccessed++; + Assert(!delstate->bottomup || + nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS); + +#ifdef USE_PREFETCH + + /* + * To maintain the prefetch distance, prefetch one more page for + * each page we read. + */ + index_delete_prefetch_buffer(rel, &prefetch_state, 1); +#endif + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + } + + /* + * In passing, detect index corruption involving an index page with a + * TID that points to a location in the heap that couldn't possibly be + * correct. We only do this with actual TIDs from caller's index page + * (not items reached by traversing through a HOT chain). + */ + index_delete_check_htid(delstate, page, maxoff, htid, istatus); + + if (istatus->knowndeletable) + Assert(!delstate->bottomup && !istatus->promising); + else + { + ItemPointerData tmp = *htid; + HeapTupleData heapTuple; + + /* Are any tuples from this HOT chain non-vacuumable? 
*/ + if (tdeheap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable, + &heapTuple, NULL, true)) + continue; /* can't delete entry */ + + /* Caller will delete, since whole HOT chain is vacuumable */ + istatus->knowndeletable = true; + + /* Maintain index free space info for bottom-up deletion case */ + if (delstate->bottomup) + { + Assert(istatus->freespace > 0); + actualfreespace += istatus->freespace; + if (actualfreespace >= curtargetfreespace) + bottomup_final_block = true; + } + } + + /* + * Maintain snapshotConflictHorizon value for deletion operation as a + * whole by advancing current value using heap tuple headers. This is + * loosely based on the logic for pruning a HOT chain. + */ + offnum = ItemPointerGetOffsetNumber(htid); + priorXmax = InvalidTransactionId; /* cannot check first XMIN */ + for (;;) + { + ItemId lp; + HeapTupleHeader htup; + + /* Sanity check (pure paranoia) */ + if (offnum < FirstOffsetNumber) + break; + + /* + * An offset past the end of page's line pointer array is possible + * when the array was truncated + */ + if (offnum > maxoff) + break; + + lp = PageGetItemId(page, offnum); + if (ItemIdIsRedirected(lp)) + { + offnum = ItemIdGetRedirect(lp); + continue; + } + + /* + * We'll often encounter LP_DEAD line pointers (especially with an + * entry marked knowndeletable by our caller up front). No heap + * tuple headers get examined for an htid that leads us to an + * LP_DEAD item. This is okay because the earlier pruning + * operation that made the line pointer LP_DEAD in the first place + * must have considered the original tuple header as part of + * generating its own snapshotConflictHorizon value. + * + * Relying on XLOG_HEAP2_PRUNE records like this is the same + * strategy that index vacuuming uses in all cases. Index VACUUM + * WAL records don't even have a snapshotConflictHorizon field of + * their own for this reason. + */ + if (!ItemIdIsNormal(lp)) + break; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Check the tuple XMIN against prior XMAX, if any + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + break; + + HeapTupleHeaderAdvanceConflictHorizon(htup, + &snapshotConflictHorizon); + + /* + * If the tuple is not HOT-updated, then we are at the end of this + * HOT-chain. No need to visit later tuples from the same update + * chain (they get their own index entries) -- just move on to + * next htid from index AM caller. + */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* Advance to next HOT chain member */ + Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno); + offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + + /* Enable further/final shrinking of deltids for caller */ + finalndeltids = i + 1; + } + + UnlockReleaseBuffer(buf); + + /* + * Shrink deltids array to exclude non-deletable entries at the end. This + * is not just a minor optimization. Final deltids array size might be + * zero for a bottom-up caller. Index AM is explicitly allowed to rely on + * ndeltids being zero in all cases with zero total deletable entries. 
+ */ + Assert(finalndeltids > 0 || delstate->bottomup); + delstate->ndeltids = finalndeltids; + + return snapshotConflictHorizon; +} + +/* + * Specialized inlineable comparison function for index_delete_sort() + */ +static inline int +index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2) +{ + ItemPointer tid1 = &deltid1->tid; + ItemPointer tid2 = &deltid2->tid; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(tid1); + BlockNumber blk2 = ItemPointerGetBlockNumber(tid2); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + Assert(false); + + return 0; +} + +/* + * Sort deltids array from delstate by TID. This prepares it for further + * processing by tdeheap_index_delete_tuples(). + * + * This operation becomes a noticeable consumer of CPU cycles with some + * workloads, so we go to the trouble of specialization/micro optimization. + * We use shellsort for this because it's easy to specialize, compiles to + * relatively few instructions, and is adaptive to presorted inputs/subsets + * (which are typical here). + */ +static void +index_delete_sort(TM_IndexDeleteOp *delstate) +{ + TM_IndexDelete *deltids = delstate->deltids; + int ndeltids = delstate->ndeltids; + int low = 0; + + /* + * Shellsort gap sequence (taken from Sedgewick-Incerpi paper). + * + * This implementation is fast with array sizes up to ~4500. This covers + * all supported BLCKSZ values. + */ + const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1}; + + /* Think carefully before changing anything here -- keep swaps cheap */ + StaticAssertDecl(sizeof(TM_IndexDelete) <= 8, + "element size exceeds 8 bytes"); + + for (int g = 0; g < lengthof(gaps); g++) + { + for (int hi = gaps[g], i = low + hi; i < ndeltids; i++) + { + TM_IndexDelete d = deltids[i]; + int j = i; + + while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0) + { + deltids[j] = deltids[j - hi]; + j -= hi; + } + deltids[j] = d; + } + } +} + +/* + * Returns how many blocks should be considered favorable/contiguous for a + * bottom-up index deletion pass. This is a number of heap blocks that starts + * from and includes the first block in line. + * + * There is always at least one favorable block during bottom-up index + * deletion. In the worst case (i.e. with totally random heap blocks) the + * first block in line (the only favorable block) can be thought of as a + * degenerate array of contiguous blocks that consists of a single block. + * tdeheap_index_delete_tuples() will expect this. + * + * Caller passes blockgroups, a description of the final order that deltids + * will be sorted in for tdeheap_index_delete_tuples() bottom-up index deletion + * processing. Note that deltids need not actually be sorted just yet (caller + * only passes deltids to us so that we can interpret blockgroups). + * + * You might guess that the existence of contiguous blocks cannot matter much, + * since in general the main factor that determines which blocks we visit is + * the number of promising TIDs, which is a fixed hint from the index AM. + * We're not really targeting the general case, though -- the actual goal is + * to adapt our behavior to a wide variety of naturally occurring conditions. + * The effects of most of the heuristics we apply are only noticeable in the + * aggregate, over time and across many _related_ bottom-up index deletion + * passes. 
+ * + * Deeming certain blocks favorable allows heapam to recognize and adapt to + * workloads where heap blocks visited during bottom-up index deletion can be + * accessed contiguously, in the sense that each newly visited block is the + * neighbor of the block that bottom-up deletion just finished processing (or + * close enough to it). It will likely be cheaper to access more favorable + * blocks sooner rather than later (e.g. in this pass, not across a series of + * related bottom-up passes). Either way it is probably only a matter of time + * (or a matter of further correlated version churn) before all blocks that + * appear together as a single large batch of favorable blocks get accessed by + * _some_ bottom-up pass. Large batches of favorable blocks tend to either + * appear almost constantly or not even once (it all depends on per-index + * workload characteristics). + * + * Note that the blockgroups sort order applies a power-of-two bucketing + * scheme that creates opportunities for contiguous groups of blocks to get + * batched together, at least with workloads that are naturally amenable to + * being driven by heap block locality. This doesn't just enhance the spatial + * locality of bottom-up heap block processing in the obvious way. It also + * enables temporal locality of access, since sorting by heap block number + * naturally tends to make the bottom-up processing order deterministic. + * + * Consider the following example to get a sense of how temporal locality + * might matter: There is a heap relation with several indexes, each of which + * is low to medium cardinality. It is subject to constant non-HOT updates. + * The updates are skewed (in one part of the primary key, perhaps). None of + * the indexes are logically modified by the UPDATE statements (if they were + * then bottom-up index deletion would not be triggered in the first place). + * Naturally, each new round of index tuples (for each heap tuple that gets a + * tdeheap_update() call) will have the same heap TID in each and every index. + * Since these indexes are low cardinality and never get logically modified, + * heapam processing during bottom-up deletion passes will access heap blocks + * in approximately sequential order. Temporal locality of access occurs due + * to bottom-up deletion passes behaving very similarly across each of the + * indexes at any given moment. This keeps the number of buffer misses needed + * to visit heap blocks to a minimum. + */ +static int +bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, + TM_IndexDelete *deltids) +{ + int64 lastblock = -1; + int nblocksfavorable = 0; + + Assert(nblockgroups >= 1); + Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS); + + /* + * We tolerate heap blocks that will be accessed only slightly out of + * physical order. Small blips occur when a pair of almost-contiguous + * blocks happen to fall into different buckets (perhaps due only to a + * small difference in npromisingtids that the bucketing scheme didn't + * quite manage to ignore). We effectively ignore these blips by applying + * a small tolerance. The precise tolerance we use is a little arbitrary, + * but it works well enough in practice. 
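A rough standalone sketch of the tolerance check described here, mirroring the loop that follows (plain C; the tolerance value below is a placeholder, not the real BOTTOMUP_TOLERANCE_NBLOCKS defined earlier in this file):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t BlockNumber;

/* Placeholder tolerance, standing in for BOTTOMUP_TOLERANCE_NBLOCKS */
#define TOLERANCE_NBLOCKS 5

/*
 * Count how many leading blocks form an (almost) contiguous run, stopping
 * at the first block that jumps more than the tolerance away from its
 * predecessor -- the same idea as the loop in bottomup_nblocksfavorable().
 */
static int
count_favorable(const BlockNumber *blocks, int nblocks)
{
    int64_t lastblock = -1;
    int     nfavorable = 0;

    for (int i = 0; i < nblocks; i++)
    {
        int64_t block = (int64_t) blocks[i];

        if (lastblock != -1 &&
            (block < lastblock - TOLERANCE_NBLOCKS ||
             block > lastblock + TOLERANCE_NBLOCKS))
            break;

        nfavorable++;
        lastblock = block;
    }

    return nfavorable;
}

int
main(void)
{
    /* First block in line, two near neighbors, then a far jump */
    BlockNumber blocks[] = {1000, 1002, 1004, 5000, 5001};

    printf("favorable blocks: %d\n", count_favorable(blocks, 5));  /* 3 */
    return 0;
}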
+ */ + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + TM_IndexDelete *firstdtid = deltids + group->ifirsttid; + BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid); + + if (lastblock != -1 && + ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS || + (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS)) + break; + + nblocksfavorable++; + lastblock = block; + } + + /* Always indicate that there is at least 1 favorable block */ + Assert(nblocksfavorable >= 1); + + return nblocksfavorable; +} + +/* + * qsort comparison function for bottomup_sort_and_shrink() + */ +static int +bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2) +{ + const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1; + const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2; + + /* + * Most significant field is npromisingtids (which we invert the order of + * so as to sort in desc order). + * + * Caller should have already normalized npromisingtids fields into + * power-of-two values (buckets). + */ + if (group1->npromisingtids > group2->npromisingtids) + return -1; + if (group1->npromisingtids < group2->npromisingtids) + return 1; + + /* + * Tiebreak: desc ntids sort order. + * + * We cannot expect power-of-two values for ntids fields. We should + * behave as if they were already rounded up for us instead. + */ + if (group1->ntids != group2->ntids) + { + uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids); + uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids); + + if (ntids1 > ntids2) + return -1; + if (ntids1 < ntids2) + return 1; + } + + /* + * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for + * block in deltids array) order. + * + * This is equivalent to sorting in ascending heap block number order + * (among otherwise equal subsets of the array). This approach allows us + * to avoid accessing the out-of-line TID. (We rely on the assumption + * that the deltids array was sorted in ascending heap TID order when + * these offsets to the first TID from each heap block group were formed.) + */ + if (group1->ifirsttid > group2->ifirsttid) + return 1; + if (group1->ifirsttid < group2->ifirsttid) + return -1; + + pg_unreachable(); + + return 0; +} + +/* + * tdeheap_index_delete_tuples() helper function for bottom-up deletion callers. + * + * Sorts deltids array in the order needed for useful processing by bottom-up + * deletion. The array should already be sorted in TID order when we're + * called. The sort process groups heap TIDs from deltids into heap block + * groupings. Earlier/more-promising groups/blocks are usually those that are + * known to have the most "promising" TIDs. + * + * Sets new size of deltids array (ndeltids) in state. deltids will only have + * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we + * return. This often means that deltids will be shrunk to a small fraction + * of its original size (we eliminate many heap blocks from consideration for + * caller up front). + * + * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable() + * for a definition and full details. 
+ */ +static int +bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) +{ + IndexDeleteCounts *blockgroups; + TM_IndexDelete *reordereddeltids; + BlockNumber curblock = InvalidBlockNumber; + int nblockgroups = 0; + int ncopied = 0; + int nblocksfavorable = 0; + + Assert(delstate->bottomup); + Assert(delstate->ndeltids > 0); + + /* Calculate per-heap-block count of TIDs */ + blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids); + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexDelete *ideltid = &delstate->deltids[i]; + TM_IndexStatus *istatus = delstate->status + ideltid->id; + ItemPointer htid = &ideltid->tid; + bool promising = istatus->promising; + + if (curblock != ItemPointerGetBlockNumber(htid)) + { + /* New block group */ + nblockgroups++; + + Assert(curblock < ItemPointerGetBlockNumber(htid) || + !BlockNumberIsValid(curblock)); + + curblock = ItemPointerGetBlockNumber(htid); + blockgroups[nblockgroups - 1].ifirsttid = i; + blockgroups[nblockgroups - 1].ntids = 1; + blockgroups[nblockgroups - 1].npromisingtids = 0; + } + else + { + blockgroups[nblockgroups - 1].ntids++; + } + + if (promising) + blockgroups[nblockgroups - 1].npromisingtids++; + } + + /* + * We're about ready to sort block groups to determine the optimal order + * for visiting heap blocks. But before we do, round the number of + * promising tuples for each block group up to the next power-of-two, + * unless it is very low (less than 4), in which case we round up to 4. + * npromisingtids is far too noisy to trust when choosing between a pair + * of block groups that both have very low values. + * + * This scheme divides heap blocks/block groups into buckets. Each bucket + * contains blocks that have _approximately_ the same number of promising + * TIDs as each other. The goal is to ignore relatively small differences + * in the total number of promising entries, so that the whole process can + * give a little weight to heapam factors (like heap block locality) + * instead. This isn't a trade-off, really -- we have nothing to lose. It + * would be foolish to interpret small differences in npromisingtids + * values as anything more than noise. + * + * We tiebreak on nhtids when sorting block group subsets that have the + * same npromisingtids, but this has the same issues as npromisingtids, + * and so nhtids is subject to the same power-of-two bucketing scheme. The + * only reason that we don't fix nhtids in the same way here too is that + * we'll need accurate nhtids values after the sort. We handle nhtids + * bucketization dynamically instead (in the sort comparator). + * + * See bottomup_nblocksfavorable() for a full explanation of when and how + * heap locality/favorable blocks can significantly influence when and how + * heap blocks are accessed. 
+ */ + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + + /* Better off falling back on nhtids with low npromisingtids */ + if (group->npromisingtids <= 4) + group->npromisingtids = 4; + else + group->npromisingtids = + pg_nextpower2_32((uint32) group->npromisingtids); + } + + /* Sort groups and rearrange caller's deltids array */ + qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts), + bottomup_sort_and_shrink_cmp); + reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete)); + + nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups); + /* Determine number of favorable blocks at the start of final deltids */ + nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups, + delstate->deltids); + + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid; + + memcpy(reordereddeltids + ncopied, firstdtid, + sizeof(TM_IndexDelete) * group->ntids); + ncopied += group->ntids; + } + + /* Copy final grouped and sorted TIDs back into start of caller's array */ + memcpy(delstate->deltids, reordereddeltids, + sizeof(TM_IndexDelete) * ncopied); + delstate->ndeltids = ncopied; + + pfree(reordereddeltids); + pfree(blockgroups); + + return nblocksfavorable; +} + +/* + * Perform XLogInsert for a heap-visible operation. 'block' is the block + * being marked all-visible, and vm_buffer is the buffer containing the + * corresponding visibility map block. Both should have already been modified + * and dirtied. + * + * snapshotConflictHorizon comes from the largest xmin on the page being + * marked all-visible. REDO routine uses it to generate recovery conflicts. + * + * If checksums or wal_log_hints are enabled, we may also generate a full-page + * image of tdeheap_buffer. Otherwise, we optimize away the FPI (by specifying + * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not* + * update the heap page's LSN. + */ +XLogRecPtr +log_tdeheap_visible(Relation rel, Buffer tdeheap_buffer, Buffer vm_buffer, + TransactionId snapshotConflictHorizon, uint8 vmflags) +{ + xl_tdeheap_visible xlrec; + XLogRecPtr recptr; + uint8 flags; + + Assert(BufferIsValid (tdeheap_buffer)); + Assert(BufferIsValid(vm_buffer)); + + xlrec.snapshotConflictHorizon = snapshotConflictHorizon; + xlrec.flags = vmflags; + if (RelationIsAccessibleInLogicalDecoding(rel)) + xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapVisible); + + XLogRegisterBuffer(0, vm_buffer, 0); + + flags = REGBUF_STANDARD; + if (!XLogHintBitIsNeeded()) + flags |= REGBUF_NO_IMAGE; + XLogRegisterBuffer(1, tdeheap_buffer, flags); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); + + return recptr; +} + +/* + * Perform XLogInsert for a heap-update operation. Caller must already + * have modified the buffer(s) and marked them dirty. 
+ */ +static XLogRecPtr +log_tdeheap_update(Relation reln, Buffer oldbuf, + Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, + HeapTuple old_key_tuple, + bool all_visible_cleared, bool new_all_visible_cleared) +{ + xl_tdeheap_update xlrec; + xl_tdeheap_header xlhdr; + xl_tdeheap_header xlhdr_idx; + uint8 info; + uint16 prefix_suffix[2]; + uint16 prefixlen = 0, + suffixlen = 0; + XLogRecPtr recptr; + Page page = BufferGetPage(newbuf); + bool need_tuple_data = RelationIsLogicallyLogged(reln); + bool init; + int bufflags; + + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); + + XLogBeginInsert(); + + if (HeapTupleIsHeapOnly(newtup)) + info = XLOG_HEAP_HOT_UPDATE; + else + info = XLOG_HEAP_UPDATE; + + /* + * If the old and new tuple are on the same page, we only need to log the + * parts of the new tuple that were changed. That saves on the amount of + * WAL we need to write. Currently, we just count any unchanged bytes in + * the beginning and end of the tuple. That's quick to check, and + * perfectly covers the common case that only one field is updated. + * + * We could do this even if the old and new tuple are on different pages, + * but only if we don't make a full-page image of the old page, which is + * difficult to know in advance. Also, if the old tuple is corrupt for + * some reason, it would allow the corruption to propagate the new page, + * so it seems best to avoid. Under the general assumption that most + * updates tend to create the new tuple version on the same page, there + * isn't much to be gained by doing this across pages anyway. + * + * Skip this if we're taking a full-page image of the new page, as we + * don't include the new tuple in the WAL record in that case. Also + * disable if wal_level='logical', as logical decoding needs to be able to + * read the new tuple in whole from the WAL record alone. + */ + if (oldbuf == newbuf && !need_tuple_data && + !XLogCheckBufferNeedsBackup(newbuf)) + { + char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff; + char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff; + int oldlen = oldtup->t_len - oldtup->t_data->t_hoff; + int newlen = newtup->t_len - newtup->t_data->t_hoff; + + /* Check for common prefix between old and new tuple */ + for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++) + { + if (newp[prefixlen] != oldp[prefixlen]) + break; + } + + /* + * Storing the length of the prefix takes 2 bytes, so we need to save + * at least 3 bytes or there's no point. + */ + if (prefixlen < 3) + prefixlen = 0; + + /* Same for suffix */ + for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++) + { + if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1]) + break; + } + if (suffixlen < 3) + suffixlen = 0; + } + + /* Prepare main WAL data chain */ + xlrec.flags = 0; + if (all_visible_cleared) + xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED; + if (new_all_visible_cleared) + xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED; + if (prefixlen > 0) + xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD; + if (suffixlen > 0) + xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD; + if (need_tuple_data) + { + xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE; + if (old_key_tuple) + { + if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE; + else + xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; + } + } + + /* If new tuple is the single and first tuple on page... 
*/ + if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && + PageGetMaxOffsetNumber(page) == FirstOffsetNumber) + { + info |= XLOG_HEAP_INIT_PAGE; + init = true; + } + else + init = false; + + /* Prepare WAL data for the old page */ + xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self); + xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); + xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, + oldtup->t_data->t_infomask2); + + /* Prepare WAL data for the new page */ + xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); + xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + + bufflags = REGBUF_STANDARD; + if (init) + bufflags |= REGBUF_WILL_INIT; + if (need_tuple_data) + bufflags |= REGBUF_KEEP_DATA; + + XLogRegisterBuffer(0, newbuf, bufflags); + if (oldbuf != newbuf) + XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); + + XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate); + + /* + * Prepare WAL data for the new tuple. + */ + if (prefixlen > 0 || suffixlen > 0) + { + if (prefixlen > 0 && suffixlen > 0) + { + prefix_suffix[0] = prefixlen; + prefix_suffix[1] = suffixlen; + XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2); + } + else if (prefixlen > 0) + { + XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16)); + } + else + { + XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16)); + } + } + + xlhdr.t_infomask2 = newtup->t_data->t_infomask2; + xlhdr.t_infomask = newtup->t_data->t_infomask; + xlhdr.t_hoff = newtup->t_data->t_hoff; + Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); + + /* + * PG73FORMAT: write bitmap [+ padding] [+ oid] + data + * + * The 'data' doesn't include the common prefix or suffix. + */ + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + if (prefixlen == 0) + { + XLogRegisterBufData(0, + ((char *) newtup->t_data) + SizeofHeapTupleHeader, + newtup->t_len - SizeofHeapTupleHeader - suffixlen); + } + else + { + /* + * Have to write the null bitmap and data after the common prefix as + * two separate rdata entries. + */ + /* bitmap [+ padding] [+ oid] */ + if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0) + { + XLogRegisterBufData(0, + ((char *) newtup->t_data) + SizeofHeapTupleHeader, + newtup->t_data->t_hoff - SizeofHeapTupleHeader); + } + + /* data after common prefix */ + XLogRegisterBufData(0, + ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen, + newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen); + } + + /* We need to log a tuple identity */ + if (need_tuple_data && old_key_tuple) + { + /* don't really need this, but its more comfy to decode */ + xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; + xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + + XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); + + /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ + XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, + old_key_tuple->t_len - SizeofHeapTupleHeader); + } + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP_ID, info); + + return recptr; +} + +/* + * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record + * + * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog + * tuples. 
+ */ +static XLogRecPtr +log_tdeheap_new_cid(Relation relation, HeapTuple tup) +{ + xl_tdeheap_new_cid xlrec; + + XLogRecPtr recptr; + HeapTupleHeader hdr = tup->t_data; + + Assert(ItemPointerIsValid(&tup->t_self)); + Assert(tup->t_tableOid != InvalidOid); + + xlrec.top_xid = GetTopTransactionId(); + xlrec.target_locator = relation->rd_locator; + xlrec.target_tid = tup->t_self; + + /* + * If the tuple got inserted & deleted in the same TX we definitely have a + * combo CID, set cmin and cmax. + */ + if (hdr->t_infomask & HEAP_COMBOCID) + { + Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID)); + Assert(!HeapTupleHeaderXminInvalid(hdr)); + xlrec.cmin = HeapTupleHeaderGetCmin(hdr); + xlrec.cmax = HeapTupleHeaderGetCmax(hdr); + xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr); + } + /* No combo CID, so only cmin or cmax can be set by this TX */ + else + { + /* + * Tuple inserted. + * + * We need to check for LOCK ONLY because multixacts might be + * transferred to the new tuple in case of FOR KEY SHARE updates in + * which case there will be an xmax, although the tuple just got + * inserted. + */ + if (hdr->t_infomask & HEAP_XMAX_INVALID || + HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask)) + { + xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr); + xlrec.cmax = InvalidCommandId; + } + /* Tuple from a different tx updated or deleted. */ + else + { + xlrec.cmin = InvalidCommandId; + xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr); + } + xlrec.combocid = InvalidCommandId; + } + + /* + * Note that we don't need to register the buffer here, because this + * operation does not modify the page. The insert/update/delete that + * called us certainly did, but that's WAL-logged separately. + */ + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid); + + /* will be looked at irrespective of origin */ + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID); + + return recptr; +} + +/* + * Build a heap tuple representing the configured REPLICA IDENTITY to represent + * the old tuple in an UPDATE or DELETE. + * + * Returns NULL if there's no need to log an identity or if there's no suitable + * key defined. + * + * Pass key_required true if any replica identity columns changed value, or if + * any of them have any external data. Delete must always pass true. + * + * *copy is set to true if the returned tuple is a modified copy rather than + * the same tuple that was passed in. + */ +static HeapTuple +ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, + bool *copy) +{ + TupleDesc desc = RelationGetDescr(relation); + char replident = relation->rd_rel->relreplident; + Bitmapset *idattrs; + HeapTuple key_tuple; + bool nulls[MaxHeapAttributeNumber]; + Datum values[MaxHeapAttributeNumber]; + + *copy = false; + + if (!RelationIsLogicallyLogged(relation)) + return NULL; + + if (replident == REPLICA_IDENTITY_NOTHING) + return NULL; + + if (replident == REPLICA_IDENTITY_FULL) + { + /* + * When logging the entire old tuple, it very well could contain + * toasted columns. If so, force them to be inlined. + */ + if (HeapTupleHasExternal(tp)) + { + *copy = true; + tp = toast_flatten_tuple(tp, desc); + } + return tp; + } + + /* if the key isn't required and we're only logging the key, we're done */ + if (!key_required) + return NULL; + + /* find out the replica identity columns */ + idattrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + + /* + * If there's no defined replica identity columns, treat as !key_required. 
+ * (This case should not be reachable from tdeheap_update, since that should + * calculate key_required accurately. But tdeheap_delete just passes + * constant true for key_required, so we can hit this case in deletes.) + */ + if (bms_is_empty(idattrs)) + return NULL; + + /* + * Construct a new tuple containing only the replica identity columns, + * with nulls elsewhere. While we're at it, assert that the replica + * identity columns aren't null. + */ + tdeheap_deform_tuple(tp, desc, values, nulls); + + for (int i = 0; i < desc->natts; i++) + { + if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, + idattrs)) + Assert(!nulls[i]); + else + nulls[i] = true; + } + + key_tuple = tdeheap_form_tuple(desc, values, nulls); + *copy = true; + + bms_free(idattrs); + + /* + * If the tuple, which by here only contains indexed columns, still has + * toasted columns, force them to be inlined. This is somewhat unlikely + * since there's limits on the size of indexed columns, so we don't + * duplicate toast_flatten_tuple()s functionality in the above loop over + * the indexed columns, even if it would be more efficient. + */ + if (HeapTupleHasExternal(key_tuple)) + { + HeapTuple oldtup = key_tuple; + + key_tuple = toast_flatten_tuple(oldtup, desc); + tdeheap_freetuple(oldtup); + } + + return key_tuple; +} + +/* + * Handles XLOG_HEAP2_PRUNE record type. + * + * Acquires a full cleanup lock. + */ +static void +tdeheap_xlog_prune(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_prune *xlrec = (xl_tdeheap_prune *) XLogRecGetData(record); + Buffer buffer; + RelFileLocator rlocator; + BlockNumber blkno; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + /* + * We're about to remove tuples. In Hot Standby mode, ensure that there's + * no queries running for which the removed tuples are still visible. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, + xlrec->isCatalogRel, + rlocator); + + /* + * If we have a full-page image, restore it (using a cleanup lock) and + * we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *end; + OffsetNumber *redirected; + OffsetNumber *nowdead; + OffsetNumber *nowunused; + int nredirected; + int ndead; + int nunused; + Size datalen; + + redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + nredirected = xlrec->nredirected; + ndead = xlrec->ndead; + end = (OffsetNumber *) ((char *) redirected + datalen); + nowdead = redirected + (nredirected * 2); + nowunused = nowdead + ndead; + nunused = (end - nowunused); + Assert(nunused >= 0); + + /* Update all line pointers per the record, and repair fragmentation */ + tdeheap_page_prune_execute(buffer, + redirected, nredirected, + nowdead, ndead, + nowunused, nunused); + + /* + * Note: we don't worry about updating the page's prunability hints. + * At worst this will cause an extra prune cycle to occur soon. + */ + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + { + Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + + UnlockReleaseBuffer(buffer); + + /* + * After pruning records from a page, it's useful to update the FSM + * about it, as it may cause the page become target for insertions + * later even if vacuum decides not to visit it (which is possible if + * gets marked all-visible.) 
+ * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); + } +} + +/* + * Handles XLOG_HEAP2_VACUUM record type. + * + * Acquires an ordinary exclusive lock only. + */ +static void +tdeheap_xlog_vacuum(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_vacuum *xlrec = (xl_tdeheap_vacuum *) XLogRecGetData(record); + Buffer buffer; + BlockNumber blkno; + XLogRedoAction action; + + /* + * If we have a full-page image, restore it (without using a cleanup lock) + * and we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *nowunused; + Size datalen; + OffsetNumber *offnum; + + nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + /* Shouldn't be a record unless there's something to do */ + Assert(xlrec->nunused > 0); + + /* Update all now-unused line pointers */ + offnum = nowunused; + for (int i = 0; i < xlrec->nunused; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp)); + ItemIdSetUnused(lp); + } + + /* Attempt to truncate line pointer array now */ + PageTruncateLinePointerArray(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + { + Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + RelFileLocator rlocator; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + UnlockReleaseBuffer(buffer); + + /* + * After vacuuming LP_DEAD items from a page, it's useful to update + * the FSM about it, as it may cause the page become target for + * insertions later even if vacuum decides not to visit it (which is + * possible if gets marked all-visible.) + * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); + } +} + +/* + * Replay XLOG_HEAP2_VISIBLE record. + * + * The critical integrity requirement here is that we must never end up with + * a situation where the visibility map bit is set, and the page-level + * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent + * page modification would fail to clear the visibility map bit. + */ +static void +tdeheap_xlog_visible(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_visible *xlrec = (xl_tdeheap_visible *) XLogRecGetData(record); + Buffer vmbuffer = InvalidBuffer; + Buffer buffer; + Page page; + RelFileLocator rlocator; + BlockNumber blkno; + XLogRedoAction action; + + Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags); + + XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno); + + /* + * If there are any Hot Standby transactions running that have an xmin + * horizon old enough that this page isn't all-visible for them, they + * might incorrectly decide that an index-only scan can skip a heap fetch. + * + * NB: It might be better to throw some kind of "soft" conflict here that + * forces any index-only scan that is in flight to perform heap fetches, + * rather than killing the transaction outright. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, + xlrec->flags & VISIBILITYMAP_XLOG_CATALOG_REL, + rlocator); + + /* + * Read the heap page, if it still exists. 
If the heap file has dropped or + * truncated later in recovery, we don't need to update the page, but we'd + * better still update the visibility map. + */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + /* + * We don't bump the LSN of the heap page when setting the visibility + * map bit (unless checksums or wal_hint_bits is enabled, in which + * case we must). This exposes us to torn page hazards, but since + * we're not inspecting the existing page contents in any way, we + * don't care. + */ + page = BufferGetPage(buffer); + + PageSetAllVisible(page); + + if (XLogHintBitIsNeeded()) + PageSetLSN(page, lsn); + + MarkBufferDirty(buffer); + } + else if (action == BLK_RESTORED) + { + /* + * If heap block was backed up, we already restored it and there's + * nothing more to do. (This can only happen with checksums or + * wal_log_hints enabled.) + */ + } + + if (BufferIsValid(buffer)) + { + Size space = PageGetFreeSpace(BufferGetPage(buffer)); + + UnlockReleaseBuffer(buffer); + + /* + * Since FSM is not WAL-logged and only updated heuristically, it + * easily becomes stale in standbys. If the standby is later promoted + * and runs VACUUM, it will skip updating individual free space + * figures for pages that became all-visible (or all-frozen, depending + * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum + * propagates too optimistic free space values to upper FSM layers; + * later inserters try to use such pages only to find out that they + * are unusable. This can cause long stalls when there are many such + * pages. + * + * Forestall those problems by updating FSM's idea about a page that + * is becoming all-visible or all-frozen. + * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + if (xlrec->flags & VISIBILITYMAP_VALID_BITS) + XLogRecordPageWithFreeSpace(rlocator, blkno, space); + } + + /* + * Even if we skipped the heap page update due to the LSN interlock, it's + * still safe to update the visibility map. Any WAL record that clears + * the visibility map bit does so before checking the page LSN, so any + * bits that need to be cleared will still be cleared. + */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false, + &vmbuffer) == BLK_NEEDS_REDO) + { + Page vmpage = BufferGetPage(vmbuffer); + Relation reln; + uint8 vmbits; + + /* initialize the page if it was read as zeros */ + if (PageIsNew(vmpage)) + PageInit(vmpage, BLCKSZ, 0); + + /* remove VISIBILITYMAP_XLOG_* */ + vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS; + + /* + * XLogReadBufferForRedoExtended locked the buffer. But + * tdeheap_visibilitymap_set will handle locking itself. + */ + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + + reln = CreateFakeRelcacheEntry(rlocator); + tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); + + tdeheap_visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, + xlrec->snapshotConflictHorizon, vmbits); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + else if (BufferIsValid(vmbuffer)) + UnlockReleaseBuffer(vmbuffer); +} + +/* + * Replay XLOG_HEAP2_FREEZE_PAGE records + */ +static void +tdeheap_xlog_freeze_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_freeze_page *xlrec = (xl_tdeheap_freeze_page *) XLogRecGetData(record); + Buffer buffer; + + /* + * In Hot Standby mode, ensure that there's no queries running which still + * consider the frozen xids as running. 
+ */ + if (InHotStandby) + { + RelFileLocator rlocator; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, + xlrec->isCatalogRel, + rlocator); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + xl_tdeheap_freeze_plan *plans; + OffsetNumber *offsets; + int curoff = 0; + + plans = (xl_tdeheap_freeze_plan *) XLogRecGetBlockData(record, 0, NULL); + offsets = (OffsetNumber *) ((char *) plans + + (xlrec->nplans * + sizeof(xl_tdeheap_freeze_plan))); + for (int p = 0; p < xlrec->nplans; p++) + { + HeapTupleFreeze frz; + + /* + * Convert freeze plan representation from WAL record into + * per-tuple format used by tdeheap_execute_freeze_tuple + */ + frz.xmax = plans[p].xmax; + frz.t_infomask2 = plans[p].t_infomask2; + frz.t_infomask = plans[p].t_infomask; + frz.frzflags = plans[p].frzflags; + frz.offset = InvalidOffsetNumber; /* unused, but be tidy */ + + for (int i = 0; i < plans[p].ntuples; i++) + { + OffsetNumber offset = offsets[curoff++]; + ItemId lp; + HeapTupleHeader tuple; + + lp = PageGetItemId(page, offset); + tuple = (HeapTupleHeader) PageGetItem(page, lp); + tdeheap_execute_freeze_tuple(tuple, &frz); + } + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * Given an "infobits" field from an XLog record, set the correct bits in the + * given infomask and infomask2 for the tuple touched by the record. + * + * (This is the reverse of compute_infobits). + */ +static void +fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) +{ + *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + *infomask2 &= ~HEAP_KEYS_UPDATED; + + if (infobits & XLHL_XMAX_IS_MULTI) + *infomask |= HEAP_XMAX_IS_MULTI; + if (infobits & XLHL_XMAX_LOCK_ONLY) + *infomask |= HEAP_XMAX_LOCK_ONLY; + if (infobits & XLHL_XMAX_EXCL_LOCK) + *infomask |= HEAP_XMAX_EXCL_LOCK; + /* note HEAP_XMAX_SHR_LOCK isn't considered here */ + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + *infomask |= HEAP_XMAX_KEYSHR_LOCK; + + if (infobits & XLHL_KEYS_UPDATED) + *infomask2 |= HEAP_KEYS_UPDATED; +} + +static void +tdeheap_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_delete *xlrec = (xl_tdeheap_delete *) XLogRecGetData(record); + Buffer buffer; + Page page; + ItemId lp = NULL; + HeapTupleHeader htup; + BlockNumber blkno; + RelFileLocator target_locator; + ItemPointerData target_tid; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); + tdeheap_visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) + lp = PageGetItemId(page, xlrec->offnum); + + if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->infobits_set, + &htup->t_infomask, &htup->t_infomask2); + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + else + HeapTupleHeaderSetXmin(htup, InvalidTransactionId); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* Make sure t_ctid is set correctly */ + if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE) + HeapTupleHeaderSetMovedPartitions(htup); + else + htup->t_ctid = target_tid; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +tdeheap_xlog_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_insert *xlrec = (xl_tdeheap_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + xl_tdeheap_header xlhdr; + uint32 newlen; + Size freespace = 0; + RelFileLocator target_locator; + BlockNumber blkno; + ItemPointerData target_tid; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); + tdeheap_visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. 
+ */ + if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + Size datalen; + char *data; + + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum) + elog(PANIC, "invalid max offset number"); + + data = XLogRecGetBlockData(record, 0, &datalen); + + newlen = datalen - SizeOfHeapHeader; + Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize); + memcpy((char *) &xlhdr, data, SizeOfHeapHeader); + data += SizeOfHeapHeader; + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + data, + newlen); + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + htup->t_ctid = target_tid; + + if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, + true, true) == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(target_locator, blkno, freespace); +} + +/* + * Handles MULTI_INSERT record type. + */ +static void +tdeheap_xlog_multi_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_multi_insert *xlrec; + RelFileLocator rlocator; + BlockNumber blkno; + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + uint32 newlen; + Size freespace = 0; + int i; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; + XLogRedoAction action; + + /* + * Insertion doesn't overwrite MVCC data, so no conflict processing is + * required. + */ + xlrec = (xl_tdeheap_multi_insert *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + /* check that the mutually exclusive flags are not both set */ + Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) && + (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET))); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); + tdeheap_visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (isinit) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + char *tupdata; + char *endptr; + Size len; + + /* Tuples are stored as block data */ + tupdata = XLogRecGetBlockData(record, 0, &len); + endptr = tupdata + len; + + page = (Page) BufferGetPage(buffer); + + for (i = 0; i < xlrec->ntuples; i++) + { + OffsetNumber offnum; + xl_multi_insert_tuple *xlhdr; + + /* + * If we're reinitializing the page, the tuples are stored in + * order from FirstOffsetNumber. Otherwise there's an array of + * offsets in the WAL record, and the tuples come after that. + */ + if (isinit) + offnum = FirstOffsetNumber + i; + else + offnum = xlrec->offsets[i]; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "invalid max offset number"); + + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); + tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple; + + newlen = xlhdr->datalen; + Assert(newlen <= MaxHeapTupleSize); + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + (char *) tupdata, + newlen); + tupdata += newlen; + + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr->t_infomask2; + htup->t_infomask = xlhdr->t_infomask; + htup->t_hoff = xlhdr->t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + ItemPointerSetBlockNumber(&htup->t_ctid, blkno); + ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + } + if (tupdata != endptr) + elog(PANIC, "total tuple length mismatch"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. 
+ */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); +} + +/* + * Handles UPDATE and HOT_UPDATE + */ +static void +tdeheap_xlog_update(XLogReaderState *record, bool hot_update) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_update *xlrec = (xl_tdeheap_update *) XLogRecGetData(record); + RelFileLocator rlocator; + BlockNumber oldblk; + BlockNumber newblk; + ItemPointerData newtid; + Buffer obuffer, + nbuffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleData oldtup; + HeapTupleHeader htup; + uint16 prefixlen = 0, + suffixlen = 0; + char *newp; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + xl_tdeheap_header xlhdr; + uint32 newlen; + Size freespace = 0; + XLogRedoAction oldaction; + XLogRedoAction newaction; + + /* initialize to keep the compiler quiet */ + oldtup.t_data = NULL; + oldtup.t_len = 0; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk); + if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL)) + { + /* HOT updates are never done across pages */ + Assert(!hot_update); + } + else + oldblk = newblk; + + ItemPointerSet(&newtid, newblk, xlrec->new_offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + tdeheap_visibilitymap_pin(reln, oldblk, &vmbuffer); + tdeheap_visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * In normal operation, it is important to lock the two pages in + * page-number order, to avoid possible deadlocks against other update + * operations going the other way. However, during WAL replay there can + * be no other update happening, so we don't need to worry about that. But + * we *do* need to worry that we don't expose an inconsistent state to Hot + * Standby queries --- so the original page can't be unlocked before we've + * added the new tuple to the new page. + */ + + /* Deal with old tuple version */ + oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1, + &obuffer); + if (oldaction == BLK_NEEDS_REDO) + { + page = BufferGetPage(obuffer); + offnum = xlrec->old_offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldtup.t_data = htup; + oldtup.t_len = ItemIdGetLength(lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + /* Set forward chain link in t_ctid */ + htup->t_ctid = newtid; + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(obuffer); + } + + /* + * Read the page the new tuple goes into, if different from old. 
+ */ + if (oldblk == newblk) + { + nbuffer = obuffer; + newaction = oldaction; + } + else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + nbuffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(nbuffer); + PageInit(page, BufferGetPageSize(nbuffer), 0); + newaction = BLK_NEEDS_REDO; + } + else + newaction = XLogReadBufferForRedo(record, 0, &nbuffer); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + tdeheap_visibilitymap_pin(reln, newblk, &vmbuffer); + tdeheap_visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* Deal with new tuple */ + if (newaction == BLK_NEEDS_REDO) + { + char *recdata; + char *recdata_end; + Size datalen; + Size tuplen; + + recdata = XLogRecGetBlockData(record, 0, &datalen); + recdata_end = recdata + datalen; + + page = BufferGetPage(nbuffer); + + offnum = xlrec->new_offnum; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "invalid max offset number"); + + if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&prefixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&suffixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + + memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader); + recdata += SizeOfHeapHeader; + + tuplen = recdata_end - recdata; + Assert(tuplen <= MaxHeapTupleSize); + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + + /* + * Reconstruct the new tuple using the prefix and/or suffix from the + * old tuple, and the data stored in the WAL record. 
+ */ + newp = (char *) htup + SizeofHeapTupleHeader; + if (prefixlen > 0) + { + int len; + + /* copy bitmap [+ padding] [+ oid] from WAL record */ + len = xlhdr.t_hoff - SizeofHeapTupleHeader; + memcpy(newp, recdata, len); + recdata += len; + newp += len; + + /* copy prefix from old tuple */ + memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen); + newp += prefixlen; + + /* copy new tuple data from WAL record */ + len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader); + memcpy(newp, recdata, len); + recdata += len; + newp += len; + } + else + { + /* + * copy bitmap [+ padding] [+ oid] + data from record, all in one + * go + */ + memcpy(newp, recdata, tuplen); + recdata += tuplen; + newp += tuplen; + } + Assert(recdata == recdata_end); + + /* copy suffix from old tuple */ + if (suffixlen > 0) + memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); + + newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + /* Make sure there is no forward chain link in t_ctid */ + htup->t_ctid = newtid; + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + MarkBufferDirty(nbuffer); + } + + if (BufferIsValid(nbuffer) && nbuffer != obuffer) + UnlockReleaseBuffer(nbuffer); + if (BufferIsValid(obuffer)) + UnlockReleaseBuffer(obuffer); + + /* + * If the new page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * However, don't update the FSM on HOT updates, because after crash + * recovery, either the old or the new tuple will certainly be dead and + * prunable. After pruning, the page will have roughly as much free space + * as it did before the update, assuming the new tuple is about the same + * size as the old one. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. 
+ */ + if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); +} + +static void +tdeheap_xlog_confirm(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_confirm *xlrec = (xl_tdeheap_confirm *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Confirm tuple as actually inserted + */ + ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +tdeheap_xlog_lock(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_lock *xlrec = (xl_tdeheap_lock *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileLocator rlocator; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); + reln = CreateFakeRelcacheEntry(rlocator); + + tdeheap_visibilitymap_pin(reln, block, &vmbuffer); + tdeheap_visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + + /* + * Clear relevant update flags, but only if the modified infomask says + * there's no update. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask)) + { + HeapTupleHeaderClearHotUpdated(htup); + /* Make sure there is no forward chain link in t_ctid */ + ItemPointerSet(&htup->t_ctid, + BufferGetBlockNumber(buffer), + offnum); + } + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +tdeheap_xlog_lock_updated(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_lock_updated *xlrec; + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + xlrec = (xl_tdeheap_lock_updated *) XLogRecGetData(record); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileLocator rlocator; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); + reln = CreateFakeRelcacheEntry(rlocator); + + tdeheap_visibilitymap_pin(reln, block, &vmbuffer); + tdeheap_visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +tdeheap_xlog_inplace(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_tdeheap_inplace *xlrec = (xl_tdeheap_inplace *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + uint32 oldlen; + Size newlen; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + char *newtup = XLogRecGetBlockData(record, 0, &newlen); + + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldlen = ItemIdGetLength(lp) - htup->t_hoff; + if (oldlen != newlen) + elog(PANIC, "wrong tuple length"); + + memcpy((char *) htup + htup->t_hoff, newtup, newlen); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +void +tdeheap_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* + * These operations don't overwrite MVCC data so no conflict processing is + * required. The ones in heap2 rmgr do. + */ + + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP_INSERT: + tdeheap_xlog_insert(record); + break; + case XLOG_HEAP_DELETE: + tdeheap_xlog_delete(record); + break; + case XLOG_HEAP_UPDATE: + tdeheap_xlog_update(record, false); + break; + case XLOG_HEAP_TRUNCATE: + + /* + * TRUNCATE is a no-op because the actions are already logged as + * SMGR WAL records. TRUNCATE WAL record only exists for logical + * decoding. 
+ */ + break; + case XLOG_HEAP_HOT_UPDATE: + tdeheap_xlog_update(record, true); + break; + case XLOG_HEAP_CONFIRM: + tdeheap_xlog_confirm(record); + break; + case XLOG_HEAP_LOCK: + tdeheap_xlog_lock(record); + break; + case XLOG_HEAP_INPLACE: + tdeheap_xlog_inplace(record); + break; + default: + elog(PANIC, "heap_redo: unknown op code %u", info); + } +} + +void +tdeheap2_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP2_PRUNE: + tdeheap_xlog_prune(record); + break; + case XLOG_HEAP2_VACUUM: + tdeheap_xlog_vacuum(record); + break; + case XLOG_HEAP2_FREEZE_PAGE: + tdeheap_xlog_freeze_page(record); + break; + case XLOG_HEAP2_VISIBLE: + tdeheap_xlog_visible(record); + break; + case XLOG_HEAP2_MULTI_INSERT: + tdeheap_xlog_multi_insert(record); + break; + case XLOG_HEAP2_LOCK_UPDATED: + tdeheap_xlog_lock_updated(record); + break; + case XLOG_HEAP2_NEW_CID: + + /* + * Nothing to do on a real replay, only used during logical + * decoding. + */ + break; + case XLOG_HEAP2_REWRITE: + tdeheap_xlog_logical_rewrite(record); + break; + default: + elog(PANIC, "tdeheap2_redo: unknown op code %u", info); + } +} + +/* + * Mask a heap page before performing consistency checks on it. + */ +void +tdeheap_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. */ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See tdeheap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, tdeheap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. + * + * During redo, tdeheap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions on the primary. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + + /* + * NB: Not ignoring ctid changes due to the tuple having moved + * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's + * important information that needs to be in-sync between primary + * and standby, and thus is WAL logged. + */ + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. 
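+	 * The padding is overwritten with MASK_MARKER below, so that stray
+	 * bytes in that area cannot cause false positives during WAL
+	 * consistency checking.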
+ */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} + +/* + * HeapCheckForSerializableConflictOut + * We are reading a tuple. If it's not visible, there may be a + * rw-conflict out with the inserter. Otherwise, if it is visible to us + * but has been deleted, there may be a rw-conflict out with the deleter. + * + * We will determine the top level xid of the writing transaction with which + * we may be in conflict, and ask CheckForSerializableConflictOut() to check + * for overlap with our own transaction. + * + * This function should be called just about anywhere in heapam.c where a + * tuple has been read. The caller must hold at least a shared lock on the + * buffer, because this function might set hint bits on the tuple. There is + * currently no known reason to call this function from an index AM. + */ +void +HeapCheckForSerializableConflictOut(bool visible, Relation relation, + HeapTuple tuple, Buffer buffer, + Snapshot snapshot) +{ + TransactionId xid; + HTSV_Result htsvResult; + + if (!CheckForSerializableConflictOutNeeded(relation, snapshot)) + return; + + /* + * Check to see whether the tuple has been written to by a concurrent + * transaction, either to create it not visible to us, or to delete it + * while it is visible to us. The "visible" bool indicates whether the + * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else + * is going on with it. + * + * In the event of a concurrently inserted tuple that also happens to have + * been concurrently updated (by a separate transaction), the xmin of the + * tuple will be used -- not the updater's xid. + */ + htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer); + switch (htsvResult) + { + case HEAPTUPLE_LIVE: + if (visible) + return; + xid = HeapTupleHeaderGetXmin(tuple->t_data); + break; + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_DELETE_IN_PROGRESS: + if (visible) + xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); + else + xid = HeapTupleHeaderGetXmin(tuple->t_data); + + if (TransactionIdPrecedes(xid, TransactionXmin)) + { + /* This is like the HEAPTUPLE_DEAD case */ + Assert(!visible); + return; + } + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + xid = HeapTupleHeaderGetXmin(tuple->t_data); + break; + case HEAPTUPLE_DEAD: + Assert(!visible); + return; + default: + + /* + * The only way to get to this default clause is if a new value is + * added to the enum type without adding it to this switch + * statement. That's a bug, so elog. + */ + elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult); + + /* + * In spite of having all enum values covered and calling elog on + * this default, some compilers think this is a code path which + * allows xid to be used below without initialization. Silence + * that warning. + */ + xid = InvalidTransactionId; + } + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + /* + * Find top level xid. Bail out if xid is too early to be a conflict, or + * if it's our own xid. 
+ */ + if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) + return; + xid = SubTransGetTopmostTransaction(xid); + if (TransactionIdPrecedes(xid, TransactionXmin)) + return; + + CheckForSerializableConflictOut(relation, xid, snapshot); +} diff --git a/src16/access/pg_tdeam_handler.c b/src16/access/pg_tdeam_handler.c new file mode 100644 index 00000000..afda52fb --- /dev/null +++ b/src16/access/pg_tdeam_handler.c @@ -0,0 +1,2615 @@ +/*------------------------------------------------------------------------- + * + * pg_tdeam_handler.c + * heap table access method code + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/pg_tdeam_handler.c + * + * + * NOTES + * This files wires up the lower level heapam.c et al routines with the + * tableam abstraction. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/multixact.h" +#include "access/rewriteheap.h" +#include "access/syncscan.h" +#include "access/tableam.h" +#include "access/tsmapi.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/index.h" +#include "catalog/storage.h" +#include "catalog/storage_xlog.h" +#include "commands/progress.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/rel.h" + +static void reform_and_rewrite_tuple(HeapTuple tuple, + Relation OldHeap, Relation NewHeap, + Datum *values, bool *isnull, RewriteState rwstate); + +static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset); + +static BlockNumber pg_tdeam_scan_get_blocks_done(HeapScanDesc hscan); + +static const TableAmRoutine pg_tdeam_methods; + + +/* ------------------------------------------------------------------------ + * Slot related callbacks for heap AM + * ------------------------------------------------------------------------ + */ + +static const TupleTableSlotOps * +pg_tdeam_slot_callbacks(Relation relation) +{ + return &TTSOpsTDEBufferHeapTuple; +} + + +/* ------------------------------------------------------------------------ + * Index Scan Callbacks for heap AM + * ------------------------------------------------------------------------ + */ + +static IndexFetchTableData * +pg_tdeam_index_fetch_begin(Relation rel) +{ + IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); + + hscan->xs_base.rel = rel; + hscan->xs_cbuf = InvalidBuffer; + + return &hscan->xs_base; +} + +static void +pg_tdeam_index_fetch_reset(IndexFetchTableData *scan) +{ + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + + if (BufferIsValid(hscan->xs_cbuf)) + { + ReleaseBuffer(hscan->xs_cbuf); + hscan->xs_cbuf = InvalidBuffer; + } +} + +static void +pg_tdeam_index_fetch_end(IndexFetchTableData *scan) +{ + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + + pg_tdeam_index_fetch_reset(scan); + + pfree(hscan); +} + +static bool +pg_tdeam_index_fetch_tuple(struct IndexFetchTableData *scan, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *call_again, bool *all_dead) +{ + IndexFetchHeapData *hscan 
= (IndexFetchHeapData *) scan; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + bool got_tdeheap_tuple; + + Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); + + /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ + if (!*call_again) + { + /* Switch to correct buffer if we don't have it already */ + Buffer prev_buf = hscan->xs_cbuf; + + hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid)); + + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != hscan->xs_cbuf) + tdeheap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE); + got_tdeheap_tuple = tdeheap_hot_search_buffer(tid, + hscan->xs_base.rel, + hscan->xs_cbuf, + snapshot, + &bslot->base.tupdata, + all_dead, + !*call_again); + bslot->base.tupdata.t_self = *tid; + LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + if (got_tdeheap_tuple) + { + /* + * Only in a non-MVCC snapshot can more than one member of the HOT + * chain be visible. + */ + *call_again = !IsMVCCSnapshot(snapshot); + + slot->tts_tableOid = RelationGetRelid(scan->rel); + ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf); + } + else + { + /* We've reached the end of the HOT chain. */ + *call_again = false; + } + + return got_tdeheap_tuple; +} + + +/* ------------------------------------------------------------------------ + * Callbacks for non-modifying operations on individual tuples for heap AM + * ------------------------------------------------------------------------ + */ + +static bool +pg_tdeam_fetch_row_version(Relation relation, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + Buffer buffer; + + Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); + + bslot->base.tupdata.t_self = *tid; + if (tdeheap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false)) + { + /* store in slot, transferring existing pin */ + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer); + slot->tts_tableOid = RelationGetRelid(relation); + + return true; + } + + return false; +} + +static bool +pg_tdeam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + return ItemPointerIsValid(tid) && + ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks; +} + +static bool +pg_tdeam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, + Snapshot snapshot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + bool res; + + Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); + Assert(BufferIsValid(bslot->buffer)); + + /* + * We need buffer pin and lock to call HeapTupleSatisfiesVisibility. + * Caller should be holding pin, but not lock. + */ + LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE); + res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot, + bslot->buffer); + LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK); + + return res; +} + + +/* ---------------------------------------------------------------------------- + * Functions for manipulations of physical tuples for heap AM. 
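+ * These callbacks delegate to the tdeheap_* routines (tdeheap_insert(),
+ * tdeheap_update(), tdeheap_delete(), tdeheap_lock_tuple(), ...) rather
+ * than the upstream heap_* functions.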
+ * ---------------------------------------------------------------------------- + */ + +static void +pg_tdeam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, BulkInsertState bistate) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + + /* Update the tuple with table oid */ + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + /* Perform the insertion, and copy the resulting ItemPointer */ + tdeheap_insert(relation, tuple, cid, options, bistate); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + if (shouldFree) + pfree(tuple); +} + +static void +pg_tdeam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot, + CommandId cid, int options, + BulkInsertState bistate, uint32 specToken) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + + /* Update the tuple with table oid */ + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken); + options |= HEAP_INSERT_SPECULATIVE; + + /* Perform the insertion, and copy the resulting ItemPointer */ + tdeheap_insert(relation, tuple, cid, options, bistate); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + if (shouldFree) + pfree(tuple); +} + +static void +pg_tdeam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, + uint32 specToken, bool succeeded) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + + /* adjust the tuple's state accordingly */ + if (succeeded) + tdeheap_finish_speculative(relation, &slot->tts_tid); + else + tdeheap_abort_speculative(relation, &slot->tts_tid); + + if (shouldFree) + pfree(tuple); +} + +static TM_Result +pg_tdeam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *tmfd, bool changingPart) +{ + /* + * Currently Deleting of index tuples are handled at vacuum, in case if + * the storage itself is cleaning the dead tuples by itself, it is the + * time to call the index tuple deletion also. + */ + return tdeheap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); +} + + +static TM_Result +pg_tdeam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, + CommandId cid, Snapshot snapshot, Snapshot crosscheck, + bool wait, TM_FailureData *tmfd, + LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + TM_Result result; + + /* Update the tuple with table oid */ + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + result = tdeheap_update(relation, otid, tuple, cid, crosscheck, wait, + tmfd, lockmode, update_indexes); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + /* + * Decide whether new index entries are needed for the tuple + * + * Note: tdeheap_update returns the tid (location) of the new tuple in the + * t_self field. + * + * If the update is not HOT, we must update all indexes. If the update is + * HOT, it could be that we updated summarized columns, so we either + * update only summarized indexes, or none at all. 
+ */ + if (result != TM_Ok) + { + Assert(*update_indexes == TU_None); + *update_indexes = TU_None; + } + else if (!HeapTupleIsHeapOnly(tuple)) + Assert(*update_indexes == TU_All); + else + Assert((*update_indexes == TU_Summarizing) || + (*update_indexes == TU_None)); + + if (shouldFree) + pfree(tuple); + + return result; +} + +static TM_Result +pg_tdeam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *slot, CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + TM_Result result; + Buffer buffer; + HeapTuple tuple = &bslot->base.tupdata; + bool follow_updates; + + follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; + tmfd->traversed = false; + + Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); + +tuple_lock_retry: + tuple->t_self = *tid; + result = tdeheap_lock_tuple(relation, tuple, cid, mode, wait_policy, + follow_updates, &buffer, tmfd); + + if (result == TM_Updated && + (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) + { + /* Should not encounter speculative tuple on recheck */ + Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); + + ReleaseBuffer(buffer); + + if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) + { + SnapshotData SnapshotDirty; + TransactionId priorXmax; + + /* it was updated, so look at the updated version */ + *tid = tmfd->ctid; + /* updated row should have xmin matching this xmax */ + priorXmax = tmfd->xmax; + + /* signal that a tuple later in the chain is getting locked */ + tmfd->traversed = true; + + /* + * fetch target tuple + * + * Loop here to deal with updated or busy tuples + */ + InitDirtySnapshot(SnapshotDirty); + for (;;) + { + if (ItemPointerIndicatesMovedPartitions(tid)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); + + tuple->t_self = *tid; + if (tdeheap_fetch(relation, &SnapshotDirty, tuple, &buffer, true)) + { + /* + * If xmin isn't what we're expecting, the slot must have + * been recycled and reused for an unrelated tuple. This + * implies that the latest version of the row was deleted, + * so we need do nothing. (Should be safe to examine xmin + * without getting buffer's content lock. We assume + * reading a TransactionId to be atomic, and Xmin never + * changes in an existing tuple, except to invalid or + * frozen, and neither of those can match priorXmax.) + */ + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), + priorXmax)) + { + ReleaseBuffer(buffer); + return TM_Deleted; + } + + /* otherwise xmin should not be dirty... */ + if (TransactionIdIsValid(SnapshotDirty.xmin)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"", + SnapshotDirty.xmin, + ItemPointerGetBlockNumber(&tuple->t_self), + ItemPointerGetOffsetNumber(&tuple->t_self), + RelationGetRelationName(relation)))); + + /* + * If tuple is being updated by other transaction then we + * have to wait for its commit/abort, or die trying. 
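+	 * How we wait is governed by wait_policy: block until the xact ends,
+	 * skip the row (returning TM_WouldBlock), or raise an error if the
+	 * row lock is not immediately available.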
+ */ + if (TransactionIdIsValid(SnapshotDirty.xmax)) + { + ReleaseBuffer(buffer); + switch (wait_policy) + { + case LockWaitBlock: + XactLockTableWait(SnapshotDirty.xmax, + relation, &tuple->t_self, + XLTW_FetchUpdated); + break; + case LockWaitSkip: + if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) + /* skip instead of waiting */ + return TM_WouldBlock; + break; + case LockWaitError: + if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + continue; /* loop back to repeat tdeheap_fetch */ + } + + /* + * If tuple was inserted by our own transaction, we have + * to check cmin against cid: cmin >= current CID means + * our command cannot see the tuple, so we should ignore + * it. Otherwise tdeheap_lock_tuple() will throw an error, + * and so would any later attempt to update or delete the + * tuple. (We need not check cmax because + * HeapTupleSatisfiesDirty will consider a tuple deleted + * by our transaction dead, regardless of cmax.) We just + * checked that priorXmax == xmin, so we can test that + * variable instead of doing HeapTupleHeaderGetXmin again. + */ + if (TransactionIdIsCurrentTransactionId(priorXmax) && + HeapTupleHeaderGetCmin(tuple->t_data) >= cid) + { + tmfd->xmax = priorXmax; + + /* + * Cmin is the problematic value, so store that. See + * above. + */ + tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data); + ReleaseBuffer(buffer); + return TM_SelfModified; + } + + /* + * This is a live tuple, so try to lock it again. + */ + ReleaseBuffer(buffer); + goto tuple_lock_retry; + } + + /* + * If the referenced slot was actually empty, the latest + * version of the row must have been deleted, so we need do + * nothing. + */ + if (tuple->t_data == NULL) + { + Assert(!BufferIsValid(buffer)); + return TM_Deleted; + } + + /* + * As above, if xmin isn't what we're expecting, do nothing. + */ + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), + priorXmax)) + { + ReleaseBuffer(buffer); + return TM_Deleted; + } + + /* + * If we get here, the tuple was found but failed + * SnapshotDirty. Assuming the xmin is either a committed xact + * or our own xact (as it certainly should be if we're trying + * to modify the tuple), this must mean that the row was + * updated or deleted by either a committed xact or our own + * xact. If it was deleted, we can ignore it; if it was + * updated then chain up to the next version and repeat the + * whole process. + * + * As above, it should be safe to examine xmax and t_ctid + * without the buffer content lock, because they can't be + * changing. We'd better hold a buffer pin though. 
+ */ + if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) + { + /* deleted, so forget about it */ + ReleaseBuffer(buffer); + return TM_Deleted; + } + + /* updated, so look at the updated row */ + *tid = tuple->t_data->t_ctid; + /* updated row should have xmin matching this xmax */ + priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + ReleaseBuffer(buffer); + /* loop back to fetch next in chain */ + } + } + else + { + /* tuple was deleted, so give up */ + return TM_Deleted; + } + } + + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + /* store in slot, transferring existing pin */ + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); + + return result; +} + + +/* ------------------------------------------------------------------------ + * DDL related callbacks for heap AM. + * ------------------------------------------------------------------------ + */ + +static void +pg_tdeam_relation_set_new_filelocator(Relation rel, + const RelFileLocator *newrlocator, + char persistence, + TransactionId *freezeXid, + MultiXactId *minmulti) +{ + SMgrRelation srel; + + /* + * Initialize to the minimum XID that could put tuples in the table. We + * know that no xacts older than RecentXmin are still running, so that + * will do. + */ + *freezeXid = RecentXmin; + + /* + * Similarly, initialize the minimum Multixact to the first value that + * could possibly be stored in tuples in the table. Running transactions + * could reuse values from their local cache, so we are careful to + * consider all currently running multis. + * + * XXX this could be refined further, but is it worth the hassle? + */ + *minmulti = GetOldestMultiXactId(); + + srel = RelationCreateStorage(*newrlocator, persistence, true); + + /* + * If required, set up an init fork for an unlogged table so that it can + * be correctly reinitialized on restart. An immediate sync is required + * even if the page has been logged, because the write did not go through + * shared_buffers and therefore a concurrent checkpoint may have moved the + * redo pointer past our xlog record. Recovery may as well remove it + * while replaying, for example, XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE + * record. Therefore, logging is necessary even if wal_level=minimal. + */ + if (persistence == RELPERSISTENCE_UNLOGGED) + { + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + smgrcreate(srel, INIT_FORKNUM, false); + log_smgrcreate(newrlocator, INIT_FORKNUM); + smgrimmedsync(srel, INIT_FORKNUM); + } + + smgrclose(srel); +} + +static void +pg_tdeam_relation_nontransactional_truncate(Relation rel) +{ + RelationTruncate(rel, 0); +} + +static void +pg_tdeam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) +{ + SMgrRelation dstrel; + + dstrel = smgropen(*newrlocator, rel->rd_backend); + + /* + * Since we copy the file directly without looking at the shared buffers, + * we'd better first flush out any pages of the source relation that are + * in shared buffers. We assume no new changes will be made while we are + * holding exclusive lock on the rel. + */ + FlushRelationBuffers(rel); + + /* + * Create and copy all forks of the relation, and schedule unlinking of + * old physical files. + * + * NOTE: any conflict in relfilenumber value will be caught in + * RelationCreateStorage(). 
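+	 * The copy itself is a block-level smgr copy (RelationCopyStorage());
+	 * page contents are transferred verbatim, with no tuple-level
+	 * processing in this function.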
+ */ + RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true); + + /* copy main fork */ + RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM, + rel->rd_rel->relpersistence); + + /* copy those extra forks that exist */ + for (ForkNumber forkNum = MAIN_FORKNUM + 1; + forkNum <= MAX_FORKNUM; forkNum++) + { + if (smgrexists(RelationGetSmgr(rel), forkNum)) + { + smgrcreate(dstrel, forkNum, false); + + /* + * WAL log creation if the relation is persistent, or this is the + * init fork of an unlogged relation. + */ + if (RelationIsPermanent(rel) || + (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && + forkNum == INIT_FORKNUM)) + log_smgrcreate(newrlocator, forkNum); + RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum, + rel->rd_rel->relpersistence); + } + } + + + /* drop old relation, and close new one */ + RelationDropStorage(rel); + smgrclose(dstrel); +} + +static void +pg_tdeam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, + Relation OldIndex, bool use_sort, + TransactionId OldestXmin, + TransactionId *xid_cutoff, + MultiXactId *multi_cutoff, + double *num_tuples, + double *tups_vacuumed, + double *tups_recently_dead) +{ + RewriteState rwstate; + IndexScanDesc indexScan; + TableScanDesc tableScan; + HeapScanDesc heapScan; + bool is_system_catalog; + Tuplesortstate *tuplesort; + TupleDesc oldTupDesc = RelationGetDescr(OldHeap); + TupleDesc newTupDesc = RelationGetDescr(NewHeap); + TupleTableSlot *slot; + int natts; + Datum *values; + bool *isnull; + BufferHeapTupleTableSlot *hslot; + BlockNumber prev_cblock = InvalidBlockNumber; + + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(OldHeap); + + /* + * Valid smgr_targblock implies something already wrote to the relation. + * This may be harmless, but this function hasn't planned for it. + */ + Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); + + /* Preallocate values/isnull arrays */ + natts = newTupDesc->natts; + values = (Datum *) palloc(natts * sizeof(Datum)); + isnull = (bool *) palloc(natts * sizeof(bool)); + + /* Initialize the rewrite operation */ + rwstate = begin_tdeheap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff, + *multi_cutoff); + + + /* Set up sorting if wanted */ + if (use_sort) + tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, + maintenance_work_mem, + NULL, TUPLESORT_NONE); + else + tuplesort = NULL; + + /* + * Prepare to scan the OldHeap. To ensure we see recently-dead tuples + * that still need to be copied, we scan with SnapshotAny and use + * HeapTupleSatisfiesVacuum for the visibility test. 
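+	 * Depending on OldIndex and use_sort, the scan set up below is either
+	 * an index scan in OldIndex order or a sequential heap scan whose
+	 * output may be routed through a tuplesort.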
+ */ + if (OldIndex != NULL && !use_sort) + { + const int ci_index[] = { + PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_INDEX_RELID + }; + int64 ci_val[2]; + + /* Set phase and OIDOldIndex to columns */ + ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; + ci_val[1] = RelationGetRelid(OldIndex); + pgstat_progress_update_multi_param(2, ci_index, ci_val); + + tableScan = NULL; + heapScan = NULL; + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); + index_rescan(indexScan, NULL, 0, NULL, 0); + } + else + { + /* In scan-and-sort mode and also VACUUM FULL, set phase */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); + + tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); + heapScan = (HeapScanDesc) tableScan; + indexScan = NULL; + + /* Set total heap blocks */ + pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, + heapScan->rs_nblocks); + } + + slot = table_slot_create(OldHeap, NULL); + hslot = (BufferHeapTupleTableSlot *) slot; + + /* + * Scan through the OldHeap, either in OldIndex order or sequentially; + * copy each tuple into the NewHeap, or transiently to the tuplesort + * module. Note that we don't bother sorting dead tuples (they won't get + * to the new table anyway). + */ + for (;;) + { + HeapTuple tuple; + Buffer buf; + bool isdead; + + CHECK_FOR_INTERRUPTS(); + + if (indexScan != NULL) + { + if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) + break; + + /* Since we used no scan keys, should never need to recheck */ + if (indexScan->xs_recheck) + elog(ERROR, "CLUSTER does not support lossy index conditions"); + } + else + { + if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot)) + { + /* + * If the last pages of the scan were empty, we would go to + * the next phase while tdeheap_blks_scanned != tdeheap_blks_total. + * Instead, to ensure that tdeheap_blks_scanned is equivalent to + * tdeheap_blks_total after the table scan phase, this parameter + * is manually updated to the correct value when the table + * scan finishes. + */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + heapScan->rs_nblocks); + break; + } + + /* + * In scan-and-sort mode and also VACUUM FULL, set heap blocks + * scanned + * + * Note that heapScan may start at an offset and wrap around, i.e. + * rs_startblock may be >0, and rs_cblock may end with a number + * below rs_startblock. To prevent showing this wraparound to the + * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks). + */ + if (prev_cblock != heapScan->rs_cblock) + { + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + (heapScan->rs_cblock + + heapScan->rs_nblocks - + heapScan->rs_startblock + ) % heapScan->rs_nblocks + 1); + prev_cblock = heapScan->rs_cblock; + } + } + + tuple = ExecFetchSlotHeapTuple(slot, false, NULL); + buf = hslot->buffer; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) + { + case HEAPTUPLE_DEAD: + /* Definitely dead */ + isdead = true; + break; + case HEAPTUPLE_RECENTLY_DEAD: + *tups_recently_dead += 1; + /* fall through */ + case HEAPTUPLE_LIVE: + /* Live or recently dead, must copy it */ + isdead = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Since we hold exclusive lock on the relation, normally the + * only way to see this is if it was inserted earlier in our + * own transaction. However, it can happen in system + * catalogs, since we tend to release write lock before commit + * there. 
Give a warning if neither case applies; but in any + * case we had better copy it. + */ + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) + elog(WARNING, "concurrent insert in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); + /* treat as live */ + isdead = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * Similar situation to INSERT_IN_PROGRESS case. + */ + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) + elog(WARNING, "concurrent delete in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); + /* treat as recently dead */ + *tups_recently_dead += 1; + isdead = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + isdead = false; /* keep compiler quiet */ + break; + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (isdead) + { + *tups_vacuumed += 1; + /* heap rewrite module still needs to see it... */ + if (rewrite_tdeheap_dead_tuple(rwstate, tuple)) + { + /* A previous recently-dead tuple is now known dead */ + *tups_vacuumed += 1; + *tups_recently_dead -= 1; + } + continue; + } + + *num_tuples += 1; + if (tuplesort != NULL) + { + tuplesort_putheaptuple(tuplesort, tuple); + + /* + * In scan-and-sort mode, report increase in number of tuples + * scanned + */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + *num_tuples); + } + else + { + const int ct_index[] = { + PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN + }; + int64 ct_val[2]; + + reform_and_rewrite_tuple(tuple, OldHeap, NewHeap, + values, isnull, rwstate); + + /* + * In indexscan mode and also VACUUM FULL, report increase in + * number of tuples scanned and written + */ + ct_val[0] = *num_tuples; + ct_val[1] = *num_tuples; + pgstat_progress_update_multi_param(2, ct_index, ct_val); + } + } + + if (indexScan != NULL) + index_endscan(indexScan); + if (tableScan != NULL) + table_endscan(tableScan); + if (slot) + ExecDropSingleTupleTableSlot(slot); + + /* + * In scan-and-sort mode, complete the sort, then read out all live tuples + * from the tuplestore and write them to the new relation. + */ + if (tuplesort != NULL) + { + double n_tuples = 0; + + /* Report that we are now sorting tuples */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SORT_TUPLES); + + tuplesort_performsort(tuplesort); + + /* Report that we are now writing new heap */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); + + for (;;) + { + HeapTuple tuple; + + CHECK_FOR_INTERRUPTS(); + + tuple = tuplesort_getheaptuple(tuplesort, true); + if (tuple == NULL) + break; + + n_tuples += 1; + reform_and_rewrite_tuple(tuple, + OldHeap, NewHeap, + values, isnull, + rwstate); + /* Report n_tuples */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, + n_tuples); + } + + tuplesort_end(tuplesort); + } + + /* Write out any remaining tuples, and fsync if needed */ + end_tdeheap_rewrite(rwstate); + + /* Clean up */ + pfree(values); + pfree(isnull); +} + +static bool +pg_tdeam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, + BufferAccessStrategy bstrategy) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + /* + * We must maintain a pin on the target page's buffer to ensure that + * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from + * under us. Hence, pin the page until we are done looking at it. 
We + * also choose to hold sharelock on the buffer throughout --- we could + * release and re-acquire sharelock for each tuple, but since we aren't + * doing much work per tuple, the extra lock traffic is probably better + * avoided. + */ + hscan->rs_cblock = blockno; + hscan->rs_cindex = FirstOffsetNumber; + hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, + blockno, RBM_NORMAL, bstrategy); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + /* in heap all blocks can contain tuples, so always return true */ + return true; +} + +static bool +pg_tdeam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, + double *liverows, double *deadrows, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + Page targpage; + OffsetNumber maxoffset; + BufferHeapTupleTableSlot *hslot; + + Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); + + hslot = (BufferHeapTupleTableSlot *) slot; + targpage = BufferGetPage(hscan->rs_cbuf); + maxoffset = PageGetMaxOffsetNumber(targpage); + + /* Inner loop over all tuples on the selected page */ + for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++) + { + ItemId itemid; + HeapTuple targtuple = &hslot->base.tupdata; + bool sample_it = false; + + itemid = PageGetItemId(targpage, hscan->rs_cindex); + + /* + * We ignore unused and redirect line pointers. DEAD line pointers + * should be counted as dead, because we need vacuum to run to get rid + * of them. Note that this rule agrees with the way that + * tdeheap_page_prune() counts things. + */ + if (!ItemIdIsNormal(itemid)) + { + if (ItemIdIsDead(itemid)) + *deadrows += 1; + continue; + } + + ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex); + + targtuple->t_tableOid = RelationGetRelid(scan->rs_rd); + targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); + targtuple->t_len = ItemIdGetLength(itemid); + + switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, + hscan->rs_cbuf)) + { + case HEAPTUPLE_LIVE: + sample_it = true; + *liverows += 1; + break; + + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + /* Count dead and recently-dead rows */ + *deadrows += 1; + break; + + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Insert-in-progress rows are not counted. We assume that + * when the inserting transaction commits or aborts, it will + * send a stats message to increment the proper count. This + * works right only if that transaction ends after we finish + * analyzing the table; if things happen in the other order, + * its stats update will be overwritten by ours. However, the + * error will be large only if the other transaction runs long + * enough to insert many tuples, so assuming it will finish + * after us is the safer option. + * + * A special case is that the inserting transaction might be + * our own. In this case we should count and sample the row, + * to accommodate users who load a table and analyze it in one + * transaction. (pgstat_report_analyze has to adjust the + * numbers we report to the cumulative stats system to make + * this come out right.) + */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data))) + { + sample_it = true; + *liverows += 1; + } + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * We count and sample delete-in-progress rows the same as + * live ones, so that the stats counters come out right if the + * deleting transaction commits after us, per the same + * reasoning given above. 
+ * + * If the delete was done by our own transaction, however, we + * must count the row as dead to make pgstat_report_analyze's + * stats adjustments come out right. (Note: this works out + * properly when the row was both inserted and deleted in our + * xact.) + * + * The net effect of these choices is that we act as though an + * IN_PROGRESS transaction hasn't happened yet, except if it + * is our own transaction, which we assume has happened. + * + * This approach ensures that we behave sanely if we see both + * the pre-image and post-image rows for a row being updated + * by a concurrent transaction: we will sample the pre-image + * but not the post-image. We also get sane results if the + * concurrent transaction never commits. + */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data))) + *deadrows += 1; + else + { + sample_it = true; + *liverows += 1; + } + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + if (sample_it) + { + ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf); + hscan->rs_cindex++; + + /* note that we leave the buffer locked here! */ + return true; + } + } + + /* Now release the lock and pin on the page */ + UnlockReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + + /* also prevent old slot contents from having pin on page */ + ExecClearTuple(slot); + + return false; +} + +static double +pg_tdeam_index_build_range_scan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + bool allow_sync, + bool anyvisible, + bool progress, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan) +{ + HeapScanDesc hscan; + bool is_system_catalog; + bool checking_uniqueness; + HeapTuple heapTuple; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + double reltuples; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Snapshot snapshot; + bool need_unregister_snapshot = false; + TransactionId OldestXmin; + BlockNumber previous_blkno = InvalidBlockNumber; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(heapRelation); + + /* See whether we're verifying uniqueness/exclusion properties */ + checking_uniqueness = (indexInfo->ii_Unique || + indexInfo->ii_ExclusionOps != NULL); + + /* + * "Any visible" mode is not compatible with uniqueness checks; make sure + * only one of those is requested. + */ + Assert(!(anyvisible && checking_uniqueness)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = table_slot_create(heapRelation, NULL); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. In a normal index build, we use + * SnapshotAny because we must retrieve all tuples and do our own time + * qual checks (because we have to index RECENTLY_DEAD tuples). 
In a + * concurrent build, or during bootstrap, we take a regular MVCC snapshot + * and index whatever's live according to that. + */ + OldestXmin = InvalidTransactionId; + + /* okay to ignore lazy VACUUMs here */ + if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) + OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); + + if (!scan) + { + /* + * Serial index build. + * + * Must begin our own heap scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. + */ + if (!TransactionIdIsValid(OldestXmin)) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + need_unregister_snapshot = true; + } + else + snapshot = SnapshotAny; + + scan = table_beginscan_strat(heapRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + allow_sync); /* syncscan OK? */ + } + else + { + /* + * Parallel index build. + * + * Parallel case never registers/unregisters own snapshot. Snapshot + * is taken from parallel heap scan, and is SnapshotAny or an MVCC + * snapshot, based on same criteria as serial case. + */ + Assert(!IsBootstrapProcessingMode()); + Assert(allow_sync); + snapshot = scan->rs_snapshot; + } + + hscan = (HeapScanDesc) scan; + + /* + * Must have called GetOldestNonRemovableTransactionId() if using + * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially + * worth checking this for parallel builds, since ambuild routines that + * support parallel builds must work these details out for themselves.) + */ + Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); + Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == SnapshotAny || !anyvisible); + + /* Publish number of blocks to scan */ + if (progress) + { + BlockNumber nblocks; + + if (hscan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan; + + pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; + nblocks = pbscan->phs_nblocks; + } + else + nblocks = hscan->rs_nblocks; + + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, + nblocks); + } + + /* set our scan endpoints */ + if (!allow_sync) + tdeheap_setscanlimits(scan, start_blockno, numblocks); + else + { + /* syncscan can only be requested on whole relation */ + Assert(start_blockno == 0); + Assert(numblocks == InvalidBlockNumber); + } + + reltuples = 0; + + /* + * Scan all tuples in the base relation. + */ + while ((heapTuple = tdeheap_getnext(scan, ForwardScanDirection)) != NULL) + { + bool tupleIsAlive; + + CHECK_FOR_INTERRUPTS(); + + /* Report scan progress, if asked to. */ + if (progress) + { + BlockNumber blocks_done = pg_tdeam_scan_get_blocks_done(hscan); + + if (blocks_done != previous_blkno) + { + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + blocks_done); + previous_blkno = blocks_done; + } + } + + /* + * When dealing with a HOT-chain of updated tuples, we want to index + * the values of the live tuple (if any), but index it under the TID + * of the chain's root tuple. This approach is necessary to preserve + * the HOT-chain structure in the heap. So we need to be able to find + * the root item offset for every tuple that's in a HOT-chain. When + * first reaching a new page of the relation, call + * tdeheap_get_root_tuples() to build a map of root item offsets on the + * page. + * + * It might look unsafe to use this information across buffer + * lock/unlock. 
However, we hold ShareLock on the table so no + * ordinary insert/update/delete should occur; and we hold pin on the + * buffer continuously while visiting the page, so no pruning + * operation can occur either. + * + * In cases with only ShareUpdateExclusiveLock on the table, it's + * possible for some HOT tuples to appear that we didn't know about + * when we first read the page. To handle that case, we re-obtain the + * list of root offsets when a HOT tuple points to a root item that we + * don't know about. + * + * Also, although our opinions about tuple liveness could change while + * we scan the page (due to concurrent transaction commits/aborts), + * the chain root locations won't, so this info doesn't need to be + * rebuilt after waiting for another transaction. + * + * Note the implied assumption that there is no more than one live + * tuple per HOT-chain --- else we could create more than one index + * entry pointing to the same root tuple. + */ + if (hscan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + tdeheap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + root_blkno = hscan->rs_cblock; + } + + if (snapshot == SnapshotAny) + { + /* do our own time qual check */ + bool indexIt; + TransactionId xwait; + + recheck: + + /* + * We could possibly get away with not locking the buffer here, + * since caller should hold ShareLock on the relation, but let's + * be conservative about it. (This remark is still correct even + * with HOT-pruning: our pin on the buffer prevents pruning.) + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's pg_tdeam_scan_analyze_next_tuple() does, + * otherwise CREATE INDEX and ANALYZE may produce wildly different + * reltuples values, e.g. when there are many recently-dead + * tuples. + */ + switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, + hscan->rs_cbuf)) + { + case HEAPTUPLE_DEAD: + /* Definitely dead, we can ignore it */ + indexIt = false; + tupleIsAlive = false; + break; + case HEAPTUPLE_LIVE: + /* Normal case, index and unique-check it */ + indexIt = true; + tupleIsAlive = true; + /* Count it as live, too */ + reltuples += 1; + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must index it + * anyway to preserve MVCC semantics. (Pre-existing + * transactions could try to use the index after we finish + * building it, and may need to see such tuples.) + * + * However, if it was HOT-updated then we must only index + * the live tuple at the end of the HOT-chain. Since this + * breaks semantics for pre-existing snapshots, mark the + * index as unusable for them. + * + * We don't count recently-dead tuples in reltuples, even + * if we index them; see pg_tdeam_scan_analyze_next_tuple(). + */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else + indexIt = true; + /* In any case, exclude the tuple from unique-checking */ + tupleIsAlive = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * In "anyvisible" mode, this tuple is visible and we + * don't need any further checks. 
+ */ + if (anyvisible) + { + indexIt = true; + tupleIsAlive = true; + reltuples += 1; + break; + } + + /* + * Since caller should hold ShareLock or better, normally + * the only way to see this is if it was inserted earlier + * in our own transaction. However, it can happen in + * system catalogs, since we tend to release write lock + * before commit there. Give a warning if neither case + * applies. + */ + xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + if (!is_system_catalog) + elog(WARNING, "concurrent insert in progress within table \"%s\"", + RelationGetRelationName(heapRelation)); + + /* + * If we are performing uniqueness checks, indexing + * such a tuple could lead to a bogus uniqueness + * failure. In that case we wait for the inserting + * transaction to finish and check again. + */ + if (checking_uniqueness) + { + /* + * Must drop the lock on the buffer before we wait + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait, heapRelation, + &heapTuple->t_self, + XLTW_InsertIndexUnique); + CHECK_FOR_INTERRUPTS(); + goto recheck; + } + } + else + { + /* + * For consistency with + * pg_tdeam_scan_analyze_next_tuple(), count + * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only + * when inserted by our own transaction. + */ + reltuples += 1; + } + + /* + * We must index such tuples, since if the index build + * commits then they're good. + */ + indexIt = true; + tupleIsAlive = true; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * As with INSERT_IN_PROGRESS case, this is unexpected + * unless it's our own deletion or a system catalog; but + * in anyvisible mode, this tuple is visible. + */ + if (anyvisible) + { + indexIt = true; + tupleIsAlive = false; + reltuples += 1; + break; + } + + xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + if (!is_system_catalog) + elog(WARNING, "concurrent delete in progress within table \"%s\"", + RelationGetRelationName(heapRelation)); + + /* + * If we are performing uniqueness checks, assuming + * the tuple is dead could lead to missing a + * uniqueness violation. In that case we wait for the + * deleting transaction to finish and check again. + * + * Also, if it's a HOT-updated tuple, we should not + * index it but rather the live tuple at the end of + * the HOT-chain. However, the deleting transaction + * could abort, possibly leaving this tuple as live + * after all, in which case it has to be indexed. The + * only way to know what to do is to wait for the + * deleting transaction to finish and check again. + */ + if (checking_uniqueness || + HeapTupleIsHotUpdated(heapTuple)) + { + /* + * Must drop the lock on the buffer before we wait + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait, heapRelation, + &heapTuple->t_self, + XLTW_InsertIndexUnique); + CHECK_FOR_INTERRUPTS(); + goto recheck; + } + + /* + * Otherwise index it but don't check for uniqueness, + * the same as a RECENTLY_DEAD tuple. + */ + indexIt = true; + + /* + * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, + * if they were not deleted by the current + * transaction. That's what + * pg_tdeam_scan_analyze_next_tuple() does, and we want + * the behavior to be consistent. + */ + reltuples += 1; + } + else if (HeapTupleIsHotUpdated(heapTuple)) + { + /* + * It's a HOT-updated tuple deleted by our own xact. 
+ * We can assume the deletion will commit (else the + * index contents don't matter), so treat the same as + * RECENTLY_DEAD HOT-updated tuples. + */ + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else + { + /* + * It's a regular tuple deleted by our own xact. Index + * it, but don't check for uniqueness nor count in + * reltuples, the same as a RECENTLY_DEAD tuple. + */ + indexIt = true; + } + /* In any case, exclude the tuple from unique-checking */ + tupleIsAlive = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + indexIt = tupleIsAlive = false; /* keep compiler quiet */ + break; + } + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + if (!indexIt) + continue; + } + else + { + /* tdeheap_getnext did the time qual check */ + tupleIsAlive = true; + reltuples += 1; + } + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* Set up for predicate or expression evaluation */ + ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* + * You'd think we should go ahead and build the index tuple here, but + * some index AMs want to do further processing on the data first. So + * pass the values[] and isnull[] arrays, instead. + */ + + if (HeapTupleIsHeapOnly(heapTuple)) + { + /* + * For a heap-only tuple, pretend its TID is that of the root. See + * src/backend/access/heap/README.HOT for discussion. + */ + ItemPointerData tid; + OffsetNumber offnum; + + offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); + + /* + * If a HOT tuple points to a root that we don't know about, + * obtain root items afresh. If that still fails, report it as + * corruption. + */ + if (root_offsets[offnum - 1] == InvalidOffsetNumber) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + tdeheap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + } + + if (!OffsetNumberIsValid(root_offsets[offnum - 1])) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", + ItemPointerGetBlockNumber(&heapTuple->t_self), + offnum, + RelationGetRelationName(heapRelation)))); + + ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self), + root_offsets[offnum - 1]); + + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, &tid, values, isnull, tupleIsAlive, + callback_state); + } + else + { + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, &heapTuple->t_self, values, isnull, + tupleIsAlive, callback_state); + } + } + + /* Report scan progress one last time. 
*/ + if (progress) + { + BlockNumber blks_done; + + if (hscan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan; + + pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; + blks_done = pbscan->phs_nblocks; + } + else + blks_done = hscan->rs_nblocks; + + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + blks_done); + } + + table_endscan(scan); + + /* we can now forget our snapshot, if set and registered by us */ + if (need_unregister_snapshot) + UnregisterSnapshot(snapshot); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; + + return reltuples; +} + +static void +pg_tdeam_index_validate_scan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + Snapshot snapshot, + ValidateIndexState *state) +{ + TableScanDesc scan; + HeapScanDesc hscan; + HeapTuple heapTuple; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + bool in_index[MaxHeapTuplesPerPage]; + BlockNumber previous_blkno = InvalidBlockNumber; + + /* state variables for the merge */ + ItemPointer indexcursor = NULL; + ItemPointerData decoded; + bool tuplesort_empty = false; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. We need just those tuples + * satisfying the passed-in reference snapshot. We must disable syncscan + * here, because it's critical that we read from block zero forward to + * match the sorted TIDs. + */ + scan = table_beginscan_strat(heapRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + false); /* syncscan not OK */ + hscan = (HeapScanDesc) scan; + + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, + hscan->rs_nblocks); + + /* + * Scan all tuples matching the snapshot. + */ + while ((heapTuple = tdeheap_getnext(scan, ForwardScanDirection)) != NULL) + { + ItemPointer heapcursor = &heapTuple->t_self; + ItemPointerData rootTuple; + OffsetNumber root_offnum; + + CHECK_FOR_INTERRUPTS(); + + state->htups += 1; + + if ((previous_blkno == InvalidBlockNumber) || + (hscan->rs_cblock != previous_blkno)) + { + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + hscan->rs_cblock); + previous_blkno = hscan->rs_cblock; + } + + /* + * As commented in table_index_build_scan, we should index heap-only + * tuples under the TIDs of their root tuples; so when we advance onto + * a new heap page, build a map of root item offsets on the page. 
+ * + * This complicates merging against the tuplesort output: we will + * visit the live tuples in order by their offsets, but the root + * offsets that we need to compare against the index contents might be + * ordered differently. So we might have to "look back" within the + * tuplesort output, but only within the current page. We handle that + * by keeping a bool array in_index[] showing all the + * already-passed-over tuplesort output TIDs of the current page. We + * clear that array here, when advancing onto a new heap page. + */ + if (hscan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + tdeheap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + memset(in_index, 0, sizeof(in_index)); + + root_blkno = hscan->rs_cblock; + } + + /* Convert actual tuple TID to root TID */ + rootTuple = *heapcursor; + root_offnum = ItemPointerGetOffsetNumber(heapcursor); + + if (HeapTupleIsHeapOnly(heapTuple)) + { + root_offnum = root_offsets[root_offnum - 1]; + if (!OffsetNumberIsValid(root_offnum)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", + ItemPointerGetBlockNumber(heapcursor), + ItemPointerGetOffsetNumber(heapcursor), + RelationGetRelationName(heapRelation)))); + ItemPointerSetOffsetNumber(&rootTuple, root_offnum); + } + + /* + * "merge" by skipping through the index tuples until we find or pass + * the current root tuple. + */ + while (!tuplesort_empty && + (!indexcursor || + ItemPointerCompare(indexcursor, &rootTuple) < 0)) + { + Datum ts_val; + bool ts_isnull; + + if (indexcursor) + { + /* + * Remember index items seen earlier on the current heap page + */ + if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) + in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; + } + + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, + false, &ts_val, &ts_isnull, + NULL); + Assert(tuplesort_empty || !ts_isnull); + if (!tuplesort_empty) + { + itemptr_decode(&decoded, DatumGetInt64(ts_val)); + indexcursor = &decoded; + } + else + { + /* Be tidy */ + indexcursor = NULL; + } + } + + /* + * If the tuplesort has overshot *and* we didn't see a match earlier, + * then this tuple is missing from the index, so insert it. + */ + if ((tuplesort_empty || + ItemPointerCompare(indexcursor, &rootTuple) > 0) && + !in_index[root_offnum - 1]) + { + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* Set up for predicate or expression evaluation */ + ExecStoreHeapTuple(heapTuple, slot, false); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use + * in this index, and note which are null. This also performs + * evaluation of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* + * You'd think we should go ahead and build the index tuple here, + * but some index AMs want to do further processing on the data + * first. So pass the values[] and isnull[] arrays, instead. + */ + + /* + * If the tuple is already committed dead, you might think we + * could suppress uniqueness checking, but this is no longer true + * in the presence of HOT, because the insert is actually a proxy + * for a uniqueness check on the whole HOT-chain. 
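/*
 * Editor's sketch, not part of the patch: the validate scan above merges heap
 * TIDs against index TIDs that were spooled into a tuplesort as int64 datums
 * and unpacked with itemptr_decode().  Upstream packs a TID roughly as
 * (block << 16) | offset; this standalone encode/decode pair models that
 * layout (the names here are hypothetical, not the PostgreSQL functions).
 */
#include <assert.h>
#include <stdint.h>

static int64_t
tid_encode(uint32_t block, uint16_t offset)
{
    /* low 16 bits: offset number; next 32 bits: block number */
    return ((int64_t) block << 16) | offset;
}

static void
tid_decode(int64_t encoded, uint32_t *block, uint16_t *offset)
{
    *offset = (uint16_t) (encoded & 0xFFFF);
    *block = (uint32_t) (encoded >> 16);
}

int
main(void)
{
    uint32_t block;
    uint16_t offset;

    tid_decode(tid_encode(123456, 42), &block, &offset);
    assert(block == 123456 && offset == 42);
    return 0;
}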
That is, the + * tuple we have here could be dead because it was already + * HOT-updated, and if so the updating transaction will not have + * thought it should insert index entries. The index AM will + * check the whole HOT-chain and correctly detect a conflict if + * there is one. + */ + + index_insert(indexRelation, + values, + isnull, + &rootTuple, + heapRelation, + indexInfo->ii_Unique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + false, + indexInfo); + + state->tups_inserted += 1; + } + } + + table_endscan(scan); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; +} + +/* + * Return the number of blocks that have been read by this scan since + * starting. This is meant for progress reporting rather than be fully + * accurate: in a parallel scan, workers can be concurrently reading blocks + * further ahead than what we report. + */ +static BlockNumber +pg_tdeam_scan_get_blocks_done(HeapScanDesc hscan) +{ + ParallelBlockTableScanDesc bpscan = NULL; + BlockNumber startblock; + BlockNumber blocks_done; + + if (hscan->rs_base.rs_parallel != NULL) + { + bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; + startblock = bpscan->phs_startblock; + } + else + startblock = hscan->rs_startblock; + + /* + * Might have wrapped around the end of the relation, if startblock was + * not zero. + */ + if (hscan->rs_cblock > startblock) + blocks_done = hscan->rs_cblock - startblock; + else + { + BlockNumber nblocks; + + nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks; + blocks_done = nblocks - startblock + + hscan->rs_cblock; + } + + return blocks_done; +} + + +/* ------------------------------------------------------------------------ + * Miscellaneous callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +/* + * Check to see whether the table needs a TOAST table. It does only if + * (1) there are any toastable attributes, and (2) the maximum length + * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to + * create a toast table for something like "f1 varchar(20)".) + */ +static bool +pg_tdeam_relation_needs_toast_table(Relation rel) +{ + int32 data_length = 0; + bool maxlength_unknown = false; + bool has_toastable_attrs = false; + TupleDesc tupdesc = rel->rd_att; + int32 tuple_length; + int i; + + for (i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + + if (att->attisdropped) + continue; + data_length = att_align_nominal(data_length, att->attalign); + if (att->attlen > 0) + { + /* Fixed-length types are never toastable */ + data_length += att->attlen; + } + else + { + int32 maxlen = type_maximum_size(att->atttypid, + att->atttypmod); + + if (maxlen < 0) + maxlength_unknown = true; + else + data_length += maxlen; + if (att->attstorage != TYPSTORAGE_PLAIN) + has_toastable_attrs = true; + } + } + if (!has_toastable_attrs) + return false; /* nothing to toast? */ + if (maxlength_unknown) + return true; /* any unlimited-length attrs? */ + tuple_length = MAXALIGN(SizeofHeapTupleHeader + + BITMAPLEN(tupdesc->natts)) + + MAXALIGN(data_length); + return (tuple_length > TOAST_TUPLE_THRESHOLD); +} + +/* + * TOAST tables for heap relations are just heap relations. 
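/*
 * Editor's sketch, not part of the patch: the wraparound arithmetic in
 * pg_tdeam_scan_get_blocks_done() above, modelled standalone.  A synchronized
 * scan may start at a nonzero block and wrap past the end of the relation, so
 * "current - start" only holds until the wrap; afterwards the blocks read
 * before the wrap and those read since block zero are added together.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t
blocks_done(uint32_t nblocks, uint32_t startblock, uint32_t current)
{
    if (current > startblock)
        return current - startblock;
    /* wrapped: blocks from startblock to the end, plus the current position */
    return nblocks - startblock + current;
}

int
main(void)
{
    /* 100-block table, scan started at block 90 */
    assert(blocks_done(100, 90, 95) == 5);   /* before the wrap */
    assert(blocks_done(100, 90, 5) == 15);   /* after wrapping past block 99 */
    return 0;
}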
+ */ +static Oid +pg_tdeam_relation_toast_am(Relation rel) +{ + return rel->rd_rel->relam; +} + + +/* ------------------------------------------------------------------------ + * Planner related callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +#define HEAP_OVERHEAD_BYTES_PER_TUPLE \ + (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)) +#define HEAP_USABLE_BYTES_PER_PAGE \ + (BLCKSZ - SizeOfPageHeaderData) + +static void +pg_tdeam_estimate_rel_size(Relation rel, int32 *attr_widths, + BlockNumber *pages, double *tuples, + double *allvisfrac) +{ + table_block_relation_estimate_size(rel, attr_widths, pages, + tuples, allvisfrac, + HEAP_OVERHEAD_BYTES_PER_TUPLE, + HEAP_USABLE_BYTES_PER_PAGE); +} + + +/* ------------------------------------------------------------------------ + * Executor related callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +static bool +pg_tdeam_scan_bitmap_next_block(TableScanDesc scan, + TBMIterateResult *tbmres) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + BlockNumber block = tbmres->blockno; + Buffer buffer; + Snapshot snapshot; + int ntup; + + hscan->rs_cindex = 0; + hscan->rs_ntuples = 0; + + /* + * Ignore any claimed entries past what we think is the end of the + * relation. It may have been extended after the start of our scan (we + * only hold an AccessShareLock, and it could be inserts from this + * backend). We don't take this optimization in SERIALIZABLE isolation + * though, as we need to examine all invisible tuples reachable by the + * index. + */ + if (!IsolationIsSerializable() && block >= hscan->rs_nblocks) + return false; + + /* + * Acquire pin on the target heap page, trading in any pin we held before. + */ + hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, + scan->rs_rd, + block); + hscan->rs_cblock = block; + buffer = hscan->rs_cbuf; + snapshot = scan->rs_snapshot; + + ntup = 0; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + tdeheap_page_prune_opt(scan->rs_rd, buffer); + + /* + * We must hold share lock on the buffer content while examining tuple + * visibility. Afterwards, however, the tuples we have found to be + * visible are guaranteed good as long as we hold the buffer pin. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + /* + * We need two separate strategies for lossy and non-lossy cases. + */ + if (tbmres->ntuples >= 0) + { + /* + * Bitmap is non-lossy, so we just look through the offsets listed in + * tbmres; but we have to follow any HOT chain starting at each such + * offset. + */ + int curslot; + + for (curslot = 0; curslot < tbmres->ntuples; curslot++) + { + OffsetNumber offnum = tbmres->offsets[curslot]; + ItemPointerData tid; + HeapTupleData heapTuple; + + ItemPointerSet(&tid, block, offnum); + if (tdeheap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, + &heapTuple, NULL, true)) + hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + } + } + else + { + /* + * Bitmap is lossy, so we must examine each line pointer on the page. + * But we can ignore HOT chains, since we'll check each tuple anyway. 
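/*
 * Editor's sketch, not part of the patch: one way to read the two macros
 * above.  table_block_relation_estimate_size() (not shown here) derives a
 * tuple density from the usable bytes per page and the per-tuple overhead
 * when it has no better statistics; this standalone model shows the shape of
 * that calculation with assumed constants, not the upstream implementation.
 */
#include <stdio.h>

#define BLOCK_SIZE          8192    /* assumed BLCKSZ */
#define PAGE_HEADER_BYTES   24      /* assumed SizeOfPageHeaderData */
#define TUPLE_OVERHEAD      28      /* assumed MAXALIGN(tuple header) + line pointer */

int
main(void)
{
    double usable_per_page = BLOCK_SIZE - PAGE_HEADER_BYTES;
    double avg_datum_width = 100.0;     /* e.g. taken from column statistics */
    double pages = 1000.0;

    double tuples_per_page = usable_per_page / (avg_datum_width + TUPLE_OVERHEAD);
    printf("estimated tuples: %.0f\n", tuples_per_page * pages);
    return 0;
}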
+ */ + Page page = BufferGetPage(buffer); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + OffsetNumber offnum; + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId lp; + HeapTupleData loctup; + bool valid; + + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) + continue; + loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + loctup.t_len = ItemIdGetLength(lp); + loctup.t_tableOid = scan->rs_rd->rd_id; + ItemPointerSet(&loctup.t_self, block, offnum); + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + if (valid) + { + hscan->rs_vistuples[ntup++] = offnum; + PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, + HeapTupleHeaderGetXmin(loctup.t_data)); + } + HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, + buffer, snapshot); + } + } + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(ntup <= MaxHeapTuplesPerPage); + hscan->rs_ntuples = ntup; + + return ntup > 0; +} + +static bool +pg_tdeam_scan_bitmap_next_tuple(TableScanDesc scan, + TBMIterateResult *tbmres, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + OffsetNumber targoffset; + Page page; + ItemId lp; + + /* + * Out of range? If so, nothing more to look at on this page + */ + if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples) + return false; + + targoffset = hscan->rs_vistuples[hscan->rs_cindex]; + page = BufferGetPage(hscan->rs_cbuf); + lp = PageGetItemId(page, targoffset); + Assert(ItemIdIsNormal(lp)); + + hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + hscan->rs_ctup.t_len = ItemIdGetLength(lp); + hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; + ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset); + + pgstat_count_tdeheap_fetch(scan->rs_rd); + + /* + * Set up the result slot to point to this tuple. Note that the slot + * acquires a pin on the buffer. + */ + ExecStoreBufferHeapTuple(&hscan->rs_ctup, + slot, + hscan->rs_cbuf); + + hscan->rs_cindex++; + + return true; +} + +static bool +pg_tdeam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno; + + /* return false immediately if relation is empty */ + if (hscan->rs_nblocks == 0) + return false; + + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks); + hscan->rs_cblock = blockno; + } + else + { + /* scanning table sequentially */ + + if (hscan->rs_cblock == InvalidBlockNumber) + { + Assert(!hscan->rs_inited); + blockno = hscan->rs_startblock; + } + else + { + Assert(hscan->rs_inited); + + blockno = hscan->rs_cblock + 1; + + if (blockno >= hscan->rs_nblocks) + { + /* wrap to beginning of rel, might not have started at 0 */ + blockno = 0; + } + + /* + * Report our new scan position for synchronization purposes. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. 
+ */ + if (scan->rs_flags & SO_ALLOW_SYNC) + ss_report_location(scan->rs_rd, blockno); + + if (blockno == hscan->rs_startblock) + { + blockno = InvalidBlockNumber; + } + } + } + + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(hscan->rs_cbuf)) + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + hscan->rs_cblock = InvalidBlockNumber; + hscan->rs_inited = false; + + return false; + } + + tdeheapgetpage(scan, blockno); + hscan->rs_inited = true; + + return true; +} + +static bool +pg_tdeam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno = hscan->rs_cblock; + bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0; + + Page page; + bool all_visible; + OffsetNumber maxoffset; + + /* + * When not using pagemode, we must lock the buffer during tuple + * visibility checks. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(hscan->rs_cbuf); + all_visible = PageIsAllVisible(page) && + !scan->rs_snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + OffsetNumber tupoffset; + + CHECK_FOR_INTERRUPTS(); + + /* Ask the tablesample method which tuples to check on this page. */ + tupoffset = tsm->NextSampleTuple(scanstate, + blockno, + maxoffset); + + if (OffsetNumberIsValid(tupoffset)) + { + ItemId itemid; + bool visible; + HeapTuple tuple = &(hscan->rs_ctup); + + /* Skip invalid tuple pointers. */ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + + if (all_visible) + visible = true; + else + visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf, + tuple, tupoffset); + + /* in pagemode, tdeheapgetpage did this for us */ + if (!pagemode) + HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + hscan->rs_cbuf, scan->rs_snapshot); + + /* Try next tuple from same page. */ + if (!visible) + continue; + + /* Found visible tuple, return it. */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf); + + /* Count successfully-fetched tuples as heap fetches */ + pgstat_count_tdeheap_getnext(scan->rs_rd); + + return true; + } + else + { + /* + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecClearTuple(slot); + return false; + } + } + + Assert(0); +} + + +/* ---------------------------------------------------------------------------- + * Helper functions for the above. + * ---------------------------------------------------------------------------- + */ + +/* + * Reconstruct and rewrite the given tuple + * + * We cannot simply copy the tuple as-is, for several reasons: + * + * 1. We'd like to squeeze out the values of any dropped columns, both + * to save space and to ensure we have no corner-case failures. (It's + * possible for example that the new table hasn't got a TOAST table + * and so is unable to store any large values of dropped cols.) + * + * 2. The tuple might not even be legal for the new table; this is + * currently only known to happen as an after-effect of ALTER TABLE + * SET WITHOUT OIDS. 
+ * + * So, we must reconstruct the tuple from component Datums. + */ +static void +reform_and_rewrite_tuple(HeapTuple tuple, + Relation OldHeap, Relation NewHeap, + Datum *values, bool *isnull, RewriteState rwstate) +{ + TupleDesc oldTupDesc = RelationGetDescr(OldHeap); + TupleDesc newTupDesc = RelationGetDescr(NewHeap); + HeapTuple copiedTuple; + int i; + + tdeheap_deform_tuple(tuple, oldTupDesc, values, isnull); + + /* Be sure to null out any dropped columns */ + for (i = 0; i < newTupDesc->natts; i++) + { + if (TupleDescAttr(newTupDesc, i)->attisdropped) + isnull[i] = true; + } + + copiedTuple = tdeheap_form_tuple(newTupDesc, values, isnull); + + /* The heap rewrite module does the rest */ + rewrite_tdeheap_tuple(rwstate, tuple, copiedTuple); + + tdeheap_freetuple(copiedTuple); +} + +/* + * Check visibility of the tuple. + */ +static bool +SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + if (scan->rs_flags & SO_ALLOW_PAGEMODE) + { + /* + * In pageatatime mode, tdeheapgetpage() already did visibility checks, + * so just look at the info it left in rs_vistuples[]. + * + * We use a binary search over the known-sorted array. Note: we could + * save some effort if we insisted that NextSampleTuple select tuples + * in increasing order, but it's not clear that there would be enough + * gain to justify the restriction. + */ + int start = 0, + end = hscan->rs_ntuples - 1; + + while (start <= end) + { + int mid = (start + end) / 2; + OffsetNumber curoffset = hscan->rs_vistuples[mid]; + + if (tupoffset == curoffset) + return true; + else if (tupoffset < curoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* Otherwise, we have to check the tuple individually. */ + return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, + buffer); + } +} + + +/* ------------------------------------------------------------------------ + * Definition of the heap table access method. 
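/*
 * Editor's sketch, not part of the patch: the pagemode branch of
 * SampleHeapTupleVisible() above depends on rs_vistuples[] being sorted so a
 * binary search can answer "did the page-at-a-time pass mark this offset
 * visible?".  Standalone model over a plain sorted array.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
offset_is_visible(const uint16_t *vistuples, int ntuples, uint16_t offset)
{
    int start = 0;
    int end = ntuples - 1;

    while (start <= end)
    {
        int mid = start + (end - start) / 2;

        if (vistuples[mid] == offset)
            return true;
        if (offset < vistuples[mid])
            end = mid - 1;
        else
            start = mid + 1;
    }
    return false;
}

int
main(void)
{
    uint16_t vis[] = {2, 5, 9, 14};     /* offsets found visible on the page */

    assert(offset_is_visible(vis, 4, 9));
    assert(!offset_is_visible(vis, 4, 3));
    return 0;
}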
+ * ------------------------------------------------------------------------ + */ + +static const TableAmRoutine pg_tdeam_methods = { + .type = T_TableAmRoutine, + + .slot_callbacks = pg_tdeam_slot_callbacks, + + .scan_begin = tdeheap_beginscan, + .scan_end = tdeheap_endscan, + .scan_rescan = tdeheap_rescan, + .scan_getnextslot = tdeheap_getnextslot, + + .scan_set_tidrange = tdeheap_set_tidrange, + .scan_getnextslot_tidrange = tdeheap_getnextslot_tidrange, + + .parallelscan_estimate = table_block_parallelscan_estimate, + .parallelscan_initialize = table_block_parallelscan_initialize, + .parallelscan_reinitialize = table_block_parallelscan_reinitialize, + + .index_fetch_begin = pg_tdeam_index_fetch_begin, + .index_fetch_reset = pg_tdeam_index_fetch_reset, + .index_fetch_end = pg_tdeam_index_fetch_end, + .index_fetch_tuple = pg_tdeam_index_fetch_tuple, + + .tuple_insert = pg_tdeam_tuple_insert, + .tuple_insert_speculative = pg_tdeam_tuple_insert_speculative, + .tuple_complete_speculative = pg_tdeam_tuple_complete_speculative, + .multi_insert = tdeheap_multi_insert, + .tuple_delete = pg_tdeam_tuple_delete, + .tuple_update = pg_tdeam_tuple_update, + .tuple_lock = pg_tdeam_tuple_lock, + + .tuple_fetch_row_version = pg_tdeam_fetch_row_version, + .tuple_get_latest_tid = tdeheap_get_latest_tid, + .tuple_tid_valid = pg_tdeam_tuple_tid_valid, + .tuple_satisfies_snapshot = pg_tdeam_tuple_satisfies_snapshot, + .index_delete_tuples = tdeheap_index_delete_tuples, + + .relation_set_new_filelocator = pg_tdeam_relation_set_new_filelocator, + .relation_nontransactional_truncate = pg_tdeam_relation_nontransactional_truncate, + .relation_copy_data = pg_tdeam_relation_copy_data, + .relation_copy_for_cluster = pg_tdeam_relation_copy_for_cluster, + .relation_vacuum = tdeheap_vacuum_rel, + .scan_analyze_next_block = pg_tdeam_scan_analyze_next_block, + .scan_analyze_next_tuple = pg_tdeam_scan_analyze_next_tuple, + .index_build_range_scan = pg_tdeam_index_build_range_scan, + .index_validate_scan = pg_tdeam_index_validate_scan, + + .relation_size = table_block_relation_size, + .relation_needs_toast_table = pg_tdeam_relation_needs_toast_table, + .relation_toast_am = pg_tdeam_relation_toast_am, + .relation_fetch_toast_slice = tdeheap_fetch_toast_slice, + + .relation_estimate_size = pg_tdeam_estimate_rel_size, + + .scan_bitmap_next_block = pg_tdeam_scan_bitmap_next_block, + .scan_bitmap_next_tuple = pg_tdeam_scan_bitmap_next_tuple, + .scan_sample_next_block = pg_tdeam_scan_sample_next_block, + .scan_sample_next_tuple = pg_tdeam_scan_sample_next_tuple +}; + + +const TableAmRoutine * +GetHeapamTableAmRoutine(void) +{ + return &pg_tdeam_methods; +} + +Datum +tdeheap_tableam_handler(PG_FUNCTION_ARGS) +{ + PG_RETURN_POINTER(&pg_tdeam_methods); +} diff --git a/src16/access/pg_tdeam_visibility.c b/src16/access/pg_tdeam_visibility.c new file mode 100644 index 00000000..5e5d184d --- /dev/null +++ b/src16/access/pg_tdeam_visibility.c @@ -0,0 +1,1790 @@ +/*------------------------------------------------------------------------- + * + * pg_tdeam_visibility.c + * Tuple visibility rules for tuples stored in heap. + * + * NOTE: all the HeapTupleSatisfies routines will update the tuple's + * "hint" status bits if we see that the inserting or deleting transaction + * has now committed or aborted (and it is safe to set the hint bits). + * If the hint bits are changed, MarkBufferDirtyHint is called on + * the passed-in buffer. 
The caller must hold not only a pin, but at least + * shared buffer content lock on the buffer containing the tuple. + * + * NOTE: When using a non-MVCC snapshot, we must check + * TransactionIdIsInProgress (which looks in the PGPROC array) before + * TransactionIdDidCommit (which look in pg_xact). Otherwise we have a race + * condition: we might decide that a just-committed transaction crashed, + * because none of the tests succeed. xact.c is careful to record + * commit/abort in pg_xact before it unsets MyProc->xid in the PGPROC array. + * That fixes that problem, but it also means there is a window where + * TransactionIdIsInProgress and TransactionIdDidCommit will both return true. + * If we check only TransactionIdDidCommit, we could consider a tuple + * committed when a later GetSnapshotData call will still think the + * originating transaction is in progress, which leads to application-level + * inconsistency. The upshot is that we gotta check TransactionIdIsInProgress + * first in all code paths, except for a few cases where we are looking at + * subtransactions of our own main transaction and so there can't be any race + * condition. + * + * We can't use TransactionIdDidAbort here because it won't treat transactions + * that were in progress during a crash as aborted. We determine that + * transactions aborted/crashed through process of elimination instead. + * + * When using an MVCC snapshot, we rely on XidInMVCCSnapshot rather than + * TransactionIdIsInProgress, but the logic is otherwise the same: do not + * check pg_xact until after deciding that the xact is no longer in progress. + * + * + * Summary of visibility functions: + * + * HeapTupleSatisfiesMVCC() + * visible to supplied snapshot, excludes current command + * HeapTupleSatisfiesUpdate() + * visible to instant snapshot, with user-supplied command + * counter and more complex result + * HeapTupleSatisfiesSelf() + * visible to instant snapshot and current command + * HeapTupleSatisfiesDirty() + * like HeapTupleSatisfiesSelf(), but includes open transactions + * HeapTupleSatisfiesVacuum() + * visible to any running transaction, used by VACUUM + * HeapTupleSatisfiesNonVacuumable() + * Snapshot-style API for HeapTupleSatisfiesVacuum + * HeapTupleSatisfiesToast() + * visible unless part of interrupted vacuum, used for TOAST + * HeapTupleSatisfiesAny() + * all tuples are visible + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/heap/pg_tdeam_visibility.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/subtrans.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/snapmgr.h" + + +/* + * SetHintBits() + * + * Set commit/abort hint bits on a tuple, if appropriate at this time. + * + * It is only safe to set a transaction-committed hint bit if we know the + * transaction's commit record is guaranteed to be flushed to disk before the + * buffer, or if the table is temporary or unlogged and will be obliterated by + * a crash anyway. 
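/*
 * Editor's sketch, not part of the patch: the ordering rule described in the
 * header comment above, reduced to a standalone decision table.  While a
 * transaction commits there is a window in which pg_xact already says
 * "committed" but the PGPROC entry is not yet cleared, so both observations
 * can be true at once; checking "in progress" first resolves that window
 * conservatively.  Names here are illustrative, not PostgreSQL APIs.
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

static const char *
classify_xact(bool in_progress, bool committed_in_clog)
{
    if (in_progress)
        return "in progress";       /* must win over a concurrent clog update */
    if (committed_in_clog)
        return "committed";
    return "aborted or crashed";    /* determined by elimination */
}

int
main(void)
{
    /* the race window: both observations true -> still "in progress" */
    assert(strcmp(classify_xact(true, true), "in progress") == 0);
    assert(strcmp(classify_xact(false, true), "committed") == 0);
    assert(strcmp(classify_xact(false, false), "aborted or crashed") == 0);
    return 0;
}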
We cannot change the LSN of the page here, because we may + * hold only a share lock on the buffer, so we can only use the LSN to + * interlock this if the buffer's LSN already is newer than the commit LSN; + * otherwise we have to just refrain from setting the hint bit until some + * future re-examination of the tuple. + * + * We can always set hint bits when marking a transaction aborted. (Some + * code in heapam.c relies on that!) + * + * Also, if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then + * we can always set the hint bits, since pre-9.0 VACUUM FULL always used + * synchronous commits and didn't move tuples that weren't previously + * hinted. (This is not known by this subroutine, but is applied by its + * callers.) Note: old-style VACUUM FULL is gone, but we have to keep this + * module's support for MOVED_OFF/MOVED_IN flag bits for as long as we + * support in-place update from pre-9.0 databases. + * + * Normal commits may be asynchronous, so for those we need to get the LSN + * of the transaction and then check whether this is flushed. + * + * The caller should pass xid as the XID of the transaction to check, or + * InvalidTransactionId if no check is needed. + */ +static inline void +SetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid) +{ + if (TransactionIdIsValid(xid)) + { + /* NB: xid must be known committed here! */ + XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); + + if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) && + BufferGetLSNAtomic(buffer) < commitLSN) + { + /* not flushed and no LSN interlock, so don't set hint */ + return; + } + } + + tuple->t_infomask |= infomask; + MarkBufferDirtyHint(buffer, true); +} + +/* + * HeapTupleSetHintBits --- exported version of SetHintBits() + * + * This must be separate because of C99's brain-dead notions about how to + * implement inline functions. + */ +void +HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid) +{ + SetHintBits(tuple, buffer, infomask, xid); +} + + +/* + * HeapTupleSatisfiesSelf + * True iff heap tuple is valid "for itself". + * + * See SNAPSHOT_MVCC's definition for the intended behaviour. + * + * Note: + * Assumes heap tuple is valid. 
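/*
 * Editor's sketch, not part of the patch: the guard at the top of
 * SetHintBits() above, restated as a standalone predicate.  A "committed"
 * hint may be written only if the buffer is not permanent, the commit record
 * is already flushed, or the page LSN is at least the commit LSN (so the page
 * cannot reach disk ahead of the commit record).  Names are illustrative.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t LsnLite;       /* stand-in for XLogRecPtr */

static bool
can_set_committed_hint(bool buffer_is_permanent, LsnLite commit_lsn,
                       LsnLite flushed_lsn, LsnLite page_lsn)
{
    if (!buffer_is_permanent)
        return true;                    /* temp/unlogged: no WAL ordering issue */
    if (commit_lsn <= flushed_lsn)
        return true;                    /* commit record already on disk */
    return page_lsn >= commit_lsn;      /* LSN interlock covers us */
}

int
main(void)
{
    /* commit not flushed and page LSN older: refrain from hinting for now */
    assert(!can_set_committed_hint(true, 500, 400, 300));
    /* commit record already flushed: safe to hint */
    assert(can_set_committed_hint(true, 500, 600, 300));
    return 0;
}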
+ * + * The satisfaction of "itself" requires the following: + * + * ((Xmin == my-transaction && the row was updated by the current transaction, and + * (Xmax is null it was not deleted + * [|| Xmax != my-transaction)]) [or it was deleted by another transaction] + * || + * + * (Xmin is committed && the row was modified by a committed transaction, and + * (Xmax is null || the row has not been deleted, or + * (Xmax != my-transaction && the row was deleted by another transaction + * Xmax is not committed))) that has not been committed + */ +static bool +HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + return false; + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + return false; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; /* updated by other */ + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + + 
/* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + return false; + /* it must have aborted or crashed */ + return true; + } + + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; + } + + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return true; + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + /* xmax transaction committed */ + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + return false; +} + +/* + * HeapTupleSatisfiesAny + * Dummy "satisfies" routine: any tuple satisfies SnapshotAny. + */ +static bool +HeapTupleSatisfiesAny(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + return true; +} + +/* + * HeapTupleSatisfiesToast + * True iff heap tuple is valid as a TOAST row. + * + * See SNAPSHOT_TOAST's definition for the intended behaviour. + * + * This is a simplified version that only checks for VACUUM moving conditions. + * It's appropriate for TOAST usage because TOAST really doesn't want to do + * its own time qual checks; if you can see the main table row that contains + * a TOAST reference, you should be able to see the TOASTed value. However, + * vacuuming a TOAST table is independent of the main table, and in case such + * a vacuum fails partway through, we'd better do this much checking. + * + * Among other things, this means you can't do UPDATEs of rows in a TOAST + * table. + */ +static bool +HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + + /* + * An invalid Xmin can be left behind by a speculative insertion that + * is canceled by super-deleting the tuple. This also applies to + * TOAST tuples created during speculative insertion. 
+ */ + else if (!TransactionIdIsValid(HeapTupleHeaderGetXmin(tuple))) + return false; + } + + /* otherwise assume the tuple is valid for TOAST. */ + return true; +} + +/* + * HeapTupleSatisfiesUpdate + * + * This function returns a more detailed result code than most of the + * functions in this file, since UPDATE needs to know more than "is it + * visible?". It also allows for user-supplied CommandId rather than + * relying on CurrentCommandId. + * + * The possible return codes are: + * + * TM_Invisible: the tuple didn't exist at all when the scan started, e.g. it + * was created by a later CommandId. + * + * TM_Ok: The tuple is valid and visible, so it may be updated. + * + * TM_SelfModified: The tuple was updated by the current transaction, after + * the current scan started. + * + * TM_Updated: The tuple was updated by a committed transaction (including + * the case where the tuple was moved into a different partition). + * + * TM_Deleted: The tuple was deleted by a committed transaction. + * + * TM_BeingModified: The tuple is being updated by an in-progress transaction + * other than the current transaction. (Note: this includes the case where + * the tuple is share-locked by a MultiXact, even if the MultiXact includes + * the current transaction. Callers that want to distinguish that case must + * test for it themselves.) + */ +TM_Result +HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return TM_Invisible; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return TM_Invisible; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return TM_Invisible; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return TM_Invisible; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return TM_Invisible; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (HeapTupleHeaderGetCmin(tuple) >= curcid) + return TM_Invisible; /* inserted after scan started */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return TM_Ok; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + TransactionId xmax; + + xmax = HeapTupleHeaderGetRawXmax(tuple); + + /* + * Careful here: even though this tuple was created by our own + * transaction, it might be locked by other transactions, if + * the original version was key-share locked when we updated + * it. + */ + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + if (MultiXactIdIsRunning(xmax, true)) + return TM_BeingModified; + else + return TM_Ok; + } + + /* + * If the locker is gone, then there is nothing of interest + * left in this Xmax; otherwise, report the tuple as + * locked/updated. 
+ */ + if (!TransactionIdIsInProgress(xmax)) + return TM_Ok; + return TM_BeingModified; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* deleting subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), + false)) + return TM_BeingModified; + return TM_Ok; + } + else + { + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + return TM_Invisible; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return TM_Invisible; + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return TM_Ok; + + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return TM_Ok; + if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) + return TM_Updated; /* updated by other */ + else + return TM_Deleted; /* deleted by other */ + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + if (HEAP_LOCKED_UPGRADED(tuple->t_infomask)) + return TM_Ok; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), true)) + return TM_BeingModified; + + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + return TM_Ok; + } + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + return TM_BeingModified; + } + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + return TM_BeingModified; + + if (TransactionIdDidCommit(xmax)) + { + if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) + return TM_Updated; + else + return TM_Deleted; + } + + /* + * By here, the update in the Xmax is either aborted or crashed, but + * what about the other members? + */ + + if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + { + /* + * There's no member, even just a locker, alive anymore, so we can + * mark the Xmax as invalid. 
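/*
 * Editor's sketch, not part of the patch: once xmax is known committed,
 * HeapTupleSatisfiesUpdate() above distinguishes TM_Updated from TM_Deleted
 * by whether t_ctid still points at the tuple itself.  An UPDATE leaves ctid
 * pointing at the successor version; a DELETE leaves it pointing back at the
 * tuple.  Standalone model with illustrative types.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint32_t block; uint16_t offset; } ItemPtrLite;

static bool
tid_equal(ItemPtrLite a, ItemPtrLite b)
{
    return a.block == b.block && a.offset == b.offset;
}

static const char *
committed_xmax_result(ItemPtrLite self, ItemPtrLite ctid)
{
    return tid_equal(self, ctid) ? "TM_Deleted" : "TM_Updated";
}

int
main(void)
{
    ItemPtrLite self = {10, 3};
    ItemPtrLite successor = {10, 7};    /* ctid of the newer row version */

    assert(strcmp(committed_xmax_result(self, successor), "TM_Updated") == 0);
    assert(strcmp(committed_xmax_result(self, self), "TM_Deleted") == 0);
    return 0;
}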
+ */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + else + { + /* There are lockers running */ + return TM_BeingModified; + } + } + + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return TM_BeingModified; + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return TM_BeingModified; + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + + /* xmax transaction committed */ + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) + return TM_Updated; /* updated by other */ + else + return TM_Deleted; /* deleted by other */ +} + +/* + * HeapTupleSatisfiesDirty + * True iff heap tuple is valid including effects of open transactions. + * + * See SNAPSHOT_DIRTY's definition for the intended behaviour. + * + * This is essentially like HeapTupleSatisfiesSelf as far as effects of + * the current transaction and committed/aborted xacts are concerned. + * However, we also include the effects of other xacts still in progress. + * + * A special hack is that the passed-in snapshot struct is used as an + * output argument to return the xids of concurrent xacts that affected the + * tuple. snapshot->xmin is set to the tuple's xmin if that is another + * transaction that's still in progress; or to InvalidTransactionId if the + * tuple's xmin is committed good, committed dead, or my own xact. + * Similarly for snapshot->xmax and the tuple's xmax. If the tuple was + * inserted speculatively, meaning that the inserter might still back down + * on the insertion without aborting the whole transaction, the associated + * token is also returned in snapshot->speculativeToken. 
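/*
 * Editor's sketch, not part of the patch: how a caller typically consumes the
 * output fields described above after a dirty-snapshot fetch.  If xmin or
 * xmax comes back as a live transaction (or a speculative token is set), the
 * caller waits for it and retries rather than deciding visibility itself.
 * This is a hypothetical, simplified model; the struct and function names are
 * not PostgreSQL APIs.
 */
#include <assert.h>
#include <stdint.h>

typedef uint32_t XidLite;
#define INVALID_XID 0

typedef struct
{
    XidLite  xmin;              /* in-progress inserter, if any */
    XidLite  xmax;              /* in-progress deleter/updater, if any */
    uint32_t speculative_token; /* nonzero while a speculative insert is unresolved */
} DirtySnapshotResult;

/* Pick the transaction to wait for before retrying, if any. */
static XidLite
xid_to_wait_for(const DirtySnapshotResult *r)
{
    return (r->xmin != INVALID_XID) ? r->xmin : r->xmax;
}

int
main(void)
{
    DirtySnapshotResult r = {.xmin = INVALID_XID, .xmax = 1234,
                             .speculative_token = 0};

    assert(xid_to_wait_for(&r) == 1234);
    return 0;
}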
+ */ +static bool +HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + snapshot->xmin = snapshot->xmax = InvalidTransactionId; + snapshot->speculativeToken = 0; + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + return false; + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + { + /* + * Return the speculative token to caller. Caller can worry about + * xmax, since it requires a conclusively locked row version, and + * a concurrent update to this tuple is a conflict of its + * purposes. + */ + if (HeapTupleHeaderIsSpeculative(tuple)) + { + snapshot->speculativeToken = + HeapTupleHeaderGetSpeculativeToken(tuple); + + Assert(snapshot->speculativeToken != 0); + } + + snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple); + /* XXX shouldn't we fall through to look at xmax? 
*/ + return true; /* in insertion by other */ + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; /* updated by other */ + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + { + snapshot->xmax = xmax; + return true; + } + if (TransactionIdDidCommit(xmax)) + return false; + /* it must have aborted or crashed */ + return true; + } + + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; + } + + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + { + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple); + return true; + } + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + /* xmax transaction committed */ + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + return false; /* updated by other */ +} + +/* + * HeapTupleSatisfiesMVCC + * True iff heap tuple is valid for the given MVCC snapshot. + * + * See SNAPSHOT_MVCC's definition for the intended behaviour. + * + * Notice that here, we will not update the tuple status hint bits if the + * inserting/deleting transaction is still running according to our snapshot, + * even if in reality it's committed or aborted by now. This is intentional. + * Checking the true transaction state would require access to high-traffic + * shared data structures, creating contention we'd rather do without, and it + * would not change the result of our visibility check anyway. The hint bits + * will be updated by the first visitor that has a snapshot new enough to see + * the inserting/deleting transaction as done. In the meantime, the cost of + * leaving the hint bits unset is basically that each HeapTupleSatisfiesMVCC + * call will need to run TransactionIdIsCurrentTransactionId in addition to + * XidInMVCCSnapshot (but it would have to do the latter anyway). In the old + * coding where we tried to set the hint bits as soon as possible, we instead + * did TransactionIdIsInProgress in each call --- to no avail, as long as the + * inserting/deleting transaction was still running --- which was more cycles + * and more contention on ProcArrayLock. 
+ */ +static bool +HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!XidInMVCCSnapshot(xvac, snapshot)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (XidInMVCCSnapshot(xvac, snapshot)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) + return false; /* inserted after scan started */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* updated after scan started */ + else + return false; /* updated before scan started */ + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) + return false; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + else + { + /* xmin is committed, but maybe not according to our snapshot */ + if (!HeapTupleHeaderXminFrozen(tuple) && + XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) + return false; /* treat as still in progress */ + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = 
HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + if (XidInMVCCSnapshot(xmax, snapshot)) + return true; + if (TransactionIdDidCommit(xmax)) + return false; /* updating transaction committed */ + /* it must have aborted or crashed */ + return true; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + + if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) + return true; + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + /* xmax transaction committed */ + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + } + else + { + /* xmax is committed, but maybe not according to our snapshot */ + if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) + return true; /* treat as still in progress */ + } + + /* xmax transaction committed */ + + return false; +} + + +/* + * HeapTupleSatisfiesVacuum + * + * Determine the status of tuples for VACUUM purposes. Here, what + * we mainly want to know is if a tuple is potentially visible to *any* + * running transaction. If so, it can't be removed yet by VACUUM. + * + * OldestXmin is a cutoff XID (obtained from + * GetOldestNonRemovableTransactionId()). Tuples deleted by XIDs >= + * OldestXmin are deemed "recently dead"; they might still be visible to some + * open transaction, so we can't remove them, even if we see that the deleting + * transaction has committed. + */ +HTSV_Result +HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, + Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (TransactionIdPrecedes(dead_after, OldestXmin)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res; +} + +/* + * Work horse for HeapTupleSatisfiesVacuum and similar routines. + * + * In contrast to HeapTupleSatisfiesVacuum this routine, when encountering a + * tuple that could still be visible to some backend, stores the xid that + * needs to be compared with the horizon in *dead_after, and returns + * HEAPTUPLE_RECENTLY_DEAD. The caller then can perform the comparison with + * the horizon. This is e.g. useful when comparing with different horizons. + * + * Note: HEAPTUPLE_DEAD can still be returned here, e.g. if the inserting + * transaction aborted. + */ +HTSV_Result +HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + Assert(dead_after != NULL); + + *dead_after = InvalidTransactionId; + + /* + * Has inserting transaction committed? 
+ * + * If the inserting transaction aborted, then the tuple was never visible + * to any other transaction, so we can delete it immediately. + */ + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return HEAPTUPLE_DEAD; + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return HEAPTUPLE_DELETE_IN_PROGRESS; + if (TransactionIdIsInProgress(xvac)) + return HEAPTUPLE_DELETE_IN_PROGRESS; + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HEAPTUPLE_DEAD; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return HEAPTUPLE_INSERT_IN_PROGRESS; + if (TransactionIdIsInProgress(xvac)) + return HEAPTUPLE_INSERT_IN_PROGRESS; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HEAPTUPLE_DEAD; + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return HEAPTUPLE_INSERT_IN_PROGRESS; + /* only locked? run infomask-only check first, for performance */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tuple)) + return HEAPTUPLE_INSERT_IN_PROGRESS; + /* inserted and then deleted by same xact */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple))) + return HEAPTUPLE_DELETE_IN_PROGRESS; + /* deleting subtransaction must have aborted */ + return HEAPTUPLE_INSERT_IN_PROGRESS; + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + { + /* + * It'd be possible to discern between INSERT/DELETE in progress + * here by looking at xmax - but that doesn't seem beneficial for + * the majority of callers and even detrimental for some. We'd + * rather have callers look at/wait for xmin than xmax. It's + * always correct to return INSERT_IN_PROGRESS because that's + * what's happening from the view of other backends. + */ + return HEAPTUPLE_INSERT_IN_PROGRESS; + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed + */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HEAPTUPLE_DEAD; + } + + /* + * At this point the xmin is known committed, but we might not have + * been able to set the hint bit yet; so we can no longer Assert that + * it's set. + */ + } + + /* + * Okay, the inserter committed, so it was good at some point. Now what + * about the deleting transaction? + */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return HEAPTUPLE_LIVE; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + /* + * "Deleting" xact really only locked it, so the tuple is live in any + * case. However, we should make sure that either XMAX_COMMITTED or + * XMAX_INVALID gets set once the xact is gone, to reduce the costs of + * examining the tuple for future xacts. 
+ */ + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + /* + * If it's a pre-pg_upgrade tuple, the multixact cannot + * possibly be running; otherwise have to check. + */ + if (!HEAP_LOCKED_UPGRADED(tuple->t_infomask) && + MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), + true)) + return HEAPTUPLE_LIVE; + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + } + else + { + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return HEAPTUPLE_LIVE; + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + } + } + + /* + * We don't really care whether xmax did commit, abort or crash. We + * know that xmax did lock the tuple, but it did not and will never + * actually update it. + */ + + return HEAPTUPLE_LIVE; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax = HeapTupleGetUpdateXid(tuple); + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsInProgress(xmax)) + return HEAPTUPLE_DELETE_IN_PROGRESS; + else if (TransactionIdDidCommit(xmax)) + { + /* + * The multixact might still be running due to lockers. Need to + * allow for pruning if below the xid horizon regardless -- + * otherwise we could end up with a tuple where the updater has to + * be removed due to the horizon, but is not pruned away. It's + * not a problem to prune that tuple, because any remaining + * lockers will also be present in newer tuple versions. + */ + *dead_after = xmax; + return HEAPTUPLE_RECENTLY_DEAD; + } + else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed. + * Mark the Xmax as invalid. + */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + } + + return HEAPTUPLE_LIVE; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return HEAPTUPLE_DELETE_IN_PROGRESS; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + else + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed + */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return HEAPTUPLE_LIVE; + } + + /* + * At this point the xmax is known committed, but we might not have + * been able to set the hint bit yet; so we can no longer Assert that + * it's set. + */ + } + + /* + * Deleter committed, allow caller to check if it was recent enough that + * some open transactions could still see the tuple. + */ + *dead_after = HeapTupleHeaderGetRawXmax(tuple); + return HEAPTUPLE_RECENTLY_DEAD; +} + + +/* + * HeapTupleSatisfiesNonVacuumable + * + * True if tuple might be visible to some transaction; false if it's + * surely dead to everyone, ie, vacuumable. + * + * See SNAPSHOT_NON_VACUUMABLE's definition for the intended behaviour. + * + * This is an interface to HeapTupleSatisfiesVacuum that's callable via + * HeapTupleSatisfiesSnapshot, so it can be used through a Snapshot. + * snapshot->vistest must have been set up with the horizon to use. 
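+ *
+ * For illustration only (describing core PostgreSQL usage, not anything
+ * specific to this copy): the planner's get_actual_variable_range() sets up
+ * such a snapshot when probing btree endpoints, so that index entries whose
+ * heap tuples are already removable are skipped rather than returned.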
+ */ +static bool +HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (GlobalVisTestIsRemovableXid(snapshot->vistest, dead_after)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res != HEAPTUPLE_DEAD; +} + + +/* + * HeapTupleIsSurelyDead + * + * Cheaply determine whether a tuple is surely dead to all onlookers. + * We sometimes use this in lieu of HeapTupleSatisfiesVacuum when the + * tuple has just been tested by another visibility routine (usually + * HeapTupleSatisfiesMVCC) and, therefore, any hint bits that can be set + * should already be set. We assume that if no hint bits are set, the xmin + * or xmax transaction is still running. This is therefore faster than + * HeapTupleSatisfiesVacuum, because we consult neither procarray nor CLOG. + * It's okay to return false when in doubt, but we must return true only + * if the tuple is removable. + */ +bool +HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + /* + * If the inserting transaction is marked invalid, then it aborted, and + * the tuple is definitely dead. If it's marked neither committed nor + * invalid, then we assume it's still alive (since the presumption is that + * all relevant hint bits were just set moments ago). + */ + if (!HeapTupleHeaderXminCommitted(tuple)) + return HeapTupleHeaderXminInvalid(tuple); + + /* + * If the inserting transaction committed, but any deleting transaction + * aborted, the tuple is still alive. + */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return false; + + /* + * If the XMAX is just a lock, the tuple is still alive. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return false; + + /* + * If the Xmax is a MultiXact, it might be dead or alive, but we cannot + * know without checking pg_multixact. + */ + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + return false; + + /* If deleter isn't known to have committed, assume it's still running. */ + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + return false; + + /* Deleter committed, so tuple is dead if the XID is old enough. */ + return GlobalVisTestIsRemovableXid(vistest, + HeapTupleHeaderGetRawXmax(tuple)); +} + +/* + * Is the tuple really only locked? That is, is it not updated? + * + * It's easy to check just infomask bits if the locker is not a multi; but + * otherwise we need to verify that the updating transaction has not aborted. + * + * This function is here because it follows the same visibility rules laid out + * at the top of this file. + */ +bool +HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) +{ + TransactionId xmax; + + /* if there's no valid Xmax, then there's obviously no update either */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return true; + + if (tuple->t_infomask & HEAP_XMAX_LOCK_ONLY) + return true; + + /* invalid xmax means no update */ + if (!TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple))) + return true; + + /* + * if HEAP_XMAX_LOCK_ONLY is not set and not a multi, then this must + * necessarily have been updated + */ + if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) + return false; + + /* ... 
but if it's a multi, then perhaps the updating Xid aborted. */ + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + return false; + if (TransactionIdDidCommit(xmax)) + return false; + + /* + * not current, not in progress, not committed -- must have aborted or + * crashed + */ + return true; +} + +/* + * check whether the transaction id 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return num > 0 && + bsearch(&xid, xip, num, sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * See the comments for HeapTupleSatisfiesMVCC for the semantics this function + * obeys. + * + * Only usable on tuples from catalog tables! + * + * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support + * reading catalog pages which couldn't have been created in an older version. + * + * We don't set any hint bits in here as it seems unlikely to be beneficial as + * those should already be set by normal access and it seems to be too + * dangerous to do so as the semantics of doing so during timetravel are more + * complicated than when dealing "only" with the present. + */ +static bool +HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple); + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + /* inserting transaction aborted */ + if (HeapTupleHeaderXminInvalid(tuple)) + { + Assert(!TransactionIdDidCommit(xmin)); + return false; + } + /* check if it's one of our txids, toplevel is also in there */ + else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple); + CommandId cmax = InvalidCommandId; + + /* + * another transaction might have (tried to) delete this tuple or + * cmin/cmax was stored in a combo CID. So we need to lookup the + * actual values externally. + */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + /* + * If we haven't resolved the combo CID to cmin/cmax, that means we + * have not decoded the combo CID yet. That means the cmin is + * definitely in the future, and we're not supposed to see the tuple + * yet. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combo CIDs. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combo CID + * assigned, and error out based on this (when unable to resolve combo + * CID below that observed maximum value). + */ + if (!resolved) + return false; + + Assert(cmin != InvalidCommandId); + + if (cmin >= snapshot->curcid) + return false; /* inserted after scan started */ + /* fall through */ + } + /* committed before our xmin horizon. Do a normal visibility check. 
*/ + else if (TransactionIdPrecedes(xmin, snapshot->xmin)) + { + Assert(!(HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin))); + + /* check for hint bit first, consult clog afterwards */ + if (!HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin)) + return false; + /* fall through */ + } + /* beyond our xmax horizon, i.e. invisible */ + else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax)) + { + return false; + } + /* check if it's a committed transaction in [xmin, xmax) */ + else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt)) + { + /* fall through */ + } + + /* + * none of the above, i.e. between [xmin, xmax) but hasn't committed. I.e. + * invisible. + */ + else + { + return false; + } + + /* at this point we know xmin is visible, go on to check xmax */ + + /* xid invalid or aborted */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return true; + /* locked tuples are always visible */ + else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + /* + * We can see multis here if we're looking at user tables or if somebody + * SELECT ... FOR SHARE/UPDATE a system table. + */ + else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + xmax = HeapTupleGetUpdateXid(tuple); + } + + /* check if it's one of our txids, toplevel is also in there */ + if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin; + CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple); + + /* Lookup actual cmin/cmax values */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + /* + * If we haven't resolved the combo CID to cmin/cmax, that means we + * have not decoded the combo CID yet. That means the cmax is + * definitely in the future, and we're still supposed to see the + * tuple. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combo CIDs. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combo CID + * assigned, and error out based on this (when unable to resolve combo + * CID below that observed maximum value). + */ + if (!resolved || cmax == InvalidCommandId) + return true; + + if (cmax >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + /* below xmin horizon, normal transaction state is valid */ + else if (TransactionIdPrecedes(xmax, snapshot->xmin)) + { + Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED && + !TransactionIdDidCommit(xmax))); + + /* check hint bit first */ + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + return false; + + /* check clog */ + return !TransactionIdDidCommit(xmax); + } + /* above xmax horizon, we cannot possibly see the deleting transaction */ + else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax)) + return true; + /* xmax is between [xmin, xmax), check known committed array */ + else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt)) + return false; + /* xmax is between [xmin, xmax), but known not to have committed yet */ + else + return true; +} + +/* + * HeapTupleSatisfiesVisibility + * True iff heap tuple satisfies a time qual. 
+ * + * Notes: + * Assumes heap tuple is valid, and buffer at least share locked. + * + * Hint bits in the HeapTuple's t_infomask may be updated as a side effect; + * if so, the indicated buffer is marked dirty. + */ +bool +HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + switch (snapshot->snapshot_type) + { + case SNAPSHOT_MVCC: + return HeapTupleSatisfiesMVCC(htup, snapshot, buffer); + case SNAPSHOT_SELF: + return HeapTupleSatisfiesSelf(htup, snapshot, buffer); + case SNAPSHOT_ANY: + return HeapTupleSatisfiesAny(htup, snapshot, buffer); + case SNAPSHOT_TOAST: + return HeapTupleSatisfiesToast(htup, snapshot, buffer); + case SNAPSHOT_DIRTY: + return HeapTupleSatisfiesDirty(htup, snapshot, buffer); + case SNAPSHOT_HISTORIC_MVCC: + return HeapTupleSatisfiesHistoricMVCC(htup, snapshot, buffer); + case SNAPSHOT_NON_VACUUMABLE: + return HeapTupleSatisfiesNonVacuumable(htup, snapshot, buffer); + } + + return false; /* keep compiler quiet */ +} diff --git a/src16/access/pg_tdetoast.c b/src16/access/pg_tdetoast.c new file mode 100644 index 00000000..56b7de0c --- /dev/null +++ b/src16/access/pg_tdetoast.c @@ -0,0 +1,793 @@ +/*------------------------------------------------------------------------- + * + * heaptoast.c + * Heap-specific definitions for external and compressed storage + * of variable size attributes. + * + * Copyright (c) 2000-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/access/heap/heaptoast.c + * + * + * INTERFACE ROUTINES + * tdeheap_toast_insert_or_update - + * Try to make a given tuple fit into one page by compressing + * or moving off attributes + * + * tdeheap_toast_delete - + * Reclaim toast storage when a tuple is deleted + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/toast_helper.h" +#include "access/toast_internals.h" +#include "utils/fmgroids.h" + + +/* ---------- + * tdeheap_toast_delete - + * + * Cascaded delete toast-entries on DELETE + * ---------- + */ +void +tdeheap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative) +{ + TupleDesc tupleDesc; + Datum toast_values[MaxHeapAttributeNumber]; + bool toast_isnull[MaxHeapAttributeNumber]; + + /* + * We should only ever be called for tuples of plain relations or + * materialized views --- recursing on a toast rel is bad news. + */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW); + + /* + * Get the tuple descriptor and break down the tuple into fields. + * + * NOTE: it's debatable whether to use tdeheap_deform_tuple() here or just + * tdeheap_getattr() only the varlena columns. The latter could win if there + * are few varlena columns and many non-varlena ones. However, + * tdeheap_deform_tuple costs only O(N) while the tdeheap_getattr way would cost + * O(N^2) if there are many varlena columns, so it seems better to err on + * the side of linear cost. (We won't even be here unless there's at + * least one varlena column, by the way.) + */ + tupleDesc = rel->rd_att; + + Assert(tupleDesc->natts <= MaxHeapAttributeNumber); + tdeheap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull); + + /* Do the real work. 
*/ + toast_delete_external(rel, toast_values, toast_isnull, is_speculative); +} + + +/* ---------- + * tdeheap_toast_insert_or_update - + * + * Delete no-longer-used toast-entries and create new ones to + * make the new tuple fit on INSERT or UPDATE + * + * Inputs: + * newtup: the candidate new tuple to be inserted + * oldtup: the old row version for UPDATE, or NULL for INSERT + * options: options to be passed to tdeheap_insert() for toast rows + * Result: + * either newtup if no toasting is needed, or a palloc'd modified tuple + * that is what should actually get stored + * + * NOTE: neither newtup nor oldtup will be modified. This is a change + * from the pre-8.1 API of this routine. + * ---------- + */ +HeapTuple +tdeheap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, + int options) +{ + HeapTuple result_tuple; + TupleDesc tupleDesc; + int numAttrs; + + Size maxDataLen; + Size hoff; + + bool toast_isnull[MaxHeapAttributeNumber]; + bool toast_oldisnull[MaxHeapAttributeNumber]; + Datum toast_values[MaxHeapAttributeNumber]; + Datum toast_oldvalues[MaxHeapAttributeNumber]; + ToastAttrInfo toast_attr[MaxHeapAttributeNumber]; + ToastTupleContext ttc; + + /* + * Ignore the INSERT_SPECULATIVE option. Speculative insertions/super + * deletions just normally insert/delete the toast values. It seems + * easiest to deal with that here, instead on, potentially, multiple + * callers. + */ + options &= ~HEAP_INSERT_SPECULATIVE; + + /* + * We should only ever be called for tuples of plain relations or + * materialized views --- recursing on a toast rel is bad news. + */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW); + + /* + * Get the tuple descriptor and break down the tuple(s) into fields. + */ + tupleDesc = rel->rd_att; + numAttrs = tupleDesc->natts; + + Assert(numAttrs <= MaxHeapAttributeNumber); + tdeheap_deform_tuple(newtup, tupleDesc, toast_values, toast_isnull); + if (oldtup != NULL) + tdeheap_deform_tuple(oldtup, tupleDesc, toast_oldvalues, toast_oldisnull); + + /* ---------- + * Prepare for toasting + * ---------- + */ + ttc.ttc_rel = rel; + ttc.ttc_values = toast_values; + ttc.ttc_isnull = toast_isnull; + if (oldtup == NULL) + { + ttc.ttc_oldvalues = NULL; + ttc.ttc_oldisnull = NULL; + } + else + { + ttc.ttc_oldvalues = toast_oldvalues; + ttc.ttc_oldisnull = toast_oldisnull; + } + ttc.ttc_attr = toast_attr; + toast_tuple_init(&ttc); + + /* ---------- + * Compress and/or save external until data fits into target length + * + * 1: Inline compress attributes with attstorage EXTENDED, and store very + * large attributes with attstorage EXTENDED or EXTERNAL external + * immediately + * 2: Store attributes with attstorage EXTENDED or EXTERNAL external + * 3: Inline compress attributes with attstorage MAIN + * 4: Store attributes with attstorage MAIN external + * ---------- + */ + + /* compute header overhead --- this should match tdeheap_form_tuple() */ + hoff = SizeofHeapTupleHeader; + if ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) + hoff += BITMAPLEN(numAttrs); + hoff = MAXALIGN(hoff); + /* now convert to a limit on the tuple data size */ + maxDataLen = RelationGetToastTupleTarget(rel, TOAST_TUPLE_TARGET) - hoff; + + /* + * Look for attributes with attstorage EXTENDED to compress. Also find + * large attributes with attstorage EXTENDED or EXTERNAL, and store them + * external. 
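+ *
+ * Rough illustration, assuming the defaults for an 8 kB block size where
+ * the toast tuple target is about 2 kB: a 5 kB row enters the loop below
+ * and keeps shrinking its largest EXTENDED attribute, by inline compression
+ * or by moving it out of line, until the inline data fits under maxDataLen
+ * or no suitable attribute remains.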
+ */ + while (tdeheap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, true, false); + if (biggest_attno < 0) + break; + + /* + * Attempt to compress it inline, if it has attstorage EXTENDED + */ + if (TupleDescAttr(tupleDesc, biggest_attno)->attstorage == TYPSTORAGE_EXTENDED) + toast_tuple_try_compression(&ttc, biggest_attno); + else + { + /* + * has attstorage EXTERNAL, ignore on subsequent compression + * passes + */ + toast_attr[biggest_attno].tai_colflags |= TOASTCOL_INCOMPRESSIBLE; + } + + /* + * If this value is by itself more than maxDataLen (after compression + * if any), push it out to the toast table immediately, if possible. + * This avoids uselessly compressing other fields in the common case + * where we have one long field and several short ones. + * + * XXX maybe the threshold should be less than maxDataLen? + */ + if (toast_attr[biggest_attno].tai_size > maxDataLen && + rel->rd_rel->reltoastrelid != InvalidOid) + tdeheap_toast_tuple_externalize(&ttc, biggest_attno, options); + } + + /* + * Second we look for attributes of attstorage EXTENDED or EXTERNAL that + * are still inline, and make them external. But skip this if there's no + * toast table to push them to. + */ + while (tdeheap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen && + rel->rd_rel->reltoastrelid != InvalidOid) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, false, false); + if (biggest_attno < 0) + break; + tdeheap_toast_tuple_externalize(&ttc, biggest_attno, options); + } + + /* + * Round 3 - this time we take attributes with storage MAIN into + * compression + */ + while (tdeheap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, true, true); + if (biggest_attno < 0) + break; + + toast_tuple_try_compression(&ttc, biggest_attno); + } + + /* + * Finally we store attributes of type MAIN externally. At this point we + * increase the target tuple size, so that MAIN attributes aren't stored + * externally unless really necessary. + */ + maxDataLen = TOAST_TUPLE_TARGET_MAIN - hoff; + + while (tdeheap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen && + rel->rd_rel->reltoastrelid != InvalidOid) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, false, true); + if (biggest_attno < 0) + break; + + tdeheap_toast_tuple_externalize(&ttc, biggest_attno, options); + } + + /* + * In the case we toasted any values, we need to build a new heap tuple + * with the changed values. + */ + if ((ttc.ttc_flags & TOAST_NEEDS_CHANGE) != 0) + { + HeapTupleHeader olddata = newtup->t_data; + HeapTupleHeader new_data; + int32 new_header_len; + int32 new_data_len; + int32 new_tuple_len; + + /* + * Calculate the new size of the tuple. + * + * Note: we used to assume here that the old tuple's t_hoff must equal + * the new_header_len value, but that was incorrect. The old tuple + * might have a smaller-than-current natts, if there's been an ALTER + * TABLE ADD COLUMN since it was stored; and that would lead to a + * different conclusion about the size of the null bitmap, or even + * whether there needs to be one at all. 
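+ *
+ * For example, a tuple that was stored with nulls while the table had 8
+ * columns carries a one-byte null bitmap (BITMAPLEN(8) == 1), whereas with
+ * the current natts of 9 after ALTER TABLE ADD COLUMN the bitmap needs two
+ * bytes (BITMAPLEN(9) == 2), so the old t_hoff and new_header_len can
+ * legitimately differ.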
+ */ + new_header_len = SizeofHeapTupleHeader; + if ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) + new_header_len += BITMAPLEN(numAttrs); + new_header_len = MAXALIGN(new_header_len); + new_data_len = tdeheap_compute_data_size(tupleDesc, + toast_values, toast_isnull); + new_tuple_len = new_header_len + new_data_len; + + /* + * Allocate and zero the space needed, and fill HeapTupleData fields. + */ + result_tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + new_tuple_len); + result_tuple->t_len = new_tuple_len; + result_tuple->t_self = newtup->t_self; + result_tuple->t_tableOid = newtup->t_tableOid; + new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE); + result_tuple->t_data = new_data; + + /* + * Copy the existing tuple header, but adjust natts and t_hoff. + */ + memcpy(new_data, olddata, SizeofHeapTupleHeader); + HeapTupleHeaderSetNatts(new_data, numAttrs); + new_data->t_hoff = new_header_len; + + /* Copy over the data, and fill the null bitmap if needed */ + tdeheap_fill_tuple(tupleDesc, + toast_values, + toast_isnull, + (char *) new_data + new_header_len, + new_data_len, + &(new_data->t_infomask), + ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) ? + new_data->t_bits : NULL); + } + else + result_tuple = newtup; + + toast_tuple_cleanup(&ttc); + + return result_tuple; +} + + +/* ---------- + * toast_flatten_tuple - + * + * "Flatten" a tuple to contain no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * + * Note: we expect the caller already checked HeapTupleHasExternal(tup), + * so there is no need for a short-circuit path. + * ---------- + */ +HeapTuple +toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc) +{ + HeapTuple new_tuple; + int numAttrs = tupleDesc->natts; + int i; + Datum toast_values[MaxTupleAttributeNumber]; + bool toast_isnull[MaxTupleAttributeNumber]; + bool toast_free[MaxTupleAttributeNumber]; + + /* + * Break down the tuple into fields. + */ + Assert(numAttrs <= MaxTupleAttributeNumber); + tdeheap_deform_tuple(tup, tupleDesc, toast_values, toast_isnull); + + memset(toast_free, 0, numAttrs * sizeof(bool)); + + for (i = 0; i < numAttrs; i++) + { + /* + * Look at non-null varlena attributes + */ + if (!toast_isnull[i] && TupleDescAttr(tupleDesc, i)->attlen == -1) + { + struct varlena *new_value; + + new_value = (struct varlena *) DatumGetPointer(toast_values[i]); + if (VARATT_IS_EXTERNAL(new_value)) + { + new_value = detoast_external_attr(new_value); + toast_values[i] = PointerGetDatum(new_value); + toast_free[i] = true; + } + } + } + + /* + * Form the reconfigured tuple. + */ + new_tuple = tdeheap_form_tuple(tupleDesc, toast_values, toast_isnull); + + /* + * Be sure to copy the tuple's identity fields. We also make a point of + * copying visibility info, just in case anybody looks at those fields in + * a syscache entry. 
+ */ + new_tuple->t_self = tup->t_self; + new_tuple->t_tableOid = tup->t_tableOid; + + new_tuple->t_data->t_choice = tup->t_data->t_choice; + new_tuple->t_data->t_ctid = tup->t_data->t_ctid; + new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; + new_tuple->t_data->t_infomask |= + tup->t_data->t_infomask & HEAP_XACT_MASK; + new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; + new_tuple->t_data->t_infomask2 |= + tup->t_data->t_infomask2 & HEAP2_XACT_MASK; + + /* + * Free allocated temp values + */ + for (i = 0; i < numAttrs; i++) + if (toast_free[i]) + pfree(DatumGetPointer(toast_values[i])); + + return new_tuple; +} + + +/* ---------- + * toast_flatten_tuple_to_datum - + * + * "Flatten" a tuple containing out-of-line toasted fields into a Datum. + * The result is always palloc'd in the current memory context. + * + * We have a general rule that Datums of container types (rows, arrays, + * ranges, etc) must not contain any external TOAST pointers. Without + * this rule, we'd have to look inside each Datum when preparing a tuple + * for storage, which would be expensive and would fail to extend cleanly + * to new sorts of container types. + * + * However, we don't want to say that tuples represented as HeapTuples + * can't contain toasted fields, so instead this routine should be called + * when such a HeapTuple is being converted into a Datum. + * + * While we're at it, we decompress any compressed fields too. This is not + * necessary for correctness, but reflects an expectation that compression + * will be more effective if applied to the whole tuple not individual + * fields. We are not so concerned about that that we want to deconstruct + * and reconstruct tuples just to get rid of compressed fields, however. + * So callers typically won't call this unless they see that the tuple has + * at least one external field. + * + * On the other hand, in-line short-header varlena fields are left alone. + * If we "untoasted" them here, they'd just get changed back to short-header + * format anyway within tdeheap_fill_tuple. + * ---------- + */ +Datum +toast_flatten_tuple_to_datum(HeapTupleHeader tup, + uint32 tup_len, + TupleDesc tupleDesc) +{ + HeapTupleHeader new_data; + int32 new_header_len; + int32 new_data_len; + int32 new_tuple_len; + HeapTupleData tmptup; + int numAttrs = tupleDesc->natts; + int i; + bool has_nulls = false; + Datum toast_values[MaxTupleAttributeNumber]; + bool toast_isnull[MaxTupleAttributeNumber]; + bool toast_free[MaxTupleAttributeNumber]; + + /* Build a temporary HeapTuple control structure */ + tmptup.t_len = tup_len; + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tup; + + /* + * Break down the tuple into fields. + */ + Assert(numAttrs <= MaxTupleAttributeNumber); + tdeheap_deform_tuple(&tmptup, tupleDesc, toast_values, toast_isnull); + + memset(toast_free, 0, numAttrs * sizeof(bool)); + + for (i = 0; i < numAttrs; i++) + { + /* + * Look at non-null varlena attributes + */ + if (toast_isnull[i]) + has_nulls = true; + else if (TupleDescAttr(tupleDesc, i)->attlen == -1) + { + struct varlena *new_value; + + new_value = (struct varlena *) DatumGetPointer(toast_values[i]); + if (VARATT_IS_EXTERNAL(new_value) || + VARATT_IS_COMPRESSED(new_value)) + { + new_value = detoast_attr(new_value); + toast_values[i] = PointerGetDatum(new_value); + toast_free[i] = true; + } + } + } + + /* + * Calculate the new size of the tuple. + * + * This should match the reconstruction code in + * tdeheap_toast_insert_or_update. 
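+ *
+ * Small worked example, assuming the usual 23-byte SizeofHeapTupleHeader
+ * and 8-byte MAXALIGN: with numAttrs = 3 and has_nulls set, the header is
+ * MAXALIGN(23 + BITMAPLEN(3)) = MAXALIGN(24) = 24 bytes, and new_tuple_len
+ * is that plus whatever tdeheap_compute_data_size() returns for the data.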
+ */ + new_header_len = SizeofHeapTupleHeader; + if (has_nulls) + new_header_len += BITMAPLEN(numAttrs); + new_header_len = MAXALIGN(new_header_len); + new_data_len = tdeheap_compute_data_size(tupleDesc, + toast_values, toast_isnull); + new_tuple_len = new_header_len + new_data_len; + + new_data = (HeapTupleHeader) palloc0(new_tuple_len); + + /* + * Copy the existing tuple header, but adjust natts and t_hoff. + */ + memcpy(new_data, tup, SizeofHeapTupleHeader); + HeapTupleHeaderSetNatts(new_data, numAttrs); + new_data->t_hoff = new_header_len; + + /* Set the composite-Datum header fields correctly */ + HeapTupleHeaderSetDatumLength(new_data, new_tuple_len); + HeapTupleHeaderSetTypeId(new_data, tupleDesc->tdtypeid); + HeapTupleHeaderSetTypMod(new_data, tupleDesc->tdtypmod); + + /* Copy over the data, and fill the null bitmap if needed */ + tdeheap_fill_tuple(tupleDesc, + toast_values, + toast_isnull, + (char *) new_data + new_header_len, + new_data_len, + &(new_data->t_infomask), + has_nulls ? new_data->t_bits : NULL); + + /* + * Free allocated temp values + */ + for (i = 0; i < numAttrs; i++) + if (toast_free[i]) + pfree(DatumGetPointer(toast_values[i])); + + return PointerGetDatum(new_data); +} + + +/* ---------- + * toast_build_flattened_tuple - + * + * Build a tuple containing no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * + * This is essentially just like tdeheap_form_tuple, except that it will + * expand any external-data pointers beforehand. + * + * It's not very clear whether it would be preferable to decompress + * in-line compressed datums while at it. For now, we don't. + * ---------- + */ +HeapTuple +toast_build_flattened_tuple(TupleDesc tupleDesc, + Datum *values, + bool *isnull) +{ + HeapTuple new_tuple; + int numAttrs = tupleDesc->natts; + int num_to_free; + int i; + Datum new_values[MaxTupleAttributeNumber]; + Pointer freeable_values[MaxTupleAttributeNumber]; + + /* + * We can pass the caller's isnull array directly to tdeheap_form_tuple, but + * we potentially need to modify the values array. + */ + Assert(numAttrs <= MaxTupleAttributeNumber); + memcpy(new_values, values, numAttrs * sizeof(Datum)); + + num_to_free = 0; + for (i = 0; i < numAttrs; i++) + { + /* + * Look at non-null varlena attributes + */ + if (!isnull[i] && TupleDescAttr(tupleDesc, i)->attlen == -1) + { + struct varlena *new_value; + + new_value = (struct varlena *) DatumGetPointer(new_values[i]); + if (VARATT_IS_EXTERNAL(new_value)) + { + new_value = detoast_external_attr(new_value); + new_values[i] = PointerGetDatum(new_value); + freeable_values[num_to_free++] = (Pointer) new_value; + } + } + } + + /* + * Form the reconfigured tuple. + */ + new_tuple = tdeheap_form_tuple(tupleDesc, new_values, isnull); + + /* + * Free allocated temp values + */ + for (i = 0; i < num_to_free; i++) + pfree(freeable_values[i]); + + return new_tuple; +} + +/* + * Fetch a TOAST slice from a heap table. + * + * toastrel is the relation from which chunks are to be fetched. + * valueid identifies the TOAST value from which chunks are being fetched. + * attrsize is the total size of the TOAST value. + * sliceoffset is the byte offset within the TOAST value from which to fetch. + * slicelength is the number of bytes to be fetched from the TOAST value. + * result is the varlena into which the results should be written. 
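+ *
+ * Quick illustration with a made-up round chunk size: if TOAST_MAX_CHUNK_SIZE
+ * were 2000, then sliceoffset = 5000 with slicelength = 100 gives
+ * startchunk = 5000 / 2000 = 2 and endchunk = (5000 + 100 - 1) / 2000 = 2,
+ * so a single chunk is scanned and its bytes 1000..1099 are copied into
+ * result.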
+ */ +void +tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, + int32 sliceoffset, int32 slicelength, + struct varlena *result) +{ + Relation *toastidxs; + ScanKeyData toastkey[3]; + TupleDesc toasttupDesc = toastrel->rd_att; + int nscankeys; + SysScanDesc toastscan; + HeapTuple ttup; + int32 expectedchunk; + int32 totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; + int startchunk; + int endchunk; + int num_indexes; + int validIndex; + SnapshotData SnapshotToast; + + /* Look for the valid index of toast relation */ + validIndex = toast_open_indexes(toastrel, + AccessShareLock, + &toastidxs, + &num_indexes); + + startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE; + endchunk = (sliceoffset + slicelength - 1) / TOAST_MAX_CHUNK_SIZE; + Assert(endchunk <= totalchunks); + + /* Set up a scan key to fetch from the index. */ + ScanKeyInit(&toastkey[0], + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(valueid)); + + /* + * No additional condition if fetching all chunks. Otherwise, use an + * equality condition for one chunk, and a range condition otherwise. + */ + if (startchunk == 0 && endchunk == totalchunks - 1) + nscankeys = 1; + else if (startchunk == endchunk) + { + ScanKeyInit(&toastkey[1], + (AttrNumber) 2, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(startchunk)); + nscankeys = 2; + } + else + { + ScanKeyInit(&toastkey[1], + (AttrNumber) 2, + BTGreaterEqualStrategyNumber, F_INT4GE, + Int32GetDatum(startchunk)); + ScanKeyInit(&toastkey[2], + (AttrNumber) 2, + BTLessEqualStrategyNumber, F_INT4LE, + Int32GetDatum(endchunk)); + nscankeys = 3; + } + + /* Prepare for scan */ + init_toast_snapshot(&SnapshotToast); + toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], + &SnapshotToast, nscankeys, toastkey); + + /* + * Read the chunks by index + * + * The index is on (valueid, chunkidx) so they will come in order + */ + expectedchunk = startchunk; + while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) + { + int32 curchunk; + Pointer chunk; + bool isnull; + char *chunkdata; + int32 chunksize; + int32 expected_size; + int32 chcpystrt; + int32 chcpyend; + + /* + * Have a chunk, extract the sequence number and the data + */ + curchunk = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); + Assert(!isnull); + chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); + Assert(!isnull); + if (!VARATT_IS_EXTENDED(chunk)) + { + chunksize = VARSIZE(chunk) - VARHDRSZ; + chunkdata = VARDATA(chunk); + } + else if (VARATT_IS_SHORT(chunk)) + { + /* could happen due to tdeheap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + chunkdata = VARDATA_SHORT(chunk); + } + else + { + /* should never happen */ + elog(ERROR, "found toasted toast chunk for toast value %u in %s", + valueid, RelationGetRelationName(toastrel)); + chunksize = 0; /* keep compiler quiet */ + chunkdata = NULL; + } + + /* + * Some checks on the data we've found + */ + if (curchunk != expectedchunk) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk number %d (expected %d) for toast value %u in %s", + curchunk, expectedchunk, valueid, + RelationGetRelationName(toastrel)))); + if (curchunk > endchunk) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk number %d (out of range %d..%d) for toast value %u in %s", + curchunk, + startchunk, endchunk, valueid, + RelationGetRelationName(toastrel)))); + expected_size = 
curchunk < totalchunks - 1 ? TOAST_MAX_CHUNK_SIZE + : attrsize - ((totalchunks - 1) * TOAST_MAX_CHUNK_SIZE); + if (chunksize != expected_size) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s", + chunksize, expected_size, + curchunk, totalchunks, valueid, + RelationGetRelationName(toastrel)))); + + /* + * Copy the data into proper place in our result + */ + chcpystrt = 0; + chcpyend = chunksize - 1; + if (curchunk == startchunk) + chcpystrt = sliceoffset % TOAST_MAX_CHUNK_SIZE; + if (curchunk == endchunk) + chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE; + + memcpy(VARDATA(result) + + (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, + chunkdata + chcpystrt, + (chcpyend - chcpystrt) + 1); + + expectedchunk++; + } + + /* + * Final checks that we successfully fetched the datum + */ + if (expectedchunk != (endchunk + 1)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("missing chunk number %d for toast value %u in %s", + expectedchunk, valueid, + RelationGetRelationName(toastrel)))); + + /* End scan and close indexes. */ + systable_endscan_ordered(toastscan); + toast_close_indexes(toastidxs, num_indexes, AccessShareLock); +} diff --git a/src16/include/access/pg_tde_io.h b/src16/include/access/pg_tde_io.h new file mode 100644 index 00000000..7d36bd2c --- /dev/null +++ b/src16/include/access/pg_tde_io.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * hio.h + * POSTGRES heap access method input/output definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/hio.h + * + *------------------------------------------------------------------------- + */ +#ifndef HIO_H +#define HIO_H + +#include "access/htup.h" +#include "storage/buf.h" +#include "utils/relcache.h" + +/* + * state for bulk inserts --- private to heapam.c and hio.c + * + * If current_buf isn't InvalidBuffer, then we are holding an extra pin + * on that buffer. + * + * "typedef struct BulkInsertStateData *BulkInsertState" is in heapam.h + */ +typedef struct BulkInsertStateData +{ + BufferAccessStrategy strategy; /* our BULKWRITE strategy object */ + Buffer current_buf; /* current insertion target page */ + + /* + * State for bulk extensions. + * + * last_free..next_free are further pages that were unused at the time of + * the last extension. They might be in use by the time we use them + * though, so rechecks are needed. + * + * XXX: Eventually these should probably live in RelationData instead, + * alongside targetblock. + * + * already_extended_by is the number of pages that this bulk inserted + * extended by. If we already extended by a significant number of pages, + * we can be more aggressive about extending going forward. 
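+ *
+ * For example, if a bulk extension left ten pages unused, subsequent
+ * inserts through this BulkInsertState try those pages in order, but each
+ * one is rechecked because another backend may have filled it in the
+ * meantime.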
+ */ + BlockNumber next_free; + BlockNumber last_free; + uint32 already_extended_by; +} BulkInsertStateData; + + +extern void tdeheap_RelationPutHeapTuple(Relation relation, Buffer buffer, + HeapTuple tuple, bool token); +extern Buffer tdeheap_RelationGetBufferForTuple(Relation relation, Size len, + Buffer otherBuffer, int options, + BulkInsertStateData *bistate, + Buffer *vmbuffer, Buffer *vmbuffer_other, + int num_pages); + +#endif /* HIO_H */ diff --git a/src16/include/access/pg_tde_rewrite.h b/src16/include/access/pg_tde_rewrite.h new file mode 100644 index 00000000..b1c7cf83 --- /dev/null +++ b/src16/include/access/pg_tde_rewrite.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * rewriteheap.h + * Declarations for heap rewrite support functions + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994-5, Regents of the University of California + * + * src/include/access/rewriteheap.h + * + *------------------------------------------------------------------------- + */ +#ifndef REWRITE_HEAP_H +#define REWRITE_HEAP_H + +#include "access/htup.h" +#include "storage/itemptr.h" +#include "storage/relfilelocator.h" +#include "utils/relcache.h" + +/* struct definition is private to rewriteheap.c */ +typedef struct RewriteStateData *RewriteState; + +extern RewriteState begin_tdeheap_rewrite(Relation old_heap, Relation new_heap, + TransactionId oldest_xmin, TransactionId freeze_xid, + MultiXactId cutoff_multi); +extern void end_tdeheap_rewrite(RewriteState state); +extern void rewrite_tdeheap_tuple(RewriteState state, HeapTuple old_tuple, + HeapTuple new_tuple); +extern bool rewrite_tdeheap_dead_tuple(RewriteState state, HeapTuple old_tuple); + +/* + * On-Disk data format for an individual logical rewrite mapping. 
+ */ +typedef struct LogicalRewriteMappingData +{ + RelFileLocator old_locator; + RelFileLocator new_locator; + ItemPointerData old_tid; + ItemPointerData new_tid; +} LogicalRewriteMappingData; + +/* --- + * The filename consists of the following, dash separated, + * components: + * 1) database oid or InvalidOid for shared relations + * 2) the oid of the relation + * 3) upper 32bit of the LSN at which a rewrite started + * 4) lower 32bit of the LSN at which a rewrite started + * 5) xid we are mapping for + * 6) xid of the xact performing the mapping + * --- + */ +#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x" +extern void CheckPointLogicalRewriteHeap(void); + +#endif /* REWRITE_HEAP_H */ diff --git a/src16/include/access/pg_tde_visibilitymap.h b/src16/include/access/pg_tde_visibilitymap.h new file mode 100644 index 00000000..8c38ea11 --- /dev/null +++ b/src16/include/access/pg_tde_visibilitymap.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * visibilitymap.h + * visibility map interface + * + * + * Portions Copyright (c) 2007-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/visibilitymap.h + * + *------------------------------------------------------------------------- + */ +#ifndef VISIBILITYMAP_H +#define VISIBILITYMAP_H + +#include "access/visibilitymapdefs.h" +#include "access/xlogdefs.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "utils/relcache.h" + +/* Macros for visibilitymap test */ +#define VM_ALL_VISIBLE(r, b, v) \ + ((tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_VISIBLE) != 0) +#define VM_ALL_FROZEN(r, b, v) \ + ((tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0) + +extern bool tdeheap_visibilitymap_clear(Relation rel, BlockNumber heapBlk, + Buffer vmbuf, uint8 flags); +extern void tdeheap_visibilitymap_pin(Relation rel, BlockNumber heapBlk, + Buffer *vmbuf); +extern bool tdeheap_visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf); +extern void tdeheap_visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, + XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, + uint8 flags); +extern uint8 tdeheap_visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); +extern void tdeheap_visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); +extern BlockNumber tdeheap_visibilitymap_prepare_truncate(Relation rel, + BlockNumber nheapblocks); + +#endif /* VISIBILITYMAP_H */ diff --git a/src16/include/access/pg_tdeam.h b/src16/include/access/pg_tdeam.h new file mode 100644 index 00000000..7f9f6138 --- /dev/null +++ b/src16/include/access/pg_tdeam.h @@ -0,0 +1,332 @@ +/*------------------------------------------------------------------------- + * + * heapam.h + * POSTGRES heap access method definitions. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/heapam.h + * + *------------------------------------------------------------------------- + */ +#ifndef HEAPAM_H +#define HEAPAM_H + +#include "access/relation.h" /* for backward compatibility */ +#include "access/relscan.h" +#include "access/sdir.h" +#include "access/skey.h" +#include "access/table.h" /* for backward compatibility */ +#include "access/tableam.h" +#include "nodes/lockoptions.h" +#include "nodes/primnodes.h" +#include "storage/bufpage.h" +#include "storage/dsm.h" +#include "storage/lockdefs.h" +#include "storage/shm_toc.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + + +/* "options" flag bits for tdeheap_insert */ +#define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM +#define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN +#define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL +#define HEAP_INSERT_SPECULATIVE 0x0010 + +typedef struct BulkInsertStateData *BulkInsertState; +struct TupleTableSlot; +struct VacuumCutoffs; + +#define MaxLockTupleMode LockTupleExclusive + +/* + * Descriptor for heap table scans. + */ +typedef struct HeapScanDescData +{ + TableScanDescData rs_base; /* AM independent part of the descriptor */ + + /* state set up at initscan time */ + BlockNumber rs_nblocks; /* total number of blocks in rel */ + BlockNumber rs_startblock; /* block # to start at */ + BlockNumber rs_numblocks; /* max number of blocks to scan */ + /* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */ + + /* scan current state */ + bool rs_inited; /* false = scan not init'd yet */ + OffsetNumber rs_coffset; /* current offset # in non-page-at-a-time mode */ + BlockNumber rs_cblock; /* current block # in scan, if any */ + Buffer rs_cbuf; /* current buffer in scan, if any */ + /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + + BufferAccessStrategy rs_strategy; /* access strategy for reads */ + + HeapTupleData rs_ctup; /* current tuple in scan, if any */ + + /* + * For parallel scans to store page allocation data. NULL when not + * performing a parallel scan. + */ + ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + + /* these fields only used in page-at-a-time mode and for bitmap scans */ + int rs_cindex; /* current tuple's index in vistuples */ + int rs_ntuples; /* number of visible tuples on page */ + OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ +} HeapScanDescData; +typedef struct HeapScanDescData *HeapScanDesc; + +/* + * Descriptor for fetches from heap via an index. 
+ */ +typedef struct IndexFetchHeapData +{ + IndexFetchTableData xs_base; /* AM independent part of the descriptor */ + + Buffer xs_cbuf; /* current heap buffer in scan, if any */ + /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ +} IndexFetchHeapData; + +/* Result codes for HeapTupleSatisfiesVacuum */ +typedef enum +{ + HEAPTUPLE_DEAD, /* tuple is dead and deletable */ + HEAPTUPLE_LIVE, /* tuple is live (committed, no deleter) */ + HEAPTUPLE_RECENTLY_DEAD, /* tuple is dead, but not deletable yet */ + HEAPTUPLE_INSERT_IN_PROGRESS, /* inserting xact is still in progress */ + HEAPTUPLE_DELETE_IN_PROGRESS /* deleting xact is still in progress */ +} HTSV_Result; + +/* + * tdeheap_prepare_freeze_tuple may request that tdeheap_freeze_execute_prepared + * check any tuple's to-be-frozen xmin and/or xmax status using pg_xact + */ +#define HEAP_FREEZE_CHECK_XMIN_COMMITTED 0x01 +#define HEAP_FREEZE_CHECK_XMAX_ABORTED 0x02 + +/* tdeheap_prepare_freeze_tuple state describing how to freeze a tuple */ +typedef struct HeapTupleFreeze +{ + /* Fields describing how to process tuple */ + TransactionId xmax; + uint16 t_infomask2; + uint16 t_infomask; + uint8 frzflags; + + /* xmin/xmax check flags */ + uint8 checkflags; + /* Page offset number for tuple */ + OffsetNumber offset; +} HeapTupleFreeze; + +/* + * State used by VACUUM to track the details of freezing all eligible tuples + * on a given heap page. + * + * VACUUM prepares freeze plans for each page via tdeheap_prepare_freeze_tuple + * calls (every tuple with storage gets its own call). This page-level freeze + * state is updated across each call, which ultimately determines whether or + * not freezing the page is required. + * + * Aside from the basic question of whether or not freezing will go ahead, the + * state also tracks the oldest extant XID/MXID in the table as a whole, for + * the purposes of advancing relfrozenxid/relminmxid values in pg_class later + * on. Each tdeheap_prepare_freeze_tuple call pushes NewRelfrozenXid and/or + * NewRelminMxid back as required to avoid unsafe final pg_class values. Any + * and all unfrozen XIDs or MXIDs that remain after VACUUM finishes _must_ + * have values >= the final relfrozenxid/relminmxid values in pg_class. This + * includes XIDs that remain as MultiXact members from any tuple's xmax. + * + * When 'freeze_required' flag isn't set after all tuples are examined, the + * final choice on freezing is made by vacuumlazy.c. It can decide to trigger + * freezing based on whatever criteria it deems appropriate. However, it is + * recommended that vacuumlazy.c avoid early freezing when freezing does not + * enable setting the target page all-frozen in the visibility map afterwards. + */ +typedef struct HeapPageFreeze +{ + /* Is tdeheap_prepare_freeze_tuple caller required to freeze page? */ + bool freeze_required; + + /* + * "Freeze" NewRelfrozenXid/NewRelminMxid trackers. + * + * Trackers used when tdeheap_freeze_execute_prepared freezes, or when there + * are zero freeze plans for a page. It is always valid for vacuumlazy.c + * to freeze any page, by definition. This even includes pages that have + * no tuples with storage to consider in the first place. That way the + * 'totally_frozen' results from tdeheap_prepare_freeze_tuple can always be + * used in the same way, even when no freeze plans need to be executed to + * "freeze the page". Only the "freeze" path needs to consider the need + * to set pages all-frozen in the visibility map under this scheme. 
+ * + * When we freeze a page, we generally freeze all XIDs < OldestXmin, only + * leaving behind XIDs that are ineligible for freezing, if any. And so + * you might wonder why these trackers are necessary at all; why should + * _any_ page that VACUUM freezes _ever_ be left with XIDs/MXIDs that + * ratchet back the top-level NewRelfrozenXid/NewRelminMxid trackers? + * + * It is useful to use a definition of "freeze the page" that does not + * overspecify how MultiXacts are affected. tdeheap_prepare_freeze_tuple + * generally prefers to remove Multis eagerly, but lazy processing is used + * in cases where laziness allows VACUUM to avoid allocating a new Multi. + * The "freeze the page" trackers enable this flexibility. + */ + TransactionId FreezePageRelfrozenXid; + MultiXactId FreezePageRelminMxid; + + /* + * "No freeze" NewRelfrozenXid/NewRelminMxid trackers. + * + * These trackers are maintained in the same way as the trackers used when + * VACUUM scans a page that isn't cleanup locked. Both code paths are + * based on the same general idea (do less work for this page during the + * ongoing VACUUM, at the cost of having to accept older final values). + */ + TransactionId NoFreezePageRelfrozenXid; + MultiXactId NoFreezePageRelminMxid; + +} HeapPageFreeze; + +/* ---------------- + * function prototypes for heap access method + * + * tdeheap_create, tdeheap_create_with_catalog, and tdeheap_drop_with_catalog + * are declared in catalog/heap.h + * ---------------- + */ + + +/* + * HeapScanIsValid + * True iff the heap scan is valid. + */ +#define HeapScanIsValid(scan) PointerIsValid(scan) + +extern TableScanDesc tdeheap_beginscan(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags); +extern void tdeheap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, + BlockNumber numBlks); +extern void tdeheapgetpage(TableScanDesc sscan, BlockNumber block); +extern void tdeheap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, + bool allow_strat, bool allow_sync, bool allow_pagemode); +extern void tdeheap_endscan(TableScanDesc sscan); +extern HeapTuple tdeheap_getnext(TableScanDesc sscan, ScanDirection direction); +extern bool tdeheap_getnextslot(TableScanDesc sscan, + ScanDirection direction, struct TupleTableSlot *slot); +extern void tdeheap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, + ItemPointer maxtid); +extern bool tdeheap_getnextslot_tidrange(TableScanDesc sscan, + ScanDirection direction, + TupleTableSlot *slot); +extern bool tdeheap_fetch(Relation relation, Snapshot snapshot, + HeapTuple tuple, Buffer *userbuf, bool keep_buf); +extern bool tdeheap_hot_search_buffer(ItemPointer tid, Relation relation, + Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, + bool *all_dead, bool first_call); + +extern void tdeheap_get_latest_tid(TableScanDesc sscan, ItemPointer tid); + +extern BulkInsertState GetBulkInsertState(void); +extern void FreeBulkInsertState(BulkInsertState); +extern void ReleaseBulkInsertStatePin(BulkInsertState bistate); + +extern void tdeheap_insert(Relation relation, HeapTuple tup, CommandId cid, + int options, BulkInsertState bistate); +extern void tdeheap_multi_insert(Relation relation, struct TupleTableSlot **slots, + int ntuples, CommandId cid, int options, + BulkInsertState bistate); +extern TM_Result tdeheap_delete(Relation relation, ItemPointer tid, + CommandId cid, Snapshot crosscheck, bool wait, + struct TM_FailureData *tmfd, bool changingPart); +extern void 
tdeheap_finish_speculative(Relation relation, ItemPointer tid); +extern void tdeheap_abort_speculative(Relation relation, ItemPointer tid); +extern TM_Result tdeheap_update(Relation relation, ItemPointer otid, + HeapTuple newtup, + CommandId cid, Snapshot crosscheck, bool wait, + struct TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes); +extern TM_Result tdeheap_lock_tuple(Relation relation, HeapTuple tuple, + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_updates, + Buffer *buffer, struct TM_FailureData *tmfd); + +extern void tdeheap_inplace_update(Relation relation, HeapTuple tuple); +extern bool tdeheap_prepare_freeze_tuple(HeapTupleHeader tuple, + const struct VacuumCutoffs *cutoffs, + HeapPageFreeze *pagefrz, + HeapTupleFreeze *frz, bool *totally_frozen); +extern void tdeheap_freeze_execute_prepared(Relation rel, Buffer buffer, + TransactionId snapshotConflictHorizon, + HeapTupleFreeze *tuples, int ntuples); +extern bool tdeheap_freeze_tuple(HeapTupleHeader tuple, + TransactionId relfrozenxid, TransactionId relminmxid, + TransactionId FreezeLimit, TransactionId MultiXactCutoff); +extern bool tdeheap_tuple_should_freeze(HeapTupleHeader tuple, + const struct VacuumCutoffs *cutoffs, + TransactionId *NoFreezePageRelfrozenXid, + MultiXactId *NoFreezePageRelminMxid); +extern bool tdeheap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); + +extern void simple_tdeheap_insert(Relation relation, HeapTuple tup); +extern void simple_tdeheap_delete(Relation relation, ItemPointer tid); +extern void simple_tdeheap_update(Relation relation, ItemPointer otid, + HeapTuple tup, TU_UpdateIndexes *update_indexes); + +extern TransactionId tdeheap_index_delete_tuples(Relation rel, + TM_IndexDeleteOp *delstate); + +/* in heap/pruneheap.c */ +struct GlobalVisState; +extern void tdeheap_page_prune_opt(Relation relation, Buffer buffer); +extern int tdeheap_page_prune(Relation relation, Buffer buffer, + struct GlobalVisState *vistest, + TransactionId old_snap_xmin, + TimestampTz old_snap_ts, + int *nnewlpdead, + OffsetNumber *off_loc); +extern void tdeheap_page_prune_execute(Buffer buffer, + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused); +extern void tdeheap_get_root_tuples(Page page, OffsetNumber *root_offsets); + +/* in heap/vacuumlazy.c */ +struct VacuumParams; +extern void tdeheap_vacuum_rel(Relation rel, + struct VacuumParams *params, BufferAccessStrategy bstrategy); + +/* in heap/pg_tdeam_visibility.c */ +extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, + Buffer buffer); +extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, + Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, + Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, + TransactionId *dead_after); +extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid); +extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); +extern bool HeapTupleIsSurelyDead(HeapTuple htup, + struct GlobalVisState *vistest); + +/* + * To avoid leaking too much knowledge about reorderbuffer implementation + * details this is implemented in reorderbuffer.c not pg_tdeam_visibility.c + */ +struct HTAB; +extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, + Buffer buffer, + CommandId 
*cmin, CommandId *cmax); +extern void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, + Buffer buffer, Snapshot snapshot); + +#endif /* HEAPAM_H */ diff --git a/src16/include/access/pg_tdeam_xlog.h b/src16/include/access/pg_tdeam_xlog.h new file mode 100644 index 00000000..1a96ea9e --- /dev/null +++ b/src16/include/access/pg_tdeam_xlog.h @@ -0,0 +1,421 @@ +/*------------------------------------------------------------------------- + * + * pg_tdeam_xlog.h + * POSTGRES heap access XLOG definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/pg_tdeam_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef HEAPAM_XLOG_H +#define HEAPAM_XLOG_H + +#include "access/htup.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/buf.h" +#include "storage/bufpage.h" +#include "storage/relfilelocator.h" +#include "utils/relcache.h" + + +/* + * WAL record definitions for heapam.c's WAL operations + * + * XLOG allows to store some information in high 4 bits of log + * record xl_info field. We use 3 for opcode and one for init bit. + */ +#define XLOG_HEAP_INSERT 0x00 +#define XLOG_HEAP_DELETE 0x10 +#define XLOG_HEAP_UPDATE 0x20 +#define XLOG_HEAP_TRUNCATE 0x30 +#define XLOG_HEAP_HOT_UPDATE 0x40 +#define XLOG_HEAP_CONFIRM 0x50 +#define XLOG_HEAP_LOCK 0x60 +#define XLOG_HEAP_INPLACE 0x70 + +#define XLOG_HEAP_OPMASK 0x70 +/* + * When we insert 1st item on new page in INSERT, UPDATE, HOT_UPDATE, + * or MULTI_INSERT, we can (and we do) restore entire page in redo + */ +#define XLOG_HEAP_INIT_PAGE 0x80 +/* + * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes + * are associated with RM_HEAP2_ID, but are not logically different from + * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to + * these, too. + */ +#define XLOG_HEAP2_REWRITE 0x00 +#define XLOG_HEAP2_PRUNE 0x10 +#define XLOG_HEAP2_VACUUM 0x20 +#define XLOG_HEAP2_FREEZE_PAGE 0x30 +#define XLOG_HEAP2_VISIBLE 0x40 +#define XLOG_HEAP2_MULTI_INSERT 0x50 +#define XLOG_HEAP2_LOCK_UPDATED 0x60 +#define XLOG_HEAP2_NEW_CID 0x70 + +/* + * xl_tdeheap_insert/xl_tdeheap_multi_insert flag values, 8 bits are available. + */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_INSERT_ALL_VISIBLE_CLEARED (1<<0) +#define XLH_INSERT_LAST_IN_MULTI (1<<1) +#define XLH_INSERT_IS_SPECULATIVE (1<<2) +#define XLH_INSERT_CONTAINS_NEW_TUPLE (1<<3) +#define XLH_INSERT_ON_TOAST_RELATION (1<<4) + +/* all_frozen_set always implies all_visible_set */ +#define XLH_INSERT_ALL_FROZEN_SET (1<<5) + +/* + * xl_tdeheap_update flag values, 8 bits are available. + */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED (1<<0) +/* PD_ALL_VISIBLE was cleared in the 2nd page */ +#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED (1<<1) +#define XLH_UPDATE_CONTAINS_OLD_TUPLE (1<<2) +#define XLH_UPDATE_CONTAINS_OLD_KEY (1<<3) +#define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) +#define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) +#define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) + +/* convenience macro for checking whether any form of old tuple was logged */ +#define XLH_UPDATE_CONTAINS_OLD \ + (XLH_UPDATE_CONTAINS_OLD_TUPLE | XLH_UPDATE_CONTAINS_OLD_KEY) + +/* + * xl_tdeheap_delete flag values, 8 bits are available. 
+ */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_DELETE_ALL_VISIBLE_CLEARED (1<<0) +#define XLH_DELETE_CONTAINS_OLD_TUPLE (1<<1) +#define XLH_DELETE_CONTAINS_OLD_KEY (1<<2) +#define XLH_DELETE_IS_SUPER (1<<3) +#define XLH_DELETE_IS_PARTITION_MOVE (1<<4) + +/* convenience macro for checking whether any form of old tuple was logged */ +#define XLH_DELETE_CONTAINS_OLD \ + (XLH_DELETE_CONTAINS_OLD_TUPLE | XLH_DELETE_CONTAINS_OLD_KEY) + +/* This is what we need to know about delete */ +typedef struct xl_tdeheap_delete +{ + TransactionId xmax; /* xmax of the deleted tuple */ + OffsetNumber offnum; /* deleted tuple's offset */ + uint8 infobits_set; /* infomask bits */ + uint8 flags; +} xl_tdeheap_delete; + +#define SizeOfHeapDelete (offsetof(xl_tdeheap_delete, flags) + sizeof(uint8)) + +/* + * xl_tdeheap_truncate flag values, 8 bits are available. + */ +#define XLH_TRUNCATE_CASCADE (1<<0) +#define XLH_TRUNCATE_RESTART_SEQS (1<<1) + +/* + * For truncate we list all truncated relids in an array, followed by all + * sequence relids that need to be restarted, if any. + * All rels are always within the same database, so we just list dbid once. + */ +typedef struct xl_tdeheap_truncate +{ + Oid dbId; + uint32 nrelids; + uint8 flags; + Oid relids[FLEXIBLE_ARRAY_MEMBER]; +} xl_tdeheap_truncate; + +#define SizeOfHeapTruncate (offsetof(xl_tdeheap_truncate, relids)) + +/* + * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted + * or updated tuple in WAL; we can save a few bytes by reconstructing the + * fields that are available elsewhere in the WAL record, or perhaps just + * plain needn't be reconstructed. These are the fields we must store. + */ +typedef struct xl_tdeheap_header +{ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; +} xl_tdeheap_header; + +#define SizeOfHeapHeader (offsetof(xl_tdeheap_header, t_hoff) + sizeof(uint8)) + +/* This is what we need to know about insert */ +typedef struct xl_tdeheap_insert +{ + OffsetNumber offnum; /* inserted tuple's offset */ + uint8 flags; + + /* xl_tdeheap_header & TUPLE DATA in backup block 0 */ +} xl_tdeheap_insert; + +#define SizeOfHeapInsert (offsetof(xl_tdeheap_insert, flags) + sizeof(uint8)) + +/* + * This is what we need to know about a multi-insert. + * + * The main data of the record consists of this xl_tdeheap_multi_insert header. + * 'offsets' array is omitted if the whole page is reinitialized + * (XLOG_HEAP_INIT_PAGE). + * + * In block 0's data portion, there is an xl_multi_insert_tuple struct, + * followed by the tuple data for each tuple. There is padding to align + * each xl_multi_insert_tuple struct. + */ +typedef struct xl_tdeheap_multi_insert +{ + uint8 flags; + uint16 ntuples; + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} xl_tdeheap_multi_insert; + +#define SizeOfHeapMultiInsert offsetof(xl_tdeheap_multi_insert, offsets) + +typedef struct xl_multi_insert_tuple +{ + uint16 datalen; /* size of tuple data that follows */ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_multi_insert_tuple; + +#define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8)) + +/* + * This is what we need to know about update|hot_update + * + * Backup blk 0: new page + * + * If XLH_UPDATE_PREFIX_FROM_OLD or XLH_UPDATE_SUFFIX_FROM_OLD flags are set, + * the prefix and/or suffix come first, as one or two uint16s. + * + * After that, xl_tdeheap_header and new tuple data follow. 
The new tuple + * data doesn't include the prefix and suffix, which are copied from the + * old tuple on replay. + * + * If XLH_UPDATE_CONTAINS_NEW_TUPLE flag is given, the tuple data is + * included even if a full-page image was taken. + * + * Backup blk 1: old page, if different. (no data, just a reference to the blk) + */ +typedef struct xl_tdeheap_update +{ + TransactionId old_xmax; /* xmax of the old tuple */ + OffsetNumber old_offnum; /* old tuple's offset */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ + uint8 flags; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ + + /* + * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags + * are set, xl_tdeheap_header and tuple data for the old tuple follow. + */ +} xl_tdeheap_update; + +#define SizeOfHeapUpdate (offsetof(xl_tdeheap_update, new_offnum) + sizeof(OffsetNumber)) + +/* + * This is what we need to know about page pruning (both during VACUUM and + * during opportunistic pruning) + * + * The array of OffsetNumbers following the fixed part of the record contains: + * * for each redirected item: the item offset, then the offset redirected to + * * for each now-dead item: the item offset + * * for each now-unused item: the item offset + * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused. + * Note that nunused is not explicitly stored, but may be found by reference + * to the total record length. + * + * Acquires a full cleanup lock. + */ +typedef struct xl_tdeheap_prune +{ + TransactionId snapshotConflictHorizon; + uint16 nredirected; + uint16 ndead; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + /* OFFSET NUMBERS are in the block reference 0 */ +} xl_tdeheap_prune; + +#define SizeOfHeapPrune (offsetof(xl_tdeheap_prune, isCatalogRel) + sizeof(bool)) + +/* + * The vacuum page record is similar to the prune record, but can only mark + * already LP_DEAD items LP_UNUSED (during VACUUM's second heap pass) + * + * Acquires an ordinary exclusive lock only. 
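/*
 * Editor's sketch, not part of the patch: one way to decode the OffsetNumber
 * array laid out for the prune record described above (2*nredirected entries
 * for redirects, then ndead, then the implicitly-sized nunused tail).  The
 * names "data" and "datalen" are placeholders for the block 0 data and length
 * that a redo routine would obtain via XLogRecGetBlockData(); this mirrors
 * what tdeheap_xlog_prune() does later in this patch.
 */
static inline void
decode_prune_offsets(xl_tdeheap_prune *xlrec, char *data, Size datalen)
{
	OffsetNumber *redirected = (OffsetNumber *) data;
	OffsetNumber *nowdead = redirected + (xlrec->nredirected * 2);
	OffsetNumber *nowunused = nowdead + xlrec->ndead;
	OffsetNumber *end = (OffsetNumber *) (data + datalen);
	int			nunused = end - nowunused;	/* not stored explicitly */

	Assert(nunused >= 0);
	/* redirected[2*i] is the item redirected to redirected[2*i + 1] */
}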
+ */ +typedef struct xl_tdeheap_vacuum +{ + uint16 nunused; + /* OFFSET NUMBERS are in the block reference 0 */ +} xl_tdeheap_vacuum; + +#define SizeOfHeapVacuum (offsetof(xl_tdeheap_vacuum, nunused) + sizeof(uint16)) + +/* flags for infobits_set */ +#define XLHL_XMAX_IS_MULTI 0x01 +#define XLHL_XMAX_LOCK_ONLY 0x02 +#define XLHL_XMAX_EXCL_LOCK 0x04 +#define XLHL_XMAX_KEYSHR_LOCK 0x08 +#define XLHL_KEYS_UPDATED 0x10 + +/* flag bits for xl_tdeheap_lock / xl_tdeheap_lock_updated's flag field */ +#define XLH_LOCK_ALL_FROZEN_CLEARED 0x01 + +/* This is what we need to know about lock */ +typedef struct xl_tdeheap_lock +{ + TransactionId xmax; /* might be a MultiXactId */ + OffsetNumber offnum; /* locked tuple's offset on page */ + uint8 infobits_set; /* infomask and infomask2 bits to set */ + uint8 flags; /* XLH_LOCK_* flag bits */ +} xl_tdeheap_lock; + +#define SizeOfHeapLock (offsetof(xl_tdeheap_lock, flags) + sizeof(uint8)) + +/* This is what we need to know about locking an updated version of a row */ +typedef struct xl_tdeheap_lock_updated +{ + TransactionId xmax; + OffsetNumber offnum; + uint8 infobits_set; + uint8 flags; +} xl_tdeheap_lock_updated; + +#define SizeOfHeapLockUpdated (offsetof(xl_tdeheap_lock_updated, flags) + sizeof(uint8)) + +/* This is what we need to know about confirmation of speculative insertion */ +typedef struct xl_tdeheap_confirm +{ + OffsetNumber offnum; /* confirmed tuple's offset on page */ +} xl_tdeheap_confirm; + +#define SizeOfHeapConfirm (offsetof(xl_tdeheap_confirm, offnum) + sizeof(OffsetNumber)) + +/* This is what we need to know about in-place update */ +typedef struct xl_tdeheap_inplace +{ + OffsetNumber offnum; /* updated tuple's offset on page */ + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_tdeheap_inplace; + +#define SizeOfHeapInplace (offsetof(xl_tdeheap_inplace, offnum) + sizeof(OffsetNumber)) + +/* + * This struct represents a 'freeze plan', which describes how to freeze a + * group of one or more heap tuples (appears in xl_tdeheap_freeze_page record) + */ +/* 0x01 was XLH_FREEZE_XMIN */ +#define XLH_FREEZE_XVAC 0x02 +#define XLH_INVALID_XVAC 0x04 + +typedef struct xl_tdeheap_freeze_plan +{ + TransactionId xmax; + uint16 t_infomask2; + uint16 t_infomask; + uint8 frzflags; + + /* Length of individual page offset numbers array for this plan */ + uint16 ntuples; +} xl_tdeheap_freeze_plan; + +/* + * This is what we need to know about a block being frozen during vacuum + * + * Backup block 0's data contains an array of xl_tdeheap_freeze_plan structs + * (with nplans elements), followed by one or more page offset number arrays. + * Each such page offset number array corresponds to a single freeze plan + * (REDO routine freezes corresponding heap tuples using freeze plan). 
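/*
 * Editor's sketch, not part of the patch: walking the block 0 payload
 * described above for a freeze-page record, i.e. nplans freeze plans followed
 * by the per-plan page offset number arrays.  "data" and "nplans" stand in
 * for what the redo routine reads from the WAL record.
 */
static inline void
walk_freeze_plans(char *data, int nplans)
{
	xl_tdeheap_freeze_plan *plans = (xl_tdeheap_freeze_plan *) data;
	OffsetNumber *offsets = (OffsetNumber *)
		(data + nplans * sizeof(xl_tdeheap_freeze_plan));

	for (int p = 0; p < nplans; p++)
	{
		/* the next plans[p].ntuples offsets belong to this freeze plan */
		for (int i = 0; i < plans[p].ntuples; i++)
		{
			OffsetNumber offnum = *offsets++;

			(void) offnum;		/* freeze the tuple at offnum per plans[p] */
		}
	}
}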
+ */ +typedef struct xl_tdeheap_freeze_page +{ + TransactionId snapshotConflictHorizon; + uint16 nplans; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + + /* + * In payload of blk 0 : FREEZE PLANS and OFFSET NUMBER ARRAY + */ +} xl_tdeheap_freeze_page; + +#define SizeOfHeapFreezePage (offsetof(xl_tdeheap_freeze_page, isCatalogRel) + sizeof(bool)) + +/* + * This is what we need to know about setting a visibility map bit + * + * Backup blk 0: visibility map buffer + * Backup blk 1: heap buffer + */ +typedef struct xl_tdeheap_visible +{ + TransactionId snapshotConflictHorizon; + uint8 flags; +} xl_tdeheap_visible; + +#define SizeOfHeapVisible (offsetof(xl_tdeheap_visible, flags) + sizeof(uint8)) + +typedef struct xl_tdeheap_new_cid +{ + /* + * store toplevel xid so we don't have to merge cids from different + * transactions + */ + TransactionId top_xid; + CommandId cmin; + CommandId cmax; + CommandId combocid; /* just for debugging */ + + /* + * Store the relfilelocator/ctid pair to facilitate lookups. + */ + RelFileLocator target_locator; + ItemPointerData target_tid; +} xl_tdeheap_new_cid; + +#define SizeOfHeapNewCid (offsetof(xl_tdeheap_new_cid, target_tid) + sizeof(ItemPointerData)) + +/* logical rewrite xlog record header */ +typedef struct xl_tdeheap_rewrite_mapping +{ + TransactionId mapped_xid; /* xid that might need to see the row */ + Oid mapped_db; /* DbOid or InvalidOid for shared rels */ + Oid mapped_rel; /* Oid of the mapped relation */ + off_t offset; /* How far have we written so far */ + uint32 num_mappings; /* Number of in-memory mappings */ + XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */ +} xl_tdeheap_rewrite_mapping; + +extern void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, + TransactionId *snapshotConflictHorizon); + +extern void tdeheap_redo(XLogReaderState *record); +extern void tdeheap_desc(StringInfo buf, XLogReaderState *record); +extern const char *tdeheap_identify(uint8 info); +extern void tdeheap_mask(char *pagedata, BlockNumber blkno); +extern void tdeheap2_redo(XLogReaderState *record); +extern void tdeheap2_desc(StringInfo buf, XLogReaderState *record); +extern const char *tdeheap2_identify(uint8 info); +extern void tdeheap_xlog_logical_rewrite(XLogReaderState *r); + +extern XLogRecPtr log_tdeheap_visible(Relation rel, Buffer tdeheap_buffer, + Buffer vm_buffer, + TransactionId snapshotConflictHorizon, + uint8 vmflags); + +#endif /* HEAPAM_XLOG_H */ diff --git a/src16/include/access/pg_tdetoast.h b/src16/include/access/pg_tdetoast.h new file mode 100644 index 00000000..af79b756 --- /dev/null +++ b/src16/include/access/pg_tdetoast.h @@ -0,0 +1,149 @@ +/*------------------------------------------------------------------------- + * + * heaptoast.h + * Heap-specific definitions for external and compressed storage + * of variable size attributes. + * + * Copyright (c) 2000-2023, PostgreSQL Global Development Group + * + * src/include/access/heaptoast.h + * + *------------------------------------------------------------------------- + */ +#ifndef HEAPTOAST_H +#define HEAPTOAST_H + +#include "access/htup_details.h" +#include "storage/lockdefs.h" +#include "utils/relcache.h" + +/* + * Find the maximum size of a tuple if there are to be N tuples per page. + */ +#define MaximumBytesPerTuple(tuplesPerPage) \ + MAXALIGN_DOWN((BLCKSZ - \ + MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData))) \ + / (tuplesPerPage)) + +/* + * These symbols control toaster activation. 
If a tuple is larger than + * TOAST_TUPLE_THRESHOLD, we will try to toast it down to no more than + * TOAST_TUPLE_TARGET bytes through compressing compressible fields and + * moving EXTENDED and EXTERNAL data out-of-line. + * + * The numbers need not be the same, though they currently are. It doesn't + * make sense for TARGET to exceed THRESHOLD, but it could be useful to make + * it be smaller. + * + * Currently we choose both values to match the largest tuple size for which + * TOAST_TUPLES_PER_PAGE tuples can fit on a heap page. + * + * XXX while these can be modified without initdb, some thought needs to be + * given to needs_toast_table() in toasting.c before unleashing random + * changes. Also see LOBLKSIZE in large_object.h, which can *not* be + * changed without initdb. + */ +#define TOAST_TUPLES_PER_PAGE 4 + +#define TOAST_TUPLE_THRESHOLD MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE) + +#define TOAST_TUPLE_TARGET TOAST_TUPLE_THRESHOLD + +/* + * The code will also consider moving MAIN data out-of-line, but only as a + * last resort if the previous steps haven't reached the target tuple size. + * In this phase we use a different target size, currently equal to the + * largest tuple that will fit on a heap page. This is reasonable since + * the user has told us to keep the data in-line if at all possible. + */ +#define TOAST_TUPLES_PER_PAGE_MAIN 1 + +#define TOAST_TUPLE_TARGET_MAIN MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE_MAIN) + +/* + * If an index value is larger than TOAST_INDEX_TARGET, we will try to + * compress it (we can't move it out-of-line, however). Note that this + * number is per-datum, not per-tuple, for simplicity in index_form_tuple(). + */ +#define TOAST_INDEX_TARGET (MaxHeapTupleSize / 16) + +/* + * When we store an oversize datum externally, we divide it into chunks + * containing at most TOAST_MAX_CHUNK_SIZE data bytes. This number *must* + * be small enough that the completed toast-table tuple (including the + * ID and sequence fields and all overhead) will fit on a page. + * The coding here sets the size on the theory that we want to fit + * EXTERN_TUPLES_PER_PAGE tuples of maximum size onto a page. + * + * NB: Changing TOAST_MAX_CHUNK_SIZE requires an initdb. + */ +#define EXTERN_TUPLES_PER_PAGE 4 /* tweak only this */ + +#define EXTERN_TUPLE_MAX_SIZE MaximumBytesPerTuple(EXTERN_TUPLES_PER_PAGE) + +#define TOAST_MAX_CHUNK_SIZE \ + (EXTERN_TUPLE_MAX_SIZE - \ + MAXALIGN(SizeofHeapTupleHeader) - \ + sizeof(Oid) - \ + sizeof(int32) - \ + VARHDRSZ) + +/* ---------- + * tdeheap_toast_insert_or_update - + * + * Called by tdeheap_insert() and tdeheap_update(). + * ---------- + */ +extern HeapTuple tdeheap_toast_insert_or_update(Relation rel, HeapTuple newtup, + HeapTuple oldtup, int options); + +/* ---------- + * tdeheap_toast_delete - + * + * Called by tdeheap_delete(). + * ---------- + */ +extern void tdeheap_toast_delete(Relation rel, HeapTuple oldtup, + bool is_speculative); + +/* ---------- + * toast_flatten_tuple - + * + * "Flatten" a tuple to contain no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * ---------- + */ +extern HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc); + +/* ---------- + * toast_flatten_tuple_to_datum - + * + * "Flatten" a tuple containing out-of-line toasted fields into a Datum. 
+ * ---------- + */ +extern Datum toast_flatten_tuple_to_datum(HeapTupleHeader tup, + uint32 tup_len, + TupleDesc tupleDesc); + +/* ---------- + * toast_build_flattened_tuple - + * + * Build a tuple containing no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * ---------- + */ +extern HeapTuple toast_build_flattened_tuple(TupleDesc tupleDesc, + Datum *values, + bool *isnull); + +/* ---------- + * tdeheap_fetch_toast_slice + * + * Fetch a slice from a toast value stored in a heap table. + * ---------- + */ +extern void tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, + int32 attrsize, int32 sliceoffset, + int32 slicelength, struct varlena *result); + +#endif /* HEAPTOAST_H */ From 1f2f3ffde1ffd9bc962c1af70f6c9e24c5243e20 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Sun, 4 Aug 2024 20:24:52 +0100 Subject: [PATCH 3/6] Applied tde patches on top of the src16 directory --- src16/access/pg_tde_io.c | 19 +- src16/access/pg_tde_prune.c | 410 ++++++++++++++++- src16/access/pg_tde_rewrite.c | 12 +- src16/access/pg_tde_vacuumlazy.c | 14 +- src16/access/pg_tde_visibilitymap.c | 9 +- src16/access/pg_tdeam.c | 132 ++++-- src16/access/pg_tdeam_handler.c | 78 +++- src16/access/pg_tdeam_visibility.c | 7 +- src16/access/pg_tdetoast.c | 475 +++++++++++++++++++- src16/include/access/pg_tde_io.h | 10 +- src16/include/access/pg_tde_rewrite.h | 8 +- src16/include/access/pg_tde_visibilitymap.h | 10 +- src16/include/access/pg_tdeam.h | 25 +- src16/include/access/pg_tdeam_xlog.h | 14 +- src16/include/access/pg_tdetoast.h | 6 +- 15 files changed, 1121 insertions(+), 108 deletions(-) diff --git a/src16/access/pg_tde_io.c b/src16/access/pg_tde_io.c index 125804d9..2ad4d366 100644 --- a/src16/access/pg_tde_io.c +++ b/src16/access/pg_tde_io.c @@ -13,12 +13,16 @@ *------------------------------------------------------------------------- */ +#include "pg_tde_defines.h" + #include "postgres.h" -#include "access/heapam.h" -#include "access/hio.h" +#include "access/pg_tdeam.h" +#include "access/pg_tde_io.h" +#include "access/pg_tde_visibilitymap.h" +#include "encryption/enc_tde.h" + #include "access/htup_details.h" -#include "access/visibilitymap.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -36,6 +40,7 @@ void tdeheap_RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, + bool encrypt, bool token) { Page pageHeader; @@ -59,8 +64,12 @@ tdeheap_RelationPutHeapTuple(Relation relation, /* Add the tuple to the page */ pageHeader = BufferGetPage(buffer); - offnum = PageAddItem(pageHeader, (Item) tuple->t_data, - tuple->t_len, InvalidOffsetNumber, false, true); + if (encrypt) + offnum = TDE_PageAddItem(relation->rd_locator, tuple->t_tableOid, BufferGetBlockNumber(buffer), pageHeader, (Item) tuple->t_data, + tuple->t_len, InvalidOffsetNumber, false, true); + else + offnum = PageAddItem(pageHeader, (Item) tuple->t_data, + tuple->t_len, InvalidOffsetNumber, false, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple to page"); diff --git a/src16/access/pg_tde_prune.c b/src16/access/pg_tde_prune.c index ee3daa46..552151c5 100644 --- a/src16/access/pg_tde_prune.c +++ b/src16/access/pg_tde_prune.c @@ -12,10 +12,15 @@ * *------------------------------------------------------------------------- */ +#include "pg_tde_defines.h" + #include "postgres.h" -#include "access/heapam.h" +#include "encryption/enc_tde.h" + +#include "access/pg_tdeam.h" #include "access/pg_tdeam_xlog.h" + #include 
"access/htup_details.h" #include "access/transam.h" #include "access/xlog.h" @@ -379,6 +384,14 @@ tdeheap_page_prune(Relation relation, Buffer buffer, if (off_loc) *off_loc = InvalidOffsetNumber; + /* + * Make sure relation key in the cahce to avoid pallocs in + * the critical section. + * We need it here as there is `pgtde_compactify_tuples()` down in + * the call stack wich reencrypt tuples. + */ + GetRelationKey(relation->rd_locator); + /* Any error while applying the changes is critical */ START_CRIT_SECTION(); @@ -389,7 +402,7 @@ tdeheap_page_prune(Relation relation, Buffer buffer, * Apply the planned item changes, then repair page fragmentation, and * update the page's hint bit about whether it has free line pointers. */ - tdeheap_page_prune_execute(buffer, + tdeheap_page_prune_execute(prstate.rel, buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused); @@ -902,6 +915,7 @@ tdeheap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) prstate->marked[offnum] = true; } +void TdePageRepairFragmentation(Relation rel, Buffer buffer, Page page); /* * Perform the actual page changes needed by tdeheap_page_prune. @@ -909,7 +923,7 @@ tdeheap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) * buffer. */ void -tdeheap_page_prune_execute(Buffer buffer, +tdeheap_page_prune_execute(Relation rel, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused) @@ -1036,7 +1050,7 @@ tdeheap_page_prune_execute(Buffer buffer, * Finally, repair any fragmentation, and update the page's hint bit about * whether it has free pointers. */ - PageRepairFragmentation(page); + TdePageRepairFragmentation(rel, buffer, page); /* * Now that the page has been modified, assert that redirect items still @@ -1211,3 +1225,391 @@ tdeheap_get_root_tuples(Page page, OffsetNumber *root_offsets) } } } + +// TODO: move to own file so it can be autoupdated +// FROM src/page/bufpage.c + +/* + * Tuple defrag support for PageRepairFragmentation and PageIndexMultiDelete + */ +typedef struct itemIdCompactData +{ + uint16 offsetindex; /* linp array index */ + int16 itemoff; /* page offset of item data */ + uint16 len; + uint16 alignedlen; /* MAXALIGN(item data len) */ +} itemIdCompactData; +typedef itemIdCompactData *itemIdCompact; + +/* + * After removing or marking some line pointers unused, move the tuples to + * remove the gaps caused by the removed items and reorder them back into + * reverse line pointer order in the page. + * + * This function can often be fairly hot, so it pays to take some measures to + * make it as optimal as possible. + * + * Callers may pass 'presorted' as true if the 'itemidbase' array is sorted in + * descending order of itemoff. When this is true we can just memmove() + * tuples towards the end of the page. This is quite a common case as it's + * the order that tuples are initially inserted into pages. When we call this + * function to defragment the tuples in the page then any new line pointers + * added to the page will keep that presorted order, so hitting this case is + * still very common for tables that are commonly updated. + * + * When the 'itemidbase' array is not presorted then we're unable to just + * memmove() tuples around freely. Doing so could cause us to overwrite the + * memory belonging to a tuple we've not moved yet. In this case, we copy all + * the tuples that need to be moved into a temporary buffer. 
We can then + * simply memcpy() out of that temp buffer back into the page at the correct + * location. Tuples are copied back into the page in the same order as the + * 'itemidbase' array, so we end up reordering the tuples back into reverse + * line pointer order. This will increase the chances of hitting the + * presorted case the next time around. + * + * Callers must ensure that nitems is > 0 + */ +static void // this is where it happens! +pgtde_compactify_tuples(Relation rel, Buffer buffer, itemIdCompact itemidbase, int nitems, Page page, bool presorted) +{ + PageHeader phdr = (PageHeader) page; + Offset upper; + Offset copy_tail; + Offset copy_head; + itemIdCompact itemidptr; + int i; + + /* Code within will not work correctly if nitems == 0 */ + Assert(nitems > 0); + + if (presorted) + { + +#ifdef USE_ASSERT_CHECKING + { + /* + * Verify we've not gotten any new callers that are incorrectly + * passing a true presorted value. + */ + Offset lastoff = phdr->pd_special; + + for (i = 0; i < nitems; i++) + { + itemidptr = &itemidbase[i]; + + Assert(lastoff > itemidptr->itemoff); + + lastoff = itemidptr->itemoff; + } + } +#endif /* USE_ASSERT_CHECKING */ + + /* + * 'itemidbase' is already in the optimal order, i.e, lower item + * pointers have a higher offset. This allows us to memmove() the + * tuples up to the end of the page without having to worry about + * overwriting other tuples that have not been moved yet. + * + * There's a good chance that there are tuples already right at the + * end of the page that we can simply skip over because they're + * already in the correct location within the page. We'll do that + * first... + */ + upper = phdr->pd_special; + i = 0; + do + { + itemidptr = &itemidbase[i]; + if (upper != itemidptr->itemoff + itemidptr->alignedlen) + break; + upper -= itemidptr->alignedlen; + + i++; + } while (i < nitems); + + /* + * Now that we've found the first tuple that needs to be moved, we can + * do the tuple compactification. We try and make the least number of + * memmove() calls and only call memmove() when there's a gap. When + * we see a gap we just move all tuples after the gap up until the + * point of the last move operation. + */ + copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen; + for (; i < nitems; i++) + { + ItemId lp; + + itemidptr = &itemidbase[i]; + + lp = PageGetItemId(page, itemidptr->offsetindex + 1); + + if (copy_head != itemidptr->itemoff + itemidptr->alignedlen && copy_head < copy_tail) + { + memmove((char *) page + upper, + page + copy_head, + copy_tail - copy_head); + + /* + * We've now moved all tuples already seen, but not the + * current tuple, so we set the copy_tail to the end of this + * tuple so it can be moved in another iteration of the loop. + */ + copy_tail = itemidptr->itemoff + itemidptr->alignedlen; + } + /* shift the target offset down by the length of this tuple */ + upper -= itemidptr->alignedlen; + /* point the copy_head to the start of this tuple */ + copy_head = itemidptr->itemoff; + + /* update the line pointer to reference the new offset */ + lp->lp_off = upper; + } + + /* move the remaining tuples. */ + memmove((char *) page + upper, + page + copy_head, + copy_tail - copy_head); + } + else + { + PGAlignedBlock scratch; + char *scratchptr = scratch.data; + + /* + * Non-presorted case: The tuples in the itemidbase array may be in + * any order. 
So, in order to move these to the end of the page we + * must make a temp copy of each tuple that needs to be moved before + * we copy them back into the page at the new offset. + * + * If a large percentage of tuples have been pruned (>75%) then we'll + * copy these into the temp buffer tuple-by-tuple, otherwise, we'll + * just do a single memcpy() for all tuples that need to be moved. + * When so many tuples have been removed there's likely to be a lot of + * gaps and it's unlikely that many non-movable tuples remain at the + * end of the page. + */ + if (nitems < PageGetMaxOffsetNumber(page) / 4) + { + i = 0; + do + { + itemidptr = &itemidbase[i]; + memcpy(scratchptr + itemidptr->itemoff, page + itemidptr->itemoff, + itemidptr->alignedlen); + i++; + } while (i < nitems); + + /* Set things up for the compactification code below */ + i = 0; + itemidptr = &itemidbase[0]; + upper = phdr->pd_special; + } + else + { + upper = phdr->pd_special; + + /* + * Many tuples are likely to already be in the correct location. + * There's no need to copy these into the temp buffer. Instead + * we'll just skip forward in the itemidbase array to the position + * that we do need to move tuples from so that the code below just + * leaves these ones alone. + */ + i = 0; + do + { + itemidptr = &itemidbase[i]; + if (upper != itemidptr->itemoff + itemidptr->alignedlen) + break; + upper -= itemidptr->alignedlen; + + i++; + } while (i < nitems); + + /* Copy all tuples that need to be moved into the temp buffer */ + memcpy(scratchptr + phdr->pd_upper, + page + phdr->pd_upper, + upper - phdr->pd_upper); + } + + /* + * Do the tuple compactification. itemidptr is already pointing to + * the first tuple that we're going to move. Here we collapse the + * memcpy calls for adjacent tuples into a single call. This is done + * by delaying the memcpy call until we find a gap that needs to be + * closed. + */ + copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen; + for (; i < nitems; i++) + { + ItemId lp; + + itemidptr = &itemidbase[i]; + + lp = PageGetItemId(page, itemidptr->offsetindex + 1); + + /* copy pending tuples when we detect a gap */ + if (copy_head != itemidptr->itemoff + itemidptr->alignedlen) + { + memcpy((char *) page + upper, + scratchptr + copy_head, + copy_tail - copy_head); + + /* + * We've now copied all tuples already seen, but not the + * current tuple, so we set the copy_tail to the end of this + * tuple. + */ + copy_tail = itemidptr->itemoff + itemidptr->alignedlen; + } + /* shift the target offset down by the length of this tuple */ + upper -= itemidptr->alignedlen; + /* point the copy_head to the start of this tuple */ + copy_head = itemidptr->itemoff; + + /* update the line pointer to reference the new offset */ + lp->lp_off = upper; + } + + /* Copy the remaining chunk */ + memcpy((char *) page + upper, + scratchptr + copy_head, + copy_tail - copy_head); + } + + phdr->pd_upper = upper; +} + +/* + * PageRepairFragmentation + * + * Frees fragmented space on a heap page following pruning. + * + * This routine is usable for heap pages only, but see PageIndexMultiDelete. + * + * This routine removes unused line pointers from the end of the line pointer + * array. This is possible when dead heap-only tuples get removed by pruning, + * especially when there were HOT chains with several tuples each beforehand. + * + * Caller had better have a full cleanup lock on page's buffer. As a side + * effect the page's PD_HAS_FREE_LINES hint bit will be set or unset as + * needed. 
Caller might also need to account for a reduction in the length of + * the line pointer array following array truncation. + */ +void +TdePageRepairFragmentation(Relation rel, Buffer buffer, Page page) +{ + Offset pd_lower = ((PageHeader) page)->pd_lower; + Offset pd_upper = ((PageHeader) page)->pd_upper; + Offset pd_special = ((PageHeader) page)->pd_special; + Offset last_offset; + itemIdCompactData itemidbase[MaxHeapTuplesPerPage]; + itemIdCompact itemidptr; + ItemId lp; + int nline, + nstorage, + nunused; + OffsetNumber finalusedlp = InvalidOffsetNumber; + int i; + Size totallen; + bool presorted = true; /* For now */ + + /* + * It's worth the trouble to be more paranoid here than in most places, + * because we are about to reshuffle data in (what is usually) a shared + * disk buffer. If we aren't careful then corrupted pointers, lengths, + * etc could cause us to clobber adjacent disk buffers, spreading the data + * loss further. So, check everything. + */ + if (pd_lower < SizeOfPageHeaderData || + pd_lower > pd_upper || + pd_upper > pd_special || + pd_special > BLCKSZ || + pd_special != MAXALIGN(pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + pd_lower, pd_upper, pd_special))); + + /* + * Run through the line pointer array and collect data about live items. + */ + nline = PageGetMaxOffsetNumber(page); + itemidptr = itemidbase; + nunused = totallen = 0; + last_offset = pd_special; + for (i = FirstOffsetNumber; i <= nline; i++) + { + lp = PageGetItemId(page, i); + if (ItemIdIsUsed(lp)) + { + if (ItemIdHasStorage(lp)) + { + itemidptr->offsetindex = i - 1; + itemidptr->itemoff = ItemIdGetOffset(lp); + + if (last_offset > itemidptr->itemoff) + last_offset = itemidptr->itemoff; + else + presorted = false; + + if (unlikely(itemidptr->itemoff < (int) pd_upper || + itemidptr->itemoff >= (int) pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: %u", + itemidptr->itemoff))); + itemidptr->len = ItemIdGetLength(lp); + itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); + totallen += itemidptr->alignedlen; + itemidptr++; + } + + finalusedlp = i; /* Could be the final non-LP_UNUSED item */ + } + else + { + /* Unused entries should have lp_len = 0, but make sure */ + Assert(!ItemIdHasStorage(lp)); + ItemIdSetUnused(lp); + nunused++; + } + } + + nstorage = itemidptr - itemidbase; + if (nstorage == 0) + { + /* Page is completely empty, so just reset it quickly */ + ((PageHeader) page)->pd_upper = pd_special; + } + else + { + /* Need to compact the page the hard way */ + if (totallen > (Size) (pd_special - pd_lower)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item lengths: total %u, available space %u", + (unsigned int) totallen, pd_special - pd_lower))); + + pgtde_compactify_tuples(rel, buffer, itemidbase, nstorage, page, presorted); + } + + if (finalusedlp != nline) + { + /* The last line pointer is not the last used line pointer */ + int nunusedend = nline - finalusedlp; + + Assert(nunused >= nunusedend && nunusedend > 0); + + /* remove trailing unused line pointers from the count */ + nunused -= nunusedend; + /* truncate the line pointer array */ + ((PageHeader) page)->pd_lower -= (sizeof(ItemIdData) * nunusedend); + } + + /* Set hint bit for PageAddItemExtended */ + if (nunused > 0) + PageSetHasFreeLinePointers(page); + else + PageClearHasFreeLinePointers(page); +} diff --git a/src16/access/pg_tde_rewrite.c 
b/src16/access/pg_tde_rewrite.c index 7744cb84..964082a0 100644 --- a/src16/access/pg_tde_rewrite.c +++ b/src16/access/pg_tde_rewrite.c @@ -100,14 +100,18 @@ * *------------------------------------------------------------------------- */ +#include "pg_tde_defines.h" + #include "postgres.h" #include -#include "access/heapam.h" +#include "access/pg_tdeam.h" #include "access/pg_tdeam_xlog.h" -#include "access/heaptoast.h" -#include "access/rewriteheap.h" +#include "access/pg_tdetoast.h" +#include "access/pg_tde_rewrite.h" +#include "encryption/enc_tde.h" + #include "access/transam.h" #include "access/xact.h" #include "access/xloginsert.h" @@ -707,7 +711,7 @@ raw_tdeheap_insert(RewriteState state, HeapTuple tup) } /* And now we can insert the tuple into the page */ - newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, + newoff = TDE_PageAddItem(state->rs_new_rel->rd_locator, heaptup->t_tableOid, state->rs_blockno, page, (Item) heaptup->t_data, heaptup->t_len, InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); diff --git a/src16/access/pg_tde_vacuumlazy.c b/src16/access/pg_tde_vacuumlazy.c index ed318621..8a3f49ef 100644 --- a/src16/access/pg_tde_vacuumlazy.c +++ b/src16/access/pg_tde_vacuumlazy.c @@ -17,7 +17,7 @@ * This frees up the memory space dedicated to storing dead TIDs. * * In practice VACUUM will often complete its initial pass over the target - * heap relation without ever running out of space to store TIDs. This means + * pg_tde relation without ever running out of space to store TIDs. This means * that there only needs to be one call to lazy_vacuum, after the initial pass * completes. * @@ -26,22 +26,26 @@ * * * IDENTIFICATION - * src/backend/access/heap/vacuumlazy.c + * src/backend/access/pg_tde/vacuumlazy.c * *------------------------------------------------------------------------- */ +#include "pg_tde_defines.h" + #include "postgres.h" #include +#include "access/pg_tdeam.h" +#include "access/pg_tdeam_xlog.h" +#include "access/pg_tde_visibilitymap.h" +#include "encryption/enc_tde.h" + #include "access/amapi.h" #include "access/genam.h" -#include "access/heapam.h" -#include "access/pg_tdeam_xlog.h" #include "access/htup_details.h" #include "access/multixact.h" #include "access/transam.h" -#include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" diff --git a/src16/access/pg_tde_visibilitymap.c b/src16/access/pg_tde_visibilitymap.c index 45e8d627..bef5bbff 100644 --- a/src16/access/pg_tde_visibilitymap.c +++ b/src16/access/pg_tde_visibilitymap.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * visibilitymap.c + * tdeheap_visibilitymap.c * bitmap for tracking visibility of heap tuples * * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * src/backend/access/heap/visibilitymap.c + * src/backend/access/heap/pg_tde_visibilitymap.c * * INTERFACE ROUTINES * tdeheap_visibilitymap_clear - clear bits for one page in the visibility map @@ -84,10 +84,13 @@ * *------------------------------------------------------------------------- */ +#include "pg_tde_defines.h" + #include "postgres.h" #include "access/pg_tdeam_xlog.h" -#include "access/visibilitymap.h" +#include "access/pg_tde_visibilitymap.h" + #include "access/xloginsert.h" #include "access/xlogutils.h" #include "miscadmin.h" diff --git a/src16/access/pg_tdeam.c b/src16/access/pg_tdeam.c index 
1d59ec7b..e4d1267a 100644 --- a/src16/access/pg_tdeam.c +++ b/src16/access/pg_tdeam.c @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * - * heapam.c - * heap access method code + * pg_tdeam.c + * pg_tde access method code * * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * src/backend/access/heap/heapam.c + * contrib/pg_tde/pg_tdeam.c * * * INTERFACE ROUTINES @@ -24,19 +24,26 @@ * * NOTES * This file contains the tdeheap_ routines which implement - * the POSTGRES heap access method used for all POSTGRES + * the POSTGRES pg_tde access method used for all POSTGRES * relations. * *------------------------------------------------------------------------- */ + +#include "pg_tde_defines.h" + #include "postgres.h" +#include "access/pg_tdeam.h" +#include "access/pg_tdeam_xlog.h" +#include "access/pg_tdetoast.h" +#include "access/pg_tde_io.h" +#include "access/pg_tde_visibilitymap.h" +#include "access/pg_tde_slot.h" +#include "encryption/enc_tde.h" + #include "access/bufmask.h" #include "access/genam.h" -#include "access/heapam.h" -#include "access/pg_tdeam_xlog.h" -#include "access/heaptoast.h" -#include "access/hio.h" #include "access/multixact.h" #include "access/parallel.h" #include "access/relscan.h" @@ -46,7 +53,6 @@ #include "access/tableam.h" #include "access/transam.h" #include "access/valid.h" -#include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -71,6 +77,7 @@ #include "utils/relcache.h" #include "utils/snapmgr.h" #include "utils/spccache.h" +#include "utils/memutils.h" static HeapTuple tdeheap_prepare_insert(Relation relation, HeapTuple tup, @@ -1101,10 +1108,10 @@ tdeheap_getnext(TableScanDesc sscan, ScanDirection direction) * rather than the AM oid, is that this allows to write regression tests * that create another AM reusing the heap handler. 
*/ - if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine())) + if (unlikely(sscan->rs_rd->rd_tableam != GetPGTdeamTableAmRoutine())) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg_internal("only heap AM is supported"))); + errmsg_internal("only pg_tde AM is supported"))); /* * We don't expect direct calls to tdeheap_getnext with valid CheckXidAlive @@ -1152,6 +1159,7 @@ tdeheap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot if (scan->rs_ctup.t_data == NULL) { + TdeSlotForgetDecryptedTuple(slot); ExecClearTuple(slot); return false; } @@ -1163,7 +1171,7 @@ tdeheap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); - ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, + PGTdeExecStoreBufferHeapTuple(sscan->rs_rd, &scan->rs_ctup, slot, scan->rs_cbuf); return true; } @@ -1259,6 +1267,7 @@ tdeheap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, if (scan->rs_ctup.t_data == NULL) { + TdeSlotForgetDecryptedTuple(slot); ExecClearTuple(slot); return false; } @@ -1311,7 +1320,7 @@ tdeheap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, */ pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); - ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf); + PGTdeExecStoreBufferHeapTuple(sscan->rs_rd, &scan->rs_ctup, slot, scan->rs_cbuf); return true; } @@ -1881,11 +1890,18 @@ tdeheap_insert(Relation relation, HeapTuple tup, CommandId cid, */ CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + /* + * Make sure relation keys in the cahce to avoid pallocs in + * the critical section. + */ + GetRelationKey(relation->rd_locator); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, - (options & HEAP_INSERT_SPECULATIVE) != 0); + (options & HEAP_INSERT_TDE_NO_ENCRYPT) == 0, + (options & HEAP_INSERT_SPECULATIVE) != 0); if (PageIsAllVisible(BufferGetPage(buffer))) { @@ -1975,9 +1991,11 @@ tdeheap_insert(Relation relation, HeapTuple tup, CommandId cid, */ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + /* register encrypted tuple data from the buffer */ + PageHeader phdr = (PageHeader) BufferGetPage(buffer); /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ XLogRegisterBufData(0, - (char *) heaptup->t_data + SizeofHeapTupleHeader, + ((char *) phdr) + phdr->pd_upper + SizeofHeapTupleHeader, heaptup->t_len - SizeofHeapTupleHeader); /* filtering by origin on a row level is much more efficient */ @@ -2213,6 +2231,12 @@ tdeheap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN)) all_frozen_set = true; + /* + * Make sure relation keys in the cahce to avoid pallocs in + * the critical section. + */ + GetRelationKey(relation->rd_locator); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -2220,7 +2244,7 @@ tdeheap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, * tdeheap_RelationGetBufferForTuple has ensured that the first tuple fits. * Put that on the page, and then as many other tuples as fit. 
*/ - tdeheap_RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); + tdeheap_RelationPutHeapTuple(relation, buffer, heaptuples[ndone], true, false); /* * For logical decoding we need combo CIDs to properly decode the @@ -2236,7 +2260,7 @@ tdeheap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) break; - tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, false); + tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, true, false); /* * For logical decoding we need combo CIDs to properly decode the @@ -2335,10 +2359,12 @@ tdeheap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuphdr->t_infomask = heaptup->t_data->t_infomask; tuphdr->t_hoff = heaptup->t_data->t_hoff; + /* Point to an encrypted tuple data in the Buffer */ + char *tup_data_on_page = (char *) page + ItemIdGetOffset(PageGetItemId(page, heaptup->t_self.ip_posid)); /* write bitmap [+ padding] [+ oid] + data */ datalen = heaptup->t_len - SizeofHeapTupleHeader; memcpy(scratchptr, - (char *) heaptup->t_data + SizeofHeapTupleHeader, + tup_data_on_page + SizeofHeapTupleHeader, datalen); tuphdr->datalen = datalen; scratchptr += datalen; @@ -2543,6 +2569,7 @@ tdeheap_delete(Relation relation, ItemPointer tid, bool all_visible_cleared = false; HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ bool old_key_copied = false; + HeapTuple decrypted_tuple; Assert(ItemPointerIsValid(tid)); @@ -2769,8 +2796,16 @@ tdeheap_delete(Relation relation, ItemPointer tid, /* * Compute replica identity tuple before entering the critical section so * we don't PANIC upon a memory allocation failure. + * + * ExtractReplicaIdentity has to get a decrypted tuple, otherwise it + * won't be able to extract varlen attributes. */ - old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied); + decrypted_tuple = tdeheap_copytuple(&tp); + PG_TDE_DECRYPT_TUPLE(&tp, decrypted_tuple, GetRelationKey(relation->rd_locator)); + + old_key_tuple = ExtractReplicaIdentity(relation, decrypted_tuple, true, &old_key_copied); + + tdeheap_freetuple(decrypted_tuple); /* * If this is the first possibly-multixact-able operation in the current @@ -3005,6 +3040,8 @@ tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, Bitmapset *modified_attrs; ItemId lp; HeapTupleData oldtup; + HeapTupleData oldtup_decrypted; + void* oldtup_data; HeapTuple heaptup; HeapTuple old_key_tuple = NULL; bool old_key_copied = false; @@ -3104,8 +3141,24 @@ tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, */ oldtup.t_tableOid = RelationGetRelid(relation); oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + oldtup_data = oldtup.t_data; oldtup.t_len = ItemIdGetLength(lp); oldtup.t_self = *otid; + /* decrypt the old tuple */ + { + char* new_ptr = NULL; + new_ptr = MemoryContextAlloc(CurTransactionContext, oldtup.t_len); + memcpy(new_ptr, oldtup.t_data, oldtup.t_data->t_hoff); + // only neccessary field + oldtup_decrypted.t_data = (HeapTupleHeader)new_ptr; + } + PG_TDE_DECRYPT_TUPLE(&oldtup, &oldtup_decrypted, + GetRelationKey(relation->rd_locator)); + + // change field in oldtup now. 
+ // We can't do it before, as PG_TDE_DECRYPT_TUPLE uses t_data address in + // calculations + oldtup.t_data = oldtup_decrypted.t_data; /* the new tuple is ready, except for this: */ newtup->t_tableOid = RelationGetRelid(relation); @@ -3164,6 +3217,8 @@ tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, * use otid anymore. */ + oldtup.t_data = oldtup_data; + l2: checked_lockers = false; locker_remains = false; @@ -3610,7 +3665,7 @@ tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, if (need_toast) { /* Note we always use WAL and FSM during updates */ - heaptup = tdeheap_toast_insert_or_update(relation, newtup, &oldtup, 0); + heaptup = tdeheap_toast_insert_or_update(relation, newtup, &oldtup_decrypted, 0); newtupsize = MAXALIGN(heaptup->t_len); } else @@ -3746,6 +3801,12 @@ tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, id_has_external, &old_key_copied); + /* + * Make sure relation keys in the cahce to avoid pallocs in + * the critical section. + */ + GetRelationKey(relation->rd_locator); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -3780,7 +3841,7 @@ tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, HeapTupleClearHeapOnly(newtup); } - tdeheap_RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */ + tdeheap_RelationPutHeapTuple(relation, newbuf, heaptup, true, false); /* insert new tuple */ /* Clear obsolete visibility flags, possibly set by ourselves above... */ @@ -4024,7 +4085,6 @@ HeapDetermineColumnsInfo(Relation relation, */ value1 = tdeheap_getattr(oldtup, attrnum, tupdesc, &isnull1); value2 = tdeheap_getattr(newtup, attrnum, tupdesc, &isnull2); - if (!tdeheap_attr_equals(tupdesc, attrnum, value1, value2, isnull1, isnull2)) { @@ -5189,7 +5249,7 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, /* * Note: we *must* check TransactionIdIsInProgress before - * TransactionIdDidAbort/Commit; see comment at top of pg_tdeam_visibility.c + * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c * for an explanation. */ if (TransactionIdIsCurrentTransactionId(xid)) @@ -6262,7 +6322,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, /* * As with all tuple visibility routines, it's critical to test * TransactionIdIsInProgress before TransactionIdDidCommit, because of - * race conditions explained in detail in pg_tdeam_visibility.c. + * race conditions explained in detail in heapam_visibility.c. */ if (TransactionIdIsCurrentTransactionId(xid) || TransactionIdIsInProgress(xid)) @@ -8388,6 +8448,7 @@ log_tdeheap_update(Relation reln, Buffer oldbuf, suffixlen = 0; XLogRecPtr recptr; Page page = BufferGetPage(newbuf); + PageHeader phdr = (PageHeader) page; bool need_tuple_data = RelationIsLogicallyLogged(reln); bool init; int bufflags; @@ -8539,11 +8600,12 @@ log_tdeheap_update(Relation reln, Buffer oldbuf, * * The 'data' doesn't include the common prefix or suffix. 
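/*
 * Editor's sketch, not part of the patch: why the WAL-registration changes in
 * this patch (tdeheap_insert and log_tdeheap_update) can use
 * "(char *) phdr + phdr->pd_upper" in place of the in-memory tuple data.
 * PageAddItem()/TDE_PageAddItem() allocate the most recently added tuple at
 * the page's upper pointer, so immediately after the insertion pd_upper is
 * the page offset of that (encrypted) on-page copy.  "buffer" and "offnum"
 * are placeholders for the buffer and offset of the tuple just put on the
 * page.
 */
static inline void
assert_newest_tuple_at_pd_upper(Buffer buffer, OffsetNumber offnum)
{
	Page		page = BufferGetPage(buffer);
	PageHeader	phdr = (PageHeader) page;
	ItemId		lp = PageGetItemId(page, offnum);

	Assert(ItemIdGetOffset(lp) == phdr->pd_upper);
	Assert((char *) PageGetItem(page, lp) == (char *) phdr + phdr->pd_upper);
}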
*/ + /* We write an encrypted newtuple data from the buffer */ XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); if (prefixlen == 0) { XLogRegisterBufData(0, - ((char *) newtup->t_data) + SizeofHeapTupleHeader, + ((char *) phdr) + phdr->pd_upper + SizeofHeapTupleHeader, newtup->t_len - SizeofHeapTupleHeader - suffixlen); } else @@ -8556,13 +8618,13 @@ log_tdeheap_update(Relation reln, Buffer oldbuf, if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0) { XLogRegisterBufData(0, - ((char *) newtup->t_data) + SizeofHeapTupleHeader, + ((char *) phdr) + phdr->pd_upper + SizeofHeapTupleHeader, newtup->t_data->t_hoff - SizeofHeapTupleHeader); } /* data after common prefix */ XLogRegisterBufData(0, - ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen, + ((char *) phdr) + phdr->pd_upper + newtup->t_data->t_hoff + prefixlen, newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen); } @@ -8808,6 +8870,7 @@ tdeheap_xlog_prune(XLogReaderState *record) int ndead; int nunused; Size datalen; + Relation reln; redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); @@ -8820,7 +8883,8 @@ tdeheap_xlog_prune(XLogReaderState *record) Assert(nunused >= 0); /* Update all line pointers per the record, and repair fragmentation */ - tdeheap_page_prune_execute(buffer, + reln = CreateFakeRelcacheEntry(rlocator); + tdeheap_page_prune_execute(reln, buffer, redirected, nredirected, nowdead, ndead, nowunused, nunused); @@ -9314,7 +9378,7 @@ tdeheap_xlog_insert(XLogReaderState *record) HeapTupleHeaderSetCmin(htup, FirstCommandId); htup->t_ctid = target_tid; - if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, + if (TDE_PageAddItem(target_locator, target_locator.spcOid, blkno, page, (Item) htup, newlen, xlrec->offnum, true, true) == InvalidOffsetNumber) elog(PANIC, "failed to add tuple"); @@ -9458,7 +9522,7 @@ tdeheap_xlog_multi_insert(XLogReaderState *record) ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + offnum = TDE_PageAddItem(rlocator, rlocator.spcOid, blkno, page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple"); } @@ -9732,7 +9796,7 @@ tdeheap_xlog_update(XLogReaderState *record, bool hot_update) /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + offnum = TDE_PageAddItem(rlocator, rlocator.spcOid, newblk, page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple"); @@ -10019,12 +10083,12 @@ tdeheap_redo(XLogReaderState *record) tdeheap_xlog_inplace(record); break; default: - elog(PANIC, "heap_redo: unknown op code %u", info); + elog(PANIC, "pg_tde_redo: unknown op code %u", info); } } void -tdeheap2_redo(XLogReaderState *record) +heapam2_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; @@ -10059,7 +10123,7 @@ tdeheap2_redo(XLogReaderState *record) tdeheap_xlog_logical_rewrite(record); break; default: - elog(PANIC, "tdeheap2_redo: unknown op code %u", info); + elog(PANIC, "heap2_redo: unknown op code %u", info); } } diff --git a/src16/access/pg_tdeam_handler.c b/src16/access/pg_tdeam_handler.c index afda52fb..f237444b 100644 --- a/src16/access/pg_tdeam_handler.c +++ b/src16/access/pg_tdeam_handler.c @@ -17,13 +17,22 @@ * 
*------------------------------------------------------------------------- */ + +#include "pg_tde_defines.h" + #include "postgres.h" +#include "access/pg_tde_slot.h" + +#include "access/pg_tdeam.h" +#include "access/pg_tdetoast.h" +#include "access/pg_tde_rewrite.h" +#include "access/pg_tde_tdemap.h" + +#include "encryption/enc_tde.h" + #include "access/genam.h" -#include "access/heapam.h" -#include "access/heaptoast.h" #include "access/multixact.h" -#include "access/rewriteheap.h" #include "access/syncscan.h" #include "access/tableam.h" #include "access/tsmapi.h" @@ -45,6 +54,12 @@ #include "utils/builtins.h" #include "utils/rel.h" +PG_FUNCTION_INFO_V1(pg_tdeam_basic_handler); +#ifdef PERCONA_FORK +PG_FUNCTION_INFO_V1(pg_tdeam_handler); +#endif + + static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); @@ -159,7 +174,7 @@ pg_tdeam_index_fetch_tuple(struct IndexFetchTableData *scan, *call_again = !IsMVCCSnapshot(snapshot); slot->tts_tableOid = RelationGetRelid(scan->rel); - ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf); + PGTdeExecStoreBufferHeapTuple(scan->rel, &bslot->base.tupdata, slot, hscan->xs_cbuf); } else { @@ -191,7 +206,7 @@ pg_tdeam_fetch_row_version(Relation relation, if (tdeheap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false)) { /* store in slot, transferring existing pin */ - ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer); + PGTdeExecStorePinnedBufferHeapTuple(relation, &bslot->base.tupdata, slot, buffer); slot->tts_tableOid = RelationGetRelid(relation); return true; @@ -565,7 +580,7 @@ pg_tdeam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, tuple->t_tableOid = slot->tts_tableOid; /* store in slot, transferring existing pin */ - ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); + PGTdeExecStorePinnedBufferHeapTuple(relation, tuple, slot, buffer); return result; } @@ -624,6 +639,17 @@ pg_tdeam_relation_set_new_filelocator(Relation rel, } smgrclose(srel); + + /* Update TDE filemap */ + if (rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE) + { + ereport(DEBUG1, + (errmsg("creating key file for relation %s", RelationGetRelationName(rel)))); + + pg_tde_create_key_map_entry(newrlocator); + } } static void @@ -1148,7 +1174,7 @@ pg_tdeam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, if (sample_it) { - ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf); + PGTdeExecStoreBufferHeapTuple(scan->rs_rd, targtuple, slot, hscan->rs_cbuf); hscan->rs_cindex++; /* note that we leave the buffer locked here! 
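The pg_tdeam_relation_set_new_filelocator() hunk above creates a key map entry only for relation kinds that own heap storage. A condensed sketch of that check (the helper name is illustrative, not part of the patch; needs utils/rel.h):

    /* Only plain tables, materialized views and TOAST tables get a
     * pg_tde key map entry. */
    static bool
    tde_wants_key_map_entry(Relation rel)
    {
        char    relkind = rel->rd_rel->relkind;

        return relkind == RELKIND_RELATION ||
               relkind == RELKIND_MATVIEW ||
               relkind == RELKIND_TOASTVALUE;
    }
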
*/ @@ -1159,7 +1185,7 @@ pg_tdeam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, /* Now release the lock and pin on the page */ UnlockReleaseBuffer(hscan->rs_cbuf); hscan->rs_cbuf = InvalidBuffer; - + TdeSlotForgetDecryptedTuple(slot); /* also prevent old slot contents from having pin on page */ ExecClearTuple(slot); @@ -1628,7 +1654,7 @@ pg_tdeam_index_build_range_scan(Relation heapRelation, MemoryContextReset(econtext->ecxt_per_tuple_memory); /* Set up for predicate or expression evaluation */ - ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf); + PGTdeExecStoreBufferHeapTuple(heapRelation, heapTuple, slot, hscan->rs_cbuf); /* * In a partial index, discard tuples that don't satisfy the @@ -2262,7 +2288,7 @@ pg_tdeam_scan_bitmap_next_tuple(TableScanDesc scan, * Set up the result slot to point to this tuple. Note that the slot * acquires a pin on the buffer. */ - ExecStoreBufferHeapTuple(&hscan->rs_ctup, + PGTdeExecStoreBufferHeapTuple(scan->rs_rd, &hscan->rs_ctup, slot, hscan->rs_cbuf); @@ -2416,7 +2442,7 @@ pg_tdeam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, if (!pagemode) LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf); + PGTdeExecStoreBufferHeapTuple(scan->rs_rd, tuple, slot, hscan->rs_cbuf); /* Count successfully-fetched tuples as heap fetches */ pgstat_count_tdeheap_getnext(scan->rs_rd); @@ -2431,7 +2457,16 @@ pg_tdeam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, */ if (!pagemode) LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - + /* + * Hack: + * The issue is that, The previous call that would have used the same + * TupleTableSlot would have just deleted the memory context for the slot + * and refrained from calling the clear slot function. So, the slot would + * have the non NULL pointer to the decrypted tuple which is now invalid. + * So, we need to explicitly clear the decrypted tuple pointer before + * calling the clear slot function. + */ + TdeSlotForgetDecryptedTuple(slot); ExecClearTuple(slot); return false; } @@ -2601,15 +2636,28 @@ static const TableAmRoutine pg_tdeam_methods = { .scan_sample_next_tuple = pg_tdeam_scan_sample_next_tuple }; - const TableAmRoutine * -GetHeapamTableAmRoutine(void) +GetPGTdeamTableAmRoutine(void) { return &pg_tdeam_methods; } Datum -tdeheap_tableam_handler(PG_FUNCTION_ARGS) +pg_tdeam_basic_handler(PG_FUNCTION_ARGS) { PG_RETURN_POINTER(&pg_tdeam_methods); } + +#ifdef PERCONA_FORK +Datum +pg_tdeam_handler(PG_FUNCTION_ARGS) +{ + PG_RETURN_POINTER(GetHeapamTableAmRoutine()); +} +#endif + +bool +is_tdeheap_rel(Relation rel) +{ + return (rel->rd_tableam == (TableAmRoutine *) &pg_tdeam_methods); +} diff --git a/src16/access/pg_tdeam_visibility.c b/src16/access/pg_tdeam_visibility.c index 5e5d184d..c037e30c 100644 --- a/src16/access/pg_tdeam_visibility.c +++ b/src16/access/pg_tdeam_visibility.c @@ -64,9 +64,12 @@ *------------------------------------------------------------------------- */ +#include "pg_tde_defines.h" + #include "postgres.h" -#include "access/heapam.h" +#include "access/pg_tdeam.h" + #include "access/htup_details.h" #include "access/multixact.h" #include "access/subtrans.h" @@ -96,7 +99,7 @@ * future re-examination of the tuple. * * We can always set hint bits when marking a transaction aborted. (Some - * code in heapam.c relies on that!) + * code in pg_tdeam.c relies on that!) 
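The hunks above that add TdeSlotForgetDecryptedTuple() all use the same ordering: drop the slot's pointer to the decrypted copy first, then clear the slot, because the memory holding that copy may already have been released. A minimal sketch of the pattern (the wrapper name is illustrative, not part of the patch; needs access/pg_tde_slot.h and executor/tuptable.h):

    /* Reset a slot that may still point at a decrypted tuple copy. */
    static void
    tde_reset_slot(TupleTableSlot *slot)
    {
        TdeSlotForgetDecryptedTuple(slot); /* forget the decrypted pointer... */
        ExecClearTuple(slot);              /* ...then clear the slot as usual */
    }
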
* * Also, if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then * we can always set the hint bits, since pre-9.0 VACUUM FULL always used diff --git a/src16/access/pg_tdetoast.c b/src16/access/pg_tdetoast.c index 56b7de0c..6b4d45d5 100644 --- a/src16/access/pg_tdetoast.c +++ b/src16/access/pg_tdetoast.c @@ -21,16 +21,32 @@ * *------------------------------------------------------------------------- */ +#include "pg_tde_defines.h" #include "postgres.h" +#include "access/pg_tdeam.h" +#include "access/pg_tdetoast.h" + #include "access/detoast.h" #include "access/genam.h" -#include "access/heapam.h" -#include "access/heaptoast.h" #include "access/toast_helper.h" #include "access/toast_internals.h" +#include "miscadmin.h" #include "utils/fmgroids.h" +#include "utils/snapmgr.h" +#include "encryption/enc_tde.h" + +#define TDE_TOAST_COMPRESS_HEADER_SIZE (VARHDRSZ_COMPRESSED - VARHDRSZ) + +static void tdeheap_toast_tuple_externalize(ToastTupleContext *ttc, + int attribute, int options); +static Datum tdeheap_toast_save_datum(Relation rel, Datum value, + struct varlena *oldexternal, + int options); +static void tdeheap_toast_encrypt(Pointer dval, Oid valueid, RelKeyData *keys); +static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); +static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); /* ---------- @@ -640,6 +656,10 @@ tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, int num_indexes; int validIndex; SnapshotData SnapshotToast; + char decrypted_data[TOAST_MAX_CHUNK_SIZE]; + RelKeyData *key = GetRelationKey(toastrel->rd_locator); + char iv_prefix[16] = {0,}; + /* Look for the valid index of toast relation */ validIndex = toast_open_indexes(toastrel, @@ -689,6 +709,8 @@ tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], &SnapshotToast, nscankeys, toastkey); + memcpy(iv_prefix, &valueid, sizeof(Oid)); + /* * Read the chunks by index * @@ -705,6 +727,7 @@ tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, int32 expected_size; int32 chcpystrt; int32 chcpyend; + int32 encrypt_offset; /* * Have a chunk, extract the sequence number and the data @@ -769,9 +792,34 @@ tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, if (curchunk == endchunk) chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE; + /* + * If TOAST is compressed, the first TDE_TOAST_COMPRESS_HEADER_SIZE (4 bytes) is + * not encrypted and contains compression info. It should be added to the + * result as it is and the rest should be decrypted. Encryption offset in + * that case will be 0 for the first chunk (despite the encrypted data + * starting with the offset TDE_TOAST_COMPRESS_HEADER_SIZE, we've encrypted it + * without compression headers) and `chunk start offset - 4` for the next + * chunks. 
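A minimal sketch of how the fetch path ties all chunks of one value to a single ciphertext stream: the IV prefix is derived from the value's chunk_id OID, and each chunk is decrypted at its byte offset within the encrypted payload (the wrapper below is illustrative, not part of the patch; RelKeyData and PG_TDE_DECRYPT_DATA come from the pg_tde encryption headers):

    static void
    tde_decrypt_toast_chunk(Oid valueid, int32 stream_offset,
                            char *chunk, int32 chunklen,
                            char *out, RelKeyData *key)
    {
        char    iv_prefix[16] = {0};

        memcpy(iv_prefix, &valueid, sizeof(Oid));   /* same prefix for every chunk */
        PG_TDE_DECRYPT_DATA(iv_prefix, stream_offset, chunk, chunklen, out, key);
    }
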
+ */ + encrypt_offset = chcpystrt; + if (VARATT_IS_COMPRESSED(result)) { + if (curchunk == 0) { + memcpy(VARDATA(result), chunkdata + chcpystrt, TDE_TOAST_COMPRESS_HEADER_SIZE); + chcpystrt += TDE_TOAST_COMPRESS_HEADER_SIZE; + } else { + encrypt_offset -= TDE_TOAST_COMPRESS_HEADER_SIZE; + } + } + /* Decrypt the data chunk by chunk here */ + + PG_TDE_DECRYPT_DATA(iv_prefix, (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + encrypt_offset, + chunkdata + chcpystrt, + (chcpyend - chcpystrt) + 1, + decrypted_data, key); + memcpy(VARDATA(result) + (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, - chunkdata + chcpystrt, + decrypted_data, (chcpyend - chcpystrt) + 1); expectedchunk++; @@ -791,3 +839,424 @@ tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, systable_endscan_ordered(toastscan); toast_close_indexes(toastidxs, num_indexes, AccessShareLock); } +// TODO: these should be in their own file so we can proplerly autoupdate them +/* pg_tde extension */ +static void +tdeheap_toast_encrypt(Pointer dval, Oid valueid, RelKeyData *key) +{ + int32 data_size =0; + char* data_p; + char* encrypted_data; + char iv_prefix[16] = {0,}; + + /* + * Encryption specific data_p and data_size as we have to avoid + * encryption of the compression info. + * See https://github.com/Percona-Lab/pg_tde/commit/dee6e357ef05d217a4c4df131249a80e5e909163 + */ + if (VARATT_IS_SHORT(dval)) + { + data_p = VARDATA_SHORT(dval); + data_size = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; + } + else if (VARATT_IS_COMPRESSED(dval)) + { + data_p = VARDATA_4B_C(dval); + data_size = VARSIZE(dval) - VARHDRSZ_COMPRESSED; + } + else + { + data_p = VARDATA(dval); + data_size = VARSIZE(dval) - VARHDRSZ; + } + /* Now encrypt the data and replace it in ttc */ + encrypted_data = (char *)palloc(data_size); + + memcpy(iv_prefix, &valueid, sizeof(Oid)); + PG_TDE_ENCRYPT_DATA(iv_prefix, 0, data_p, data_size, encrypted_data, key); + + memcpy(data_p, encrypted_data, data_size); + pfree(encrypted_data); +} + +/* + * Move an attribute to external storage. + * + * copy from PG src/backend/access/table/toast_helper.c + */ +static void +tdeheap_toast_tuple_externalize(ToastTupleContext *ttc, int attribute, int options) +{ + Datum *value = &ttc->ttc_values[attribute]; + Datum old_value = *value; + ToastAttrInfo *attr = &ttc->ttc_attr[attribute]; + + attr->tai_colflags |= TOASTCOL_IGNORE; + *value = tdeheap_toast_save_datum(ttc->ttc_rel, old_value, attr->tai_oldexternal, + options); + if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0) + pfree(DatumGetPointer(old_value)); + attr->tai_colflags |= TOASTCOL_NEEDS_FREE; + ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE); +} + +/* ---------- + * tdeheap_toast_save_datum - + * + * Save one single datum into the secondary relation and return + * a Datum reference for it. + * It also encrypts toasted data. + * + * rel: the main relation we're working with (not the toast rel!) 
+ * value: datum to be pushed to toast storage + * oldexternal: if not NULL, toast pointer previously representing the datum + * options: options to be passed to tdeheap_insert() for toast rows + * + * based on toast_save_datum from PG src/backend/access/common/toast_internals.c + * ---------- + */ +static Datum +tdeheap_toast_save_datum(Relation rel, Datum value, + struct varlena *oldexternal, int options) +{ + Relation toastrel; + Relation *toastidxs; + HeapTuple toasttup; + TupleDesc toasttupDesc; + Datum t_values[3]; + bool t_isnull[3]; + CommandId mycid = GetCurrentCommandId(true); + struct varlena *result; + struct varatt_external toast_pointer; + union + { + struct varlena hdr; + /* this is to make the union big enough for a chunk: */ + char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; + /* ensure union is aligned well enough: */ + int32 align_it; + } chunk_data; + int32 chunk_size; + int32 chunk_seq = 0; + char *data_p; + int32 data_todo; + Pointer dval = DatumGetPointer(value); + int num_indexes; + int validIndex; + + + Assert(!VARATT_IS_EXTERNAL(value)); + + /* + * Open the toast relation and its indexes. We can use the index to check + * uniqueness of the OID we assign to the toasted item, even though it has + * additional columns besides OID. + */ + toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); + toasttupDesc = toastrel->rd_att; + + /* Open all the toast indexes and look for the valid one */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Get the data pointer and length, and compute va_rawsize and va_extinfo. + * + * va_rawsize is the size of the equivalent fully uncompressed datum, so + * we have to adjust for short headers. + * + * va_extinfo stored the actual size of the data payload in the toast + * records and the compression method in first 2 bits if data is + * compressed. + */ + if (VARATT_IS_SHORT(dval)) + { + data_p = VARDATA_SHORT(dval); + data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; + toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ + toast_pointer.va_extinfo = data_todo; + } + else if (VARATT_IS_COMPRESSED(dval)) + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + /* rawsize in a compressed datum is just the size of the payload */ + toast_pointer.va_rawsize = VARDATA_COMPRESSED_GET_EXTSIZE(dval) + VARHDRSZ; + + /* set external size and compression method */ + VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, data_todo, + VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval)); + /* Assert that the numbers look like it's compressed */ + Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); + } + else + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + toast_pointer.va_rawsize = VARSIZE(dval); + toast_pointer.va_extinfo = data_todo; + } + + /* + * Insert the correct table OID into the result TOAST pointer. + * + * Normally this is the actual OID of the target toast table, but during + * table-rewriting operations such as CLUSTER, we have to insert the OID + * of the table's real permanent toast table instead. rd_toastoid is set + * if we have to substitute such an OID. + */ + if (OidIsValid(rel->rd_toastoid)) + toast_pointer.va_toastrelid = rel->rd_toastoid; + else + toast_pointer.va_toastrelid = RelationGetRelid(toastrel); + + /* + * Choose an OID to use as the value ID for this toast value. + * + * Normally we just choose an unused OID within the toast table. 
But + * during table-rewriting operations where we are preserving an existing + * toast table OID, we want to preserve toast value OIDs too. So, if + * rd_toastoid is set and we had a prior external value from that same + * toast table, re-use its value ID. If we didn't have a prior external + * value (which is a corner case, but possible if the table's attstorage + * options have been changed), we have to pick a value ID that doesn't + * conflict with either new or existing toast value OIDs. + */ + if (!OidIsValid(rel->rd_toastoid)) + { + /* normal case: just choose an unused OID */ + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } + else + { + /* rewrite case: check to see if value was in old toast table */ + toast_pointer.va_valueid = InvalidOid; + if (oldexternal != NULL) + { + struct varatt_external old_toast_pointer; + + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); + if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) + { + /* This value came from the old toast table; reuse its OID */ + toast_pointer.va_valueid = old_toast_pointer.va_valueid; + + /* + * There is a corner case here: the table rewrite might have + * to copy both live and recently-dead versions of a row, and + * those versions could easily reference the same toast value. + * When we copy the second or later version of such a row, + * reusing the OID will mean we select an OID that's already + * in the new toast table. Check for that, and if so, just + * fall through without writing the data again. + * + * While annoying and ugly-looking, this is a good thing + * because it ensures that we wind up with only one copy of + * the toast value when there is only one copy in the old + * toast table. Before we detected this case, we'd have made + * multiple copies, wasting space; and what's worse, the + * copies belonging to already-deleted heap tuples would not + * be reclaimed by VACUUM. + */ + if (toastrel_valueid_exists(toastrel, + toast_pointer.va_valueid)) + { + /* Match, so short-circuit the data storage loop below */ + data_todo = 0; + } + } + } + if (toast_pointer.va_valueid == InvalidOid) + { + /* + * new value; must choose an OID that doesn't conflict in either + * old or new toast table + */ + do + { + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } while (toastid_valueid_exists(rel->rd_toastoid, + toast_pointer.va_valueid)); + } + } + + /* + * Encrypt toast data. + */ + tdeheap_toast_encrypt(dval, toast_pointer.va_valueid, GetRelationKey(toastrel->rd_locator)); + + /* + * Initialize constant parts of the tuple data + */ + t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); + t_values[2] = PointerGetDatum(&chunk_data); + t_isnull[0] = false; + t_isnull[1] = false; + t_isnull[2] = false; + + /* + * Split up the item into chunks + */ + while (data_todo > 0) + { + int i; + + CHECK_FOR_INTERRUPTS(); + + /* + * Calculate the size of this chunk + */ + chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); + + /* + * Build a tuple and store it + */ + t_values[1] = Int32GetDatum(chunk_seq++); + SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); + memcpy(VARDATA(&chunk_data), data_p, chunk_size); + toasttup = tdeheap_form_tuple(toasttupDesc, t_values, t_isnull); + + /* + * The tuple should be insterted not encrypted. + * TOAST data already encrypted. 
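On the save side, the datum is encrypted once in full before the chunk loop, and each chunk insert is then flagged so the heap layer does not encrypt it again; that is what lets the fetch path decrypt every chunk at its plain byte offset in one ciphertext stream. Condensed from the surrounding code (fragment, not a standalone function):

    /* one encryption pass over the whole payload, before chunking */
    tdeheap_toast_encrypt(dval, toast_pointer.va_valueid,
                          GetRelationKey(toastrel->rd_locator));

    /* ... build each chunk tuple ... */

    /* chunks are inserted as-is, without a second page-level encryption */
    options |= HEAP_INSERT_TDE_NO_ENCRYPT;
    tdeheap_insert(toastrel, toasttup, mycid, options, NULL);
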
+ */ + options |= HEAP_INSERT_TDE_NO_ENCRYPT; + tdeheap_insert(toastrel, toasttup, mycid, options, NULL); + + /* + * Create the index entry. We cheat a little here by not using + * FormIndexDatum: this relies on the knowledge that the index columns + * are the same as the initial columns of the table for all the + * indexes. We also cheat by not providing an IndexInfo: this is okay + * for now because btree doesn't need one, but we might have to be + * more honest someday. + * + * Note also that there had better not be any user-created index on + * the TOAST table, since we don't bother to update anything else. + */ + for (i = 0; i < num_indexes; i++) + { + /* Only index relations marked as ready can be updated */ + if (toastidxs[i]->rd_index->indisready) + index_insert(toastidxs[i], t_values, t_isnull, + &(toasttup->t_self), + toastrel, + toastidxs[i]->rd_index->indisunique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + false, NULL); + } + + /* + * Free memory + */ + tdeheap_freetuple(toasttup); + + /* + * Move on to next chunk + */ + data_todo -= chunk_size; + data_p += chunk_size; + } + + /* + * Done - close toast relation and its indexes but keep the lock until + * commit, so as a concurrent reindex done directly on the toast relation + * would be able to wait for this transaction. + */ + toast_close_indexes(toastidxs, num_indexes, NoLock); + table_close(toastrel, NoLock); + + /* + * Create the TOAST pointer value that we'll return + */ + result = (struct varlena *) palloc(TOAST_POINTER_SIZE); + SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); + memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); + + return PointerGetDatum(result); +} + +/* ---------- + * toastrel_valueid_exists - + * + * Test whether a toast value with the given ID exists in the toast relation. + * For safety, we consider a value to exist if there are either live or dead + * toast rows with that ID; see notes for GetNewOidWithIndex(). + * + * copy from PG src/backend/access/common/toast_internals.c + * ---------- + */ +static bool +toastrel_valueid_exists(Relation toastrel, Oid valueid) +{ + bool result = false; + ScanKeyData toastkey; + SysScanDesc toastscan; + int num_indexes; + int validIndex; + Relation *toastidxs; + + /* Fetch a valid index relation */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to find chunks with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(valueid)); + + /* + * Is there any such chunk? 
+ */ + toastscan = systable_beginscan(toastrel, + RelationGetRelid(toastidxs[validIndex]), + true, SnapshotAny, 1, &toastkey); + + if (systable_getnext(toastscan) != NULL) + result = true; + + systable_endscan(toastscan); + + /* Clean up */ + toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); + + return result; +} + +/* ---------- + * toastid_valueid_exists - + * + * As above, but work from toast rel's OID not an open relation + * + * copy from PG src/backend/access/common/toast_internals.c + * ---------- + */ +static bool +toastid_valueid_exists(Oid toastrelid, Oid valueid) +{ + bool result; + Relation toastrel; + + toastrel = table_open(toastrelid, AccessShareLock); + + result = toastrel_valueid_exists(toastrel, valueid); + + table_close(toastrel, AccessShareLock); + + return result; +} diff --git a/src16/include/access/pg_tde_io.h b/src16/include/access/pg_tde_io.h index 7d36bd2c..4d0a64bc 100644 --- a/src16/include/access/pg_tde_io.h +++ b/src16/include/access/pg_tde_io.h @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * hio.h + * tdeheap_io.h * POSTGRES heap access method input/output definitions. * * @@ -11,8 +11,8 @@ * *------------------------------------------------------------------------- */ -#ifndef HIO_H -#define HIO_H +#ifndef PG_TDE_IO_H +#define PG_TDE_IO_H #include "access/htup.h" #include "storage/buf.h" @@ -52,11 +52,11 @@ typedef struct BulkInsertStateData extern void tdeheap_RelationPutHeapTuple(Relation relation, Buffer buffer, - HeapTuple tuple, bool token); + HeapTuple tuple, bool encrypt, bool token); extern Buffer tdeheap_RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, BulkInsertStateData *bistate, Buffer *vmbuffer, Buffer *vmbuffer_other, int num_pages); -#endif /* HIO_H */ +#endif /* PG_TDE_IO_H */ diff --git a/src16/include/access/pg_tde_rewrite.h b/src16/include/access/pg_tde_rewrite.h index b1c7cf83..5285f39c 100644 --- a/src16/include/access/pg_tde_rewrite.h +++ b/src16/include/access/pg_tde_rewrite.h @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * rewriteheap.h + * tdeheap_rewrite.h * Declarations for heap rewrite support functions * * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group @@ -10,8 +10,8 @@ * *------------------------------------------------------------------------- */ -#ifndef REWRITE_HEAP_H -#define REWRITE_HEAP_H +#ifndef PG_TDE_REWRITE_H +#define PG_TDE_REWRITE_H #include "access/htup.h" #include "storage/itemptr.h" @@ -54,4 +54,4 @@ typedef struct LogicalRewriteMappingData #define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x" extern void CheckPointLogicalRewriteHeap(void); -#endif /* REWRITE_HEAP_H */ +#endif /* PG_TDE_REWRITE_H */ diff --git a/src16/include/access/pg_tde_visibilitymap.h b/src16/include/access/pg_tde_visibilitymap.h index 8c38ea11..0b8213f0 100644 --- a/src16/include/access/pg_tde_visibilitymap.h +++ b/src16/include/access/pg_tde_visibilitymap.h @@ -1,18 +1,18 @@ /*------------------------------------------------------------------------- * - * visibilitymap.h + * tdeheap_visibilitymap.h * visibility map interface * * * Portions Copyright (c) 2007-2023, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * src/include/access/visibilitymap.h + * src/include/access/pg_tde_visibilitymap.h * *------------------------------------------------------------------------- */ -#ifndef VISIBILITYMAP_H -#define 
VISIBILITYMAP_H +#ifndef PG_TDE_VISIBILITYMAP_H +#define PG_TDE_VISIBILITYMAP_H #include "access/visibilitymapdefs.h" #include "access/xlogdefs.h" @@ -39,4 +39,4 @@ extern void tdeheap_visibilitymap_count(Relation rel, BlockNumber *all_visible, extern BlockNumber tdeheap_visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks); -#endif /* VISIBILITYMAP_H */ +#endif /* PG_TDE_VISIBILITYMAP_H */ diff --git a/src16/include/access/pg_tdeam.h b/src16/include/access/pg_tdeam.h index 7f9f6138..b982c8ff 100644 --- a/src16/include/access/pg_tdeam.h +++ b/src16/include/access/pg_tdeam.h @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * heapam.h + * pg_tdeam.h * POSTGRES heap access method definitions. * * @@ -11,8 +11,8 @@ * *------------------------------------------------------------------------- */ -#ifndef HEAPAM_H -#define HEAPAM_H +#ifndef PG_TDEAM_H +#define PG_TDEAM_H #include "access/relation.h" /* for backward compatibility */ #include "access/relscan.h" @@ -31,10 +31,11 @@ /* "options" flag bits for tdeheap_insert */ -#define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM -#define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN -#define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL -#define HEAP_INSERT_SPECULATIVE 0x0010 +#define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM +#define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN +#define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL +#define HEAP_INSERT_SPECULATIVE 0x0010 +#define HEAP_INSERT_TDE_NO_ENCRYPT 0x2000 /* to specify rare cases when NO TDE enc */ typedef struct BulkInsertStateData *BulkInsertState; struct TupleTableSlot; @@ -290,7 +291,7 @@ extern int tdeheap_page_prune(Relation relation, Buffer buffer, TimestampTz old_snap_ts, int *nnewlpdead, OffsetNumber *off_loc); -extern void tdeheap_page_prune_execute(Buffer buffer, +extern void tdeheap_page_prune_execute(Relation rel, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused); @@ -329,4 +330,10 @@ extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, extern void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot); -#endif /* HEAPAM_H */ +/* Defined in pg_tdeam_handler.c */ +extern bool is_tdeheap_rel(Relation rel); + +const TableAmRoutine * +GetPGTdeamTableAmRoutine(void); + +#endif /* PG_TDEAM_H */ diff --git a/src16/include/access/pg_tdeam_xlog.h b/src16/include/access/pg_tdeam_xlog.h index 1a96ea9e..9f07212c 100644 --- a/src16/include/access/pg_tdeam_xlog.h +++ b/src16/include/access/pg_tdeam_xlog.h @@ -1,18 +1,18 @@ /*------------------------------------------------------------------------- * * pg_tdeam_xlog.h - * POSTGRES heap access XLOG definitions. + * POSTGRES pg_tde access XLOG definitions. * * * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * src/include/access/pg_tdeam_xlog.h + * src/include/access/heapam_xlog.h * *------------------------------------------------------------------------- */ -#ifndef HEAPAM_XLOG_H -#define HEAPAM_XLOG_H +#ifndef PG_TDEAM_XLOG_H +#define PG_TDEAM_XLOG_H #include "access/htup.h" #include "access/xlogreader.h" @@ -24,7 +24,7 @@ /* - * WAL record definitions for heapam.c's WAL operations + * WAL record definitions for pg_tdeam.c's WAL operations * * XLOG allows to store some information in high 4 bits of log * record xl_info field. 
We use 3 for opcode and one for init bit. @@ -45,7 +45,7 @@ */ #define XLOG_HEAP_INIT_PAGE 0x80 /* - * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes + * We ran out of opcodes, so pg_tdeam.c now has a second RmgrId. These opcodes * are associated with RM_HEAP2_ID, but are not logically different from * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to * these, too. @@ -418,4 +418,4 @@ extern XLogRecPtr log_tdeheap_visible(Relation rel, Buffer tdeheap_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags); -#endif /* HEAPAM_XLOG_H */ +#endif /* PG_TDEAM_XLOG_H */ diff --git a/src16/include/access/pg_tdetoast.h b/src16/include/access/pg_tdetoast.h index af79b756..c17a7816 100644 --- a/src16/include/access/pg_tdetoast.h +++ b/src16/include/access/pg_tdetoast.h @@ -10,8 +10,8 @@ * *------------------------------------------------------------------------- */ -#ifndef HEAPTOAST_H -#define HEAPTOAST_H +#ifndef PG_TDE_TOAST_H +#define PG_TDE_TOAST_H #include "access/htup_details.h" #include "storage/lockdefs.h" @@ -146,4 +146,4 @@ extern void tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, int32 sliceoffset, int32 slicelength, struct varlena *result); -#endif /* HEAPTOAST_H */ +#endif /* PG_TDE_TOAST_H */ From f9ed3ce2ca338d2c65041dd5a4458422d1a58842 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Sun, 4 Aug 2024 20:25:11 +0100 Subject: [PATCH 4/6] Removed heap files from the non versioned src directory --- src/access/pg_tde_io.c | 895 -- src/access/pg_tde_prune.c | 1615 --- src/access/pg_tde_rewrite.c | 1291 --- src/access/pg_tde_vacuumlazy.c | 3476 ------- src/access/pg_tde_visibilitymap.c | 650 -- src/access/pg_tdeam.c | 10311 -------------------- src/access/pg_tdeam_handler.c | 2663 ----- src/access/pg_tdeam_visibility.c | 1793 ---- src/access/pg_tdetoast.c | 1262 --- src/include/access/pg_tde_io.h | 62 - src/include/access/pg_tde_rewrite.h | 57 - src/include/access/pg_tde_visibilitymap.h | 42 - src/include/access/pg_tdeam.h | 339 - src/include/access/pg_tdeam_xlog.h | 421 - src/include/access/pg_tdetoast.h | 149 - 15 files changed, 25026 deletions(-) delete mode 100644 src/access/pg_tde_io.c delete mode 100644 src/access/pg_tde_prune.c delete mode 100644 src/access/pg_tde_rewrite.c delete mode 100644 src/access/pg_tde_vacuumlazy.c delete mode 100644 src/access/pg_tde_visibilitymap.c delete mode 100644 src/access/pg_tdeam.c delete mode 100644 src/access/pg_tdeam_handler.c delete mode 100644 src/access/pg_tdeam_visibility.c delete mode 100644 src/access/pg_tdetoast.c delete mode 100644 src/include/access/pg_tde_io.h delete mode 100644 src/include/access/pg_tde_rewrite.h delete mode 100644 src/include/access/pg_tde_visibilitymap.h delete mode 100644 src/include/access/pg_tdeam.h delete mode 100644 src/include/access/pg_tdeam_xlog.h delete mode 100644 src/include/access/pg_tdetoast.h diff --git a/src/access/pg_tde_io.c b/src/access/pg_tde_io.c deleted file mode 100644 index 2ad4d366..00000000 --- a/src/access/pg_tde_io.c +++ /dev/null @@ -1,895 +0,0 @@ -/*------------------------------------------------------------------------- - * - * hio.c - * POSTGRES heap access method input/output code. 
- * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/access/heap/hio.c - * - *------------------------------------------------------------------------- - */ - -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include "access/pg_tdeam.h" -#include "access/pg_tde_io.h" -#include "access/pg_tde_visibilitymap.h" -#include "encryption/enc_tde.h" - -#include "access/htup_details.h" -#include "storage/bufmgr.h" -#include "storage/freespace.h" -#include "storage/lmgr.h" -#include "storage/smgr.h" - - -/* - * tdeheap_RelationPutHeapTuple - place tuple at specified page - * - * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! Must PANIC on failure!!! - * - * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer. - */ -void -tdeheap_RelationPutHeapTuple(Relation relation, - Buffer buffer, - HeapTuple tuple, - bool encrypt, - bool token) -{ - Page pageHeader; - OffsetNumber offnum; - - /* - * A tuple that's being inserted speculatively should already have its - * token set. - */ - Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data)); - - /* - * Do not allow tuples with invalid combinations of hint bits to be placed - * on a page. This combination is detected as corruption by the - * contrib/amcheck logic, so if you disable this assertion, make - * corresponding changes there. - */ - Assert(!((tuple->t_data->t_infomask & HEAP_XMAX_COMMITTED) && - (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI))); - - /* Add the tuple to the page */ - pageHeader = BufferGetPage(buffer); - - if (encrypt) - offnum = TDE_PageAddItem(relation->rd_locator, tuple->t_tableOid, BufferGetBlockNumber(buffer), pageHeader, (Item) tuple->t_data, - tuple->t_len, InvalidOffsetNumber, false, true); - else - offnum = PageAddItem(pageHeader, (Item) tuple->t_data, - tuple->t_len, InvalidOffsetNumber, false, true); - - if (offnum == InvalidOffsetNumber) - elog(PANIC, "failed to add tuple to page"); - - /* Update tuple->t_self to the actual position where it was stored */ - ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum); - - /* - * Insert the correct position into CTID of the stored tuple, too (unless - * this is a speculative insertion, in which case the token is held in - * CTID field instead) - */ - if (!token) - { - ItemId itemId = PageGetItemId(pageHeader, offnum); - HeapTupleHeader item = (HeapTupleHeader) PageGetItem(pageHeader, itemId); - - item->t_ctid = tuple->t_self; - } -} - -/* - * Read in a buffer in mode, using bulk-insert strategy if bistate isn't NULL. - */ -static Buffer -ReadBufferBI(Relation relation, BlockNumber targetBlock, - ReadBufferMode mode, BulkInsertState bistate) -{ - Buffer buffer; - - /* If not bulk-insert, exactly like ReadBuffer */ - if (!bistate) - return ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock, - mode, NULL); - - /* If we have the desired block already pinned, re-pin and return it */ - if (bistate->current_buf != InvalidBuffer) - { - if (BufferGetBlockNumber(bistate->current_buf) == targetBlock) - { - /* - * Currently the LOCK variants are only used for extending - * relation, which should never reach this branch. - */ - Assert(mode != RBM_ZERO_AND_LOCK && - mode != RBM_ZERO_AND_CLEANUP_LOCK); - - IncrBufferRefCount(bistate->current_buf); - return bistate->current_buf; - } - /* ... 
else drop the old buffer */ - ReleaseBuffer(bistate->current_buf); - bistate->current_buf = InvalidBuffer; - } - - /* Perform a read using the buffer strategy */ - buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock, - mode, bistate->strategy); - - /* Save the selected block as target for future inserts */ - IncrBufferRefCount(buffer); - bistate->current_buf = buffer; - - return buffer; -} - -/* - * For each heap page which is all-visible, acquire a pin on the appropriate - * visibility map page, if we haven't already got one. - * - * To avoid complexity in the callers, either buffer1 or buffer2 may be - * InvalidBuffer if only one buffer is involved. For the same reason, block2 - * may be smaller than block1. - * - * Returns whether buffer locks were temporarily released. - */ -static bool -GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2, - BlockNumber block1, BlockNumber block2, - Buffer *vmbuffer1, Buffer *vmbuffer2) -{ - bool need_to_pin_buffer1; - bool need_to_pin_buffer2; - bool released_locks = false; - - /* - * Swap buffers around to handle case of a single block/buffer, and to - * handle if lock ordering rules require to lock block2 first. - */ - if (!BufferIsValid(buffer1) || - (BufferIsValid(buffer2) && block1 > block2)) - { - Buffer tmpbuf = buffer1; - Buffer *tmpvmbuf = vmbuffer1; - BlockNumber tmpblock = block1; - - buffer1 = buffer2; - vmbuffer1 = vmbuffer2; - block1 = block2; - - buffer2 = tmpbuf; - vmbuffer2 = tmpvmbuf; - block2 = tmpblock; - } - - Assert(BufferIsValid(buffer1)); - Assert(buffer2 == InvalidBuffer || block1 <= block2); - - while (1) - { - /* Figure out which pins we need but don't have. */ - need_to_pin_buffer1 = PageIsAllVisible(BufferGetPage(buffer1)) - && !tdeheap_visibilitymap_pin_ok(block1, *vmbuffer1); - need_to_pin_buffer2 = buffer2 != InvalidBuffer - && PageIsAllVisible(BufferGetPage(buffer2)) - && !tdeheap_visibilitymap_pin_ok(block2, *vmbuffer2); - if (!need_to_pin_buffer1 && !need_to_pin_buffer2) - break; - - /* We must unlock both buffers before doing any I/O. */ - released_locks = true; - LockBuffer(buffer1, BUFFER_LOCK_UNLOCK); - if (buffer2 != InvalidBuffer && buffer2 != buffer1) - LockBuffer(buffer2, BUFFER_LOCK_UNLOCK); - - /* Get pins. */ - if (need_to_pin_buffer1) - tdeheap_visibilitymap_pin(relation, block1, vmbuffer1); - if (need_to_pin_buffer2) - tdeheap_visibilitymap_pin(relation, block2, vmbuffer2); - - /* Relock buffers. */ - LockBuffer(buffer1, BUFFER_LOCK_EXCLUSIVE); - if (buffer2 != InvalidBuffer && buffer2 != buffer1) - LockBuffer(buffer2, BUFFER_LOCK_EXCLUSIVE); - - /* - * If there are two buffers involved and we pinned just one of them, - * it's possible that the second one became all-visible while we were - * busy pinning the first one. If it looks like that's a possible - * scenario, we'll need to make a second pass through this loop. - */ - if (buffer2 == InvalidBuffer || buffer1 == buffer2 - || (need_to_pin_buffer1 && need_to_pin_buffer2)) - break; - } - - return released_locks; -} - -/* - * Extend the relation. By multiple pages, if beneficial. - * - * If the caller needs multiple pages (num_pages > 1), we always try to extend - * by at least that much. - * - * If there is contention on the extension lock, we don't just extend "for - * ourselves", but we try to help others. We can do so by adding empty pages - * into the FSM. Typically there is no contention when we can't use the FSM. 
- * - * We do have to limit the number of pages to extend by to some value, as the - * buffers for all the extended pages need to, temporarily, be pinned. For now - * we define MAX_BUFFERS_TO_EXTEND_BY to be 64 buffers, it's hard to see - * benefits with higher numbers. This partially is because copyfrom.c's - * MAX_BUFFERED_TUPLES / MAX_BUFFERED_BYTES prevents larger multi_inserts. - * - * Returns a buffer for a newly extended block. If possible, the buffer is - * returned exclusively locked. *did_unlock is set to true if the lock had to - * be released, false otherwise. - * - * - * XXX: It would likely be beneficial for some workloads to extend more - * aggressively, e.g. using a heuristic based on the relation size. - */ -static Buffer -RelationAddBlocks(Relation relation, BulkInsertState bistate, - int num_pages, bool use_fsm, bool *did_unlock) -{ -#define MAX_BUFFERS_TO_EXTEND_BY 64 - Buffer victim_buffers[MAX_BUFFERS_TO_EXTEND_BY]; - BlockNumber first_block = InvalidBlockNumber; - BlockNumber last_block = InvalidBlockNumber; - uint32 extend_by_pages; - uint32 not_in_fsm_pages; - Buffer buffer; - Page page; - - /* - * Determine by how many pages to try to extend by. - */ - if (bistate == NULL && !use_fsm) - { - /* - * If we have neither bistate, nor can use the FSM, we can't bulk - * extend - there'd be no way to find the additional pages. - */ - extend_by_pages = 1; - } - else - { - uint32 waitcount; - - /* - * Try to extend at least by the number of pages the caller needs. We - * can remember the additional pages (either via FSM or bistate). - */ - extend_by_pages = num_pages; - - if (!RELATION_IS_LOCAL(relation)) - waitcount = RelationExtensionLockWaiterCount(relation); - else - waitcount = 0; - - /* - * Multiply the number of pages to extend by the number of waiters. Do - * this even if we're not using the FSM, as it still relieves - * contention, by deferring the next time this backend needs to - * extend. In that case the extended pages will be found via - * bistate->next_free. - */ - extend_by_pages += extend_by_pages * waitcount; - - /* --- - * If we previously extended using the same bistate, it's very likely - * we'll extend some more. Try to extend by as many pages as - * before. This can be important for performance for several reasons, - * including: - * - * - It prevents mdzeroextend() switching between extending the - * relation in different ways, which is inefficient for some - * filesystems. - * - * - Contention is often intermittent. Even if we currently don't see - * other waiters (see above), extending by larger amounts can - * prevent future contention. - * --- - */ - if (bistate) - extend_by_pages = Max(extend_by_pages, bistate->already_extended_by); - - /* - * Can't extend by more than MAX_BUFFERS_TO_EXTEND_BY, we need to pin - * them all concurrently. - */ - extend_by_pages = Min(extend_by_pages, MAX_BUFFERS_TO_EXTEND_BY); - } - - /* - * How many of the extended pages should be entered into the FSM? - * - * If we have a bistate, only enter pages that we don't need ourselves - * into the FSM. Otherwise every other backend will immediately try to - * use the pages this backend needs for itself, causing unnecessary - * contention. If we don't have a bistate, we can't avoid the FSM. - * - * Never enter the page returned into the FSM, we'll immediately use it. 
- */ - if (num_pages > 1 && bistate == NULL) - not_in_fsm_pages = 1; - else - not_in_fsm_pages = num_pages; - - /* prepare to put another buffer into the bistate */ - if (bistate && bistate->current_buf != InvalidBuffer) - { - ReleaseBuffer(bistate->current_buf); - bistate->current_buf = InvalidBuffer; - } - - /* - * Extend the relation. We ask for the first returned page to be locked, - * so that we are sure that nobody has inserted into the page - * concurrently. - * - * With the current MAX_BUFFERS_TO_EXTEND_BY there's no danger of - * [auto]vacuum trying to truncate later pages as REL_TRUNCATE_MINIMUM is - * way larger. - */ - first_block = ExtendBufferedRelBy(BMR_REL(relation), MAIN_FORKNUM, - bistate ? bistate->strategy : NULL, - EB_LOCK_FIRST, - extend_by_pages, - victim_buffers, - &extend_by_pages); - buffer = victim_buffers[0]; /* the buffer the function will return */ - last_block = first_block + (extend_by_pages - 1); - Assert(first_block == BufferGetBlockNumber(buffer)); - - /* - * Relation is now extended. Initialize the page. We do this here, before - * potentially releasing the lock on the page, because it allows us to - * double check that the page contents are empty (this should never - * happen, but if it does we don't want to risk wiping out valid data). - */ - page = BufferGetPage(buffer); - if (!PageIsNew(page)) - elog(ERROR, "page %u of relation \"%s\" should be empty but is not", - first_block, - RelationGetRelationName(relation)); - - PageInit(page, BufferGetPageSize(buffer), 0); - MarkBufferDirty(buffer); - - /* - * If we decided to put pages into the FSM, release the buffer lock (but - * not pin), we don't want to do IO while holding a buffer lock. This will - * necessitate a bit more extensive checking in our caller. - */ - if (use_fsm && not_in_fsm_pages < extend_by_pages) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - *did_unlock = true; - } - else - *did_unlock = false; - - /* - * Relation is now extended. Release pins on all buffers, except for the - * first (which we'll return). If we decided to put pages into the FSM, - * we can do that as part of the same loop. - */ - for (uint32 i = 1; i < extend_by_pages; i++) - { - BlockNumber curBlock = first_block + i; - - Assert(curBlock == BufferGetBlockNumber(victim_buffers[i])); - Assert(BlockNumberIsValid(curBlock)); - - ReleaseBuffer(victim_buffers[i]); - - if (use_fsm && i >= not_in_fsm_pages) - { - Size freespace = BufferGetPageSize(victim_buffers[i]) - - SizeOfPageHeaderData; - - RecordPageWithFreeSpace(relation, curBlock, freespace); - } - } - - if (use_fsm && not_in_fsm_pages < extend_by_pages) - { - BlockNumber first_fsm_block = first_block + not_in_fsm_pages; - - FreeSpaceMapVacuumRange(relation, first_fsm_block, last_block); - } - - if (bistate) - { - /* - * Remember the additional pages we extended by, so we later can use - * them without looking into the FSM. - */ - if (extend_by_pages > 1) - { - bistate->next_free = first_block + 1; - bistate->last_free = last_block; - } - else - { - bistate->next_free = InvalidBlockNumber; - bistate->last_free = InvalidBlockNumber; - } - - /* maintain bistate->current_buf */ - IncrBufferRefCount(buffer); - bistate->current_buf = buffer; - bistate->already_extended_by += extend_by_pages; - } - - return buffer; -#undef MAX_BUFFERS_TO_EXTEND_BY -} - -/* - * tdeheap_RelationGetBufferForTuple - * - * Returns pinned and exclusive-locked buffer of a page in given relation - * with free space >= given len. 
- * - * If num_pages is > 1, we will try to extend the relation by at least that - * many pages when we decide to extend the relation. This is more efficient - * for callers that know they will need multiple pages - * (e.g. tdeheap_multi_insert()). - * - * If otherBuffer is not InvalidBuffer, then it references a previously - * pinned buffer of another page in the same relation; on return, this - * buffer will also be exclusive-locked. (This case is used by tdeheap_update; - * the otherBuffer contains the tuple being updated.) - * - * The reason for passing otherBuffer is that if two backends are doing - * concurrent tdeheap_update operations, a deadlock could occur if they try - * to lock the same two buffers in opposite orders. To ensure that this - * can't happen, we impose the rule that buffers of a relation must be - * locked in increasing page number order. This is most conveniently done - * by having tdeheap_RelationGetBufferForTuple lock them both, with suitable care - * for ordering. - * - * NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the - * same buffer we select for insertion of the new tuple (this could only - * happen if space is freed in that page after tdeheap_update finds there's not - * enough there). In that case, the page will be pinned and locked only once. - * - * We also handle the possibility that the all-visible flag will need to be - * cleared on one or both pages. If so, pin on the associated visibility map - * page must be acquired before acquiring buffer lock(s), to avoid possibly - * doing I/O while holding buffer locks. The pins are passed back to the - * caller using the input-output arguments vmbuffer and vmbuffer_other. - * Note that in some cases the caller might have already acquired such pins, - * which is indicated by these arguments not being InvalidBuffer on entry. - * - * We normally use FSM to help us find free space. However, - * if HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to - * the end of the relation if the tuple won't fit on the current target page. - * This can save some cycles when we know the relation is new and doesn't - * contain useful amounts of free space. - * - * HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a - * relation, if the caller holds exclusive lock and is careful to invalidate - * relation's smgr_targblock before the first insertion --- that ensures that - * all insertions will occur into newly added pages and not be intermixed - * with tuples from other transactions. That way, a crash can't risk losing - * any committed data of other transactions. (See tdeheap_insert's comments - * for additional constraints needed for safe usage of this behavior.) - * - * The caller can also provide a BulkInsertState object to optimize many - * insertions into the same relation. This keeps a pin on the current - * insertion target page (to save pin/unpin cycles) and also passes a - * BULKWRITE buffer selection strategy object to the buffer manager. - * Passing NULL for bistate selects the default behavior. - * - * We don't fill existing pages further than the fillfactor, except for large - * tuples in nearly-empty pages. This is OK since this routine is not - * consulted when updating a tuple and keeping it on the same page, which is - * the scenario fillfactor is meant to reserve space for. - * - * ereport(ERROR) is allowed here, so this routine *must* be called - * before any (unlogged) changes are made in buffer pool. 
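The comment above (part of the file being removed here) spells out the deadlock-avoidance rule tdeheap_RelationGetBufferForTuple() follows: buffers of the same relation are always locked in increasing block-number order. A minimal sketch of that rule in isolation (buffer names are illustrative):

    /* lock two buffers of one relation in increasing block-number order */
    if (BufferGetBlockNumber(bufA) < BufferGetBlockNumber(bufB))
    {
        LockBuffer(bufA, BUFFER_LOCK_EXCLUSIVE);
        LockBuffer(bufB, BUFFER_LOCK_EXCLUSIVE);
    }
    else
    {
        LockBuffer(bufB, BUFFER_LOCK_EXCLUSIVE);
        LockBuffer(bufA, BUFFER_LOCK_EXCLUSIVE);
    }
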
- */ -Buffer -tdeheap_RelationGetBufferForTuple(Relation relation, Size len, - Buffer otherBuffer, int options, - BulkInsertState bistate, - Buffer *vmbuffer, Buffer *vmbuffer_other, - int num_pages) -{ - bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM); - Buffer buffer = InvalidBuffer; - Page page; - Size nearlyEmptyFreeSpace, - pageFreeSpace = 0, - saveFreeSpace = 0, - targetFreeSpace = 0; - BlockNumber targetBlock, - otherBlock; - bool unlockedTargetBuffer; - bool recheckVmPins; - - len = MAXALIGN(len); /* be conservative */ - - /* if the caller doesn't know by how many pages to extend, extend by 1 */ - if (num_pages <= 0) - num_pages = 1; - - /* Bulk insert is not supported for updates, only inserts. */ - Assert(otherBuffer == InvalidBuffer || !bistate); - - /* - * If we're gonna fail for oversize tuple, do it right away - */ - if (len > MaxHeapTupleSize) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("row is too big: size %zu, maximum size %zu", - len, MaxHeapTupleSize))); - - /* Compute desired extra freespace due to fillfactor option */ - saveFreeSpace = RelationGetTargetPageFreeSpace(relation, - HEAP_DEFAULT_FILLFACTOR); - - /* - * Since pages without tuples can still have line pointers, we consider - * pages "empty" when the unavailable space is slight. This threshold is - * somewhat arbitrary, but it should prevent most unnecessary relation - * extensions while inserting large tuples into low-fillfactor tables. - */ - nearlyEmptyFreeSpace = MaxHeapTupleSize - - (MaxHeapTuplesPerPage / 8 * sizeof(ItemIdData)); - if (len + saveFreeSpace > nearlyEmptyFreeSpace) - targetFreeSpace = Max(len, nearlyEmptyFreeSpace); - else - targetFreeSpace = len + saveFreeSpace; - - if (otherBuffer != InvalidBuffer) - otherBlock = BufferGetBlockNumber(otherBuffer); - else - otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */ - - /* - * We first try to put the tuple on the same page we last inserted a tuple - * on, as cached in the BulkInsertState or relcache entry. If that - * doesn't work, we ask the Free Space Map to locate a suitable page. - * Since the FSM's info might be out of date, we have to be prepared to - * loop around and retry multiple times. (To ensure this isn't an infinite - * loop, we must update the FSM with the correct amount of free space on - * each page that proves not to be suitable.) If the FSM has no record of - * a page with enough free space, we give up and extend the relation. - * - * When use_fsm is false, we either put the tuple onto the existing target - * page or extend the relation. - */ - if (bistate && bistate->current_buf != InvalidBuffer) - targetBlock = BufferGetBlockNumber(bistate->current_buf); - else - targetBlock = RelationGetTargetBlock(relation); - - if (targetBlock == InvalidBlockNumber && use_fsm) - { - /* - * We have no cached target page, so ask the FSM for an initial - * target. - */ - targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace); - } - - /* - * If the FSM knows nothing of the rel, try the last page before we give - * up and extend. This avoids one-tuple-per-page syndrome during - * bootstrapping or in a recently-started system. 
- */ - if (targetBlock == InvalidBlockNumber) - { - BlockNumber nblocks = RelationGetNumberOfBlocks(relation); - - if (nblocks > 0) - targetBlock = nblocks - 1; - } - -loop: - while (targetBlock != InvalidBlockNumber) - { - /* - * Read and exclusive-lock the target block, as well as the other - * block if one was given, taking suitable care with lock ordering and - * the possibility they are the same block. - * - * If the page-level all-visible flag is set, caller will need to - * clear both that and the corresponding visibility map bit. However, - * by the time we return, we'll have x-locked the buffer, and we don't - * want to do any I/O while in that state. So we check the bit here - * before taking the lock, and pin the page if it appears necessary. - * Checking without the lock creates a risk of getting the wrong - * answer, so we'll have to recheck after acquiring the lock. - */ - if (otherBuffer == InvalidBuffer) - { - /* easy case */ - buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate); - if (PageIsAllVisible(BufferGetPage(buffer))) - tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); - - /* - * If the page is empty, pin vmbuffer to set all_frozen bit later. - */ - if ((options & HEAP_INSERT_FROZEN) && - (PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0)) - tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); - - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - } - else if (otherBlock == targetBlock) - { - /* also easy case */ - buffer = otherBuffer; - if (PageIsAllVisible(BufferGetPage(buffer))) - tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - } - else if (otherBlock < targetBlock) - { - /* lock other buffer first */ - buffer = ReadBuffer(relation, targetBlock); - if (PageIsAllVisible(BufferGetPage(buffer))) - tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); - LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - } - else - { - /* lock target buffer first */ - buffer = ReadBuffer(relation, targetBlock); - if (PageIsAllVisible(BufferGetPage(buffer))) - tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); - } - - /* - * We now have the target page (and the other buffer, if any) pinned - * and locked. However, since our initial PageIsAllVisible checks - * were performed before acquiring the lock, the results might now be - * out of date, either for the selected victim buffer, or for the - * other buffer passed by the caller. In that case, we'll need to - * give up our locks, go get the pin(s) we failed to get earlier, and - * re-lock. That's pretty painful, but hopefully shouldn't happen - * often. - * - * Note that there's a small possibility that we didn't pin the page - * above but still have the correct page pinned anyway, either because - * we've already made a previous pass through this loop, or because - * caller passed us the right page anyway. - * - * Note also that it's possible that by the time we get the pin and - * retake the buffer locks, the visibility map bit will have been - * cleared by some other backend anyway. In that case, we'll have - * done a bit of extra work for no gain, but there's no real harm - * done. - */ - GetVisibilityMapPins(relation, buffer, otherBuffer, - targetBlock, otherBlock, vmbuffer, - vmbuffer_other); - - /* - * Now we can check to see if there's enough free space here. If so, - * we're done. 
- */ - page = BufferGetPage(buffer); - - /* - * If necessary initialize page, it'll be used soon. We could avoid - * dirtying the buffer here, and rely on the caller to do so whenever - * it puts a tuple onto the page, but there seems not much benefit in - * doing so. - */ - if (PageIsNew(page)) - { - PageInit(page, BufferGetPageSize(buffer), 0); - MarkBufferDirty(buffer); - } - - pageFreeSpace = PageGetHeapFreeSpace(page); - if (targetFreeSpace <= pageFreeSpace) - { - /* use this page as future insert target, too */ - RelationSetTargetBlock(relation, targetBlock); - return buffer; - } - - /* - * Not enough space, so we must give up our page locks and pin (if - * any) and prepare to look elsewhere. We don't care which order we - * unlock the two buffers in, so this can be slightly simpler than the - * code above. - */ - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - if (otherBuffer == InvalidBuffer) - ReleaseBuffer(buffer); - else if (otherBlock != targetBlock) - { - LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); - } - - /* Is there an ongoing bulk extension? */ - if (bistate && bistate->next_free != InvalidBlockNumber) - { - Assert(bistate->next_free <= bistate->last_free); - - /* - * We bulk extended the relation before, and there are still some - * unused pages from that extension, so we don't need to look in - * the FSM for a new page. But do record the free space from the - * last page, somebody might insert narrower tuples later. - */ - if (use_fsm) - RecordPageWithFreeSpace(relation, targetBlock, pageFreeSpace); - - targetBlock = bistate->next_free; - if (bistate->next_free >= bistate->last_free) - { - bistate->next_free = InvalidBlockNumber; - bistate->last_free = InvalidBlockNumber; - } - else - bistate->next_free++; - } - else if (!use_fsm) - { - /* Without FSM, always fall out of the loop and extend */ - break; - } - else - { - /* - * Update FSM as to condition of this page, and ask for another - * page to try. - */ - targetBlock = RecordAndGetPageWithFreeSpace(relation, - targetBlock, - pageFreeSpace, - targetFreeSpace); - } - } - - /* Have to extend the relation */ - buffer = RelationAddBlocks(relation, bistate, num_pages, use_fsm, - &unlockedTargetBuffer); - - targetBlock = BufferGetBlockNumber(buffer); - page = BufferGetPage(buffer); - - /* - * The page is empty, pin vmbuffer to set all_frozen bit. We don't want to - * do IO while the buffer is locked, so we unlock the page first if IO is - * needed (necessitating checks below). - */ - if (options & HEAP_INSERT_FROZEN) - { - Assert(PageGetMaxOffsetNumber(page) == 0); - - if (!tdeheap_visibilitymap_pin_ok(targetBlock, *vmbuffer)) - { - if (!unlockedTargetBuffer) - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - unlockedTargetBuffer = true; - tdeheap_visibilitymap_pin(relation, targetBlock, vmbuffer); - } - } - - /* - * Reacquire locks if necessary. - * - * If the target buffer was unlocked above, or is unlocked while - * reacquiring the lock on otherBuffer below, it's unlikely, but possible, - * that another backend used space on this page. We check for that below, - * and retry if necessary. - */ - recheckVmPins = false; - if (unlockedTargetBuffer) - { - /* released lock on target buffer above */ - if (otherBuffer != InvalidBuffer) - LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - recheckVmPins = true; - } - else if (otherBuffer != InvalidBuffer) - { - /* - * We did not release the target buffer, and otherBuffer is valid, - * need to lock the other buffer. 
It's guaranteed to be of a lower - * page number than the new page. To conform with the deadlock - * prevent rules, we ought to lock otherBuffer first, but that would - * give other backends a chance to put tuples on our page. To reduce - * the likelihood of that, attempt to lock the other buffer - * conditionally, that's very likely to work. - * - * Alternatively, we could acquire the lock on otherBuffer before - * extending the relation, but that'd require holding the lock while - * performing IO, which seems worse than an unlikely retry. - */ - Assert(otherBuffer != buffer); - Assert(targetBlock > otherBlock); - - if (unlikely(!ConditionalLockBuffer(otherBuffer))) - { - unlockedTargetBuffer = true; - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - } - recheckVmPins = true; - } - - /* - * If one of the buffers was unlocked (always the case if otherBuffer is - * valid), it's possible, although unlikely, that an all-visible flag - * became set. We can use GetVisibilityMapPins to deal with that. It's - * possible that GetVisibilityMapPins() might need to temporarily release - * buffer locks, in which case we'll need to check if there's still enough - * space on the page below. - */ - if (recheckVmPins) - { - if (GetVisibilityMapPins(relation, otherBuffer, buffer, - otherBlock, targetBlock, vmbuffer_other, - vmbuffer)) - unlockedTargetBuffer = true; - } - - /* - * If the target buffer was temporarily unlocked since the relation - * extension, it's possible, although unlikely, that all the space on the - * page was already used. If so, we just retry from the start. If we - * didn't unlock, something has gone wrong if there's not enough space - - * the test at the top should have prevented reaching this case. - */ - pageFreeSpace = PageGetHeapFreeSpace(page); - if (len > pageFreeSpace) - { - if (unlockedTargetBuffer) - { - if (otherBuffer != InvalidBuffer) - LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); - UnlockReleaseBuffer(buffer); - - goto loop; - } - elog(PANIC, "tuple is too big: size %zu", len); - } - - /* - * Remember the new page as our target for future insertions. - * - * XXX should we enter the new page into the free space map immediately, - * or just keep it for this backend's exclusive use in the short run - * (until VACUUM sees it)? Seems to depend on whether you expect the - * current backend to make more insertions or not, which is probably a - * good bet most of the time. So for now, don't add it to FSM yet. 
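For reference, the block cached by RelationSetTargetBlock() below is what the top of this function reads back on the next insertion; a hedged sketch of that consumer side (mirroring upstream hio.c, shown only for orientation):

    /* Sketch: how a later call picks up the cached insertion target. */
    targetBlock = RelationGetTargetBlock(relation);
    if (targetBlock == InvalidBlockNumber && use_fsm)
    {
        /* no cached target page, so ask the FSM for an initial target */
        targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace);
    }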
- */ - RelationSetTargetBlock(relation, targetBlock); - - return buffer; -} diff --git a/src/access/pg_tde_prune.c b/src/access/pg_tde_prune.c deleted file mode 100644 index 552151c5..00000000 --- a/src/access/pg_tde_prune.c +++ /dev/null @@ -1,1615 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pruneheap.c - * heap page pruning and HOT-chain management code - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/access/heap/pruneheap.c - * - *------------------------------------------------------------------------- - */ -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include "encryption/enc_tde.h" - -#include "access/pg_tdeam.h" -#include "access/pg_tdeam_xlog.h" - -#include "access/htup_details.h" -#include "access/transam.h" -#include "access/xlog.h" -#include "access/xloginsert.h" -#include "catalog/catalog.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "storage/bufmgr.h" -#include "utils/snapmgr.h" -#include "utils/rel.h" -#include "utils/snapmgr.h" - -/* Working data for tdeheap_page_prune and subroutines */ -typedef struct -{ - Relation rel; - - /* tuple visibility test, initialized for the relation */ - GlobalVisState *vistest; - - /* - * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have - * been computed (done on demand, and only if - * OldSnapshotThresholdActive()). The first time a tuple is about to be - * removed based on the limited horizon, old_snap_used is set to true, and - * SetOldSnapshotThresholdTimestamp() is called. See - * tdeheap_prune_satisfies_vacuum(). - */ - TimestampTz old_snap_ts; - TransactionId old_snap_xmin; - bool old_snap_used; - - TransactionId new_prune_xid; /* new prune hint value for page */ - TransactionId snapshotConflictHorizon; /* latest xid removed */ - int nredirected; /* numbers of entries in arrays below */ - int ndead; - int nunused; - /* arrays that accumulate indexes of items to be changed */ - OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; - OffsetNumber nowdead[MaxHeapTuplesPerPage]; - OffsetNumber nowunused[MaxHeapTuplesPerPage]; - - /* - * marked[i] is true if item i is entered in one of the above arrays. - * - * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is - * 1. Otherwise every access would need to subtract 1. - */ - bool marked[MaxHeapTuplesPerPage + 1]; - - /* - * Tuple visibility is only computed once for each tuple, for correctness - * and efficiency reasons; see comment in tdeheap_page_prune() for details. - * This is of type int8[], instead of HTSV_Result[], so we can use -1 to - * indicate no visibility has been computed, e.g. for LP_DEAD items. - * - * Same indexing as ->marked. 
- */ - int8 htsv[MaxHeapTuplesPerPage + 1]; -} PruneState; - -/* Local functions */ -static HTSV_Result tdeheap_prune_satisfies_vacuum(PruneState *prstate, - HeapTuple tup, - Buffer buffer); -static int tdeheap_prune_chain(Buffer buffer, - OffsetNumber rootoffnum, - PruneState *prstate); -static void tdeheap_prune_record_prunable(PruneState *prstate, TransactionId xid); -static void tdeheap_prune_record_redirect(PruneState *prstate, - OffsetNumber offnum, OffsetNumber rdoffnum); -static void tdeheap_prune_record_dead(PruneState *prstate, OffsetNumber offnum); -static void tdeheap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); -static void page_verify_redirects(Page page); - - -/* - * Optionally prune and repair fragmentation in the specified page. - * - * This is an opportunistic function. It will perform housekeeping - * only if the page heuristically looks like a candidate for pruning and we - * can acquire buffer cleanup lock without blocking. - * - * Note: this is called quite often. It's important that it fall out quickly - * if there's not any use in pruning. - * - * Caller must have pin on the buffer, and must *not* have a lock on it. - */ -void -tdeheap_page_prune_opt(Relation relation, Buffer buffer) -{ - Page page = BufferGetPage(buffer); - TransactionId prune_xid; - GlobalVisState *vistest; - TransactionId limited_xmin = InvalidTransactionId; - TimestampTz limited_ts = 0; - Size minfree; - - /* - * We can't write WAL in recovery mode, so there's no point trying to - * clean the page. The primary will likely issue a cleaning WAL record - * soon anyway, so this is no particular loss. - */ - if (RecoveryInProgress()) - return; - - /* - * XXX: Magic to keep old_snapshot_threshold tests appear "working". They - * currently are broken, and discussion of what to do about them is - * ongoing. See - * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de - */ - if (old_snapshot_threshold == 0) - SnapshotTooOldMagicForTest(); - - /* - * First check whether there's any chance there's something to prune, - * determining the appropriate horizon is a waste if there's no prune_xid - * (i.e. no updates/deletes left potentially dead tuples around). - */ - prune_xid = ((PageHeader) page)->pd_prune_xid; - if (!TransactionIdIsValid(prune_xid)) - return; - - /* - * Check whether prune_xid indicates that there may be dead rows that can - * be cleaned up. - * - * It is OK to check the old snapshot limit before acquiring the cleanup - * lock because the worst that can happen is that we are not quite as - * aggressive about the cleanup (by however many transaction IDs are - * consumed between this point and acquiring the lock). This allows us to - * save significant overhead in the case where the page is found not to be - * prunable. - * - * Even if old_snapshot_threshold is set, we first check whether the page - * can be pruned without. Both because - * TransactionIdLimitedForOldSnapshots() is not cheap, and because not - * unnecessarily relying on old_snapshot_threshold avoids causing - * conflicts. 
- */ - vistest = GlobalVisTestFor(relation); - - if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) - { - if (!OldSnapshotThresholdActive()) - return; - - if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest), - relation, - &limited_xmin, &limited_ts)) - return; - - if (!TransactionIdPrecedes(prune_xid, limited_xmin)) - return; - } - - /* - * We prune when a previous UPDATE failed to find enough space on the page - * for a new tuple version, or when free space falls below the relation's - * fill-factor target (but not less than 10%). - * - * Checking free space here is questionable since we aren't holding any - * lock on the buffer; in the worst case we could get a bogus answer. It's - * unlikely to be *seriously* wrong, though, since reading either pd_lower - * or pd_upper is probably atomic. Avoiding taking a lock seems more - * important than sometimes getting a wrong answer in what is after all - * just a heuristic estimate. - */ - minfree = RelationGetTargetPageFreeSpace(relation, - HEAP_DEFAULT_FILLFACTOR); - minfree = Max(minfree, BLCKSZ / 10); - - if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) - { - /* OK, try to get exclusive buffer lock */ - if (!ConditionalLockBufferForCleanup(buffer)) - return; - - /* - * Now that we have buffer lock, get accurate information about the - * page's free space, and recheck the heuristic about whether to - * prune. - */ - if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) - { - int ndeleted, - nnewlpdead; - - ndeleted = tdeheap_page_prune(relation, buffer, vistest, limited_xmin, - limited_ts, &nnewlpdead, NULL); - - /* - * Report the number of tuples reclaimed to pgstats. This is - * ndeleted minus the number of newly-LP_DEAD-set items. - * - * We derive the number of dead tuples like this to avoid totally - * forgetting about items that were set to LP_DEAD, since they - * still need to be cleaned up by VACUUM. We only want to count - * heap-only tuples that just became LP_UNUSED in our report, - * which don't. - * - * VACUUM doesn't have to compensate in the same way when it - * tracks ndeleted, since it will set the same LP_DEAD items to - * LP_UNUSED separately. - */ - if (ndeleted > nnewlpdead) - pgstat_update_heap_dead_tuples(relation, - ndeleted - nnewlpdead); - } - - /* And release buffer lock */ - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - /* - * We avoid reuse of any free space created on the page by unrelated - * UPDATEs/INSERTs by opting to not update the FSM at this point. The - * free space should be reused by UPDATEs to *this* page. - */ - } -} - - -/* - * Prune and repair fragmentation in the specified page. - * - * Caller must have pin and buffer cleanup lock on the page. Note that we - * don't update the FSM information for page on caller's behalf. Caller might - * also need to account for a reduction in the length of the line pointer - * array following array truncation by us. - * - * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD - * (see tdeheap_prune_satisfies_vacuum and - * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to - * either have been set by TransactionIdLimitedForOldSnapshots, or - * InvalidTransactionId/0 respectively. - * - * Sets *nnewlpdead for caller, indicating the number of items that were - * newly set LP_DEAD during prune operation. - * - * off_loc is the offset location required by the caller to use in error - * callback. - * - * Returns the number of tuples deleted from the page during this call. 
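A short usage sketch of this function, mirroring the call made from tdeheap_page_prune_opt() above; the wrapper name is hypothetical and the caller is assumed to already hold a buffer cleanup lock:

    static int
    prune_page_once(Relation rel, Buffer buf, GlobalVisState *vistest)
    {
        int     nnewlpdead = 0;
        int     ndeleted;

        /* no limited horizon: pass InvalidTransactionId / 0, as the opt path does */
        ndeleted = tdeheap_page_prune(rel, buf, vistest,
                                      InvalidTransactionId, 0,
                                      &nnewlpdead, NULL);

        /* tuples fully reclaimed, excluding items that merely became LP_DEAD */
        return ndeleted - nnewlpdead;
    }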
- */ -int -tdeheap_page_prune(Relation relation, Buffer buffer, - GlobalVisState *vistest, - TransactionId old_snap_xmin, - TimestampTz old_snap_ts, - int *nnewlpdead, - OffsetNumber *off_loc) -{ - int ndeleted = 0; - Page page = BufferGetPage(buffer); - BlockNumber blockno = BufferGetBlockNumber(buffer); - OffsetNumber offnum, - maxoff; - PruneState prstate; - HeapTupleData tup; - - /* - * Our strategy is to scan the page and make lists of items to change, - * then apply the changes within a critical section. This keeps as much - * logic as possible out of the critical section, and also ensures that - * WAL replay will work the same as the normal case. - * - * First, initialize the new pd_prune_xid value to zero (indicating no - * prunable tuples). If we find any tuples which may soon become - * prunable, we will save the lowest relevant XID in new_prune_xid. Also - * initialize the rest of our working state. - */ - prstate.new_prune_xid = InvalidTransactionId; - prstate.rel = relation; - prstate.vistest = vistest; - prstate.old_snap_xmin = old_snap_xmin; - prstate.old_snap_ts = old_snap_ts; - prstate.old_snap_used = false; - prstate.snapshotConflictHorizon = InvalidTransactionId; - prstate.nredirected = prstate.ndead = prstate.nunused = 0; - memset(prstate.marked, 0, sizeof(prstate.marked)); - - maxoff = PageGetMaxOffsetNumber(page); - tup.t_tableOid = RelationGetRelid(prstate.rel); - - /* - * Determine HTSV for all tuples. - * - * This is required for correctness to deal with cases where running HTSV - * twice could result in different results (e.g. RECENTLY_DEAD can turn to - * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid() - * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the - * inserting transaction aborts, ...). That in turn could cause - * tdeheap_prune_chain() to behave incorrectly if a tuple is reached twice, - * once directly via a tdeheap_prune_chain() and once following a HOT chain. - * - * It's also good for performance. Most commonly tuples within a page are - * stored at decreasing offsets (while the items are stored at increasing - * offsets). When processing all tuples on a page this leads to reading - * memory at decreasing offsets within a page, with a variable stride. - * That's hard for CPU prefetchers to deal with. Processing the items in - * reverse order (and thus the tuples in increasing order) increases - * prefetching efficiency significantly / decreases the number of cache - * misses. - */ - for (offnum = maxoff; - offnum >= FirstOffsetNumber; - offnum = OffsetNumberPrev(offnum)) - { - ItemId itemid = PageGetItemId(page, offnum); - HeapTupleHeader htup; - - /* Nothing to do if slot doesn't contain a tuple */ - if (!ItemIdIsNormal(itemid)) - { - prstate.htsv[offnum] = -1; - continue; - } - - htup = (HeapTupleHeader) PageGetItem(page, itemid); - tup.t_data = htup; - tup.t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tup.t_self), blockno, offnum); - - /* - * Set the offset number so that we can display it along with any - * error that occurred while processing this tuple. 
- */
-		if (off_loc)
-			*off_loc = offnum;
-
-		prstate.htsv[offnum] = tdeheap_prune_satisfies_vacuum(&prstate, &tup,
-															   buffer);
-	}
-
-	/* Scan the page */
-	for (offnum = FirstOffsetNumber;
-		 offnum <= maxoff;
-		 offnum = OffsetNumberNext(offnum))
-	{
-		ItemId		itemid;
-
-		/* Ignore items already processed as part of an earlier chain */
-		if (prstate.marked[offnum])
-			continue;
-
-		/* see preceding loop */
-		if (off_loc)
-			*off_loc = offnum;
-
-		/* Nothing to do if slot is empty or already dead */
-		itemid = PageGetItemId(page, offnum);
-		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
-			continue;
-
-		/* Process this item or chain of items */
-		ndeleted += tdeheap_prune_chain(buffer, offnum, &prstate);
-	}
-
-	/* Clear the offset information once we have processed the given page. */
-	if (off_loc)
-		*off_loc = InvalidOffsetNumber;
-
-	/*
-	 * Make sure the relation key is in the cache to avoid pallocs in
-	 * the critical section.
-	 * We need it here as there is `pgtde_compactify_tuples()` down in
-	 * the call stack which re-encrypts tuples.
-	 */
-	GetRelationKey(relation->rd_locator);
-
-	/* Any error while applying the changes is critical */
-	START_CRIT_SECTION();
-
-	/* Have we found any prunable items? */
-	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
-	{
-		/*
-		 * Apply the planned item changes, then repair page fragmentation, and
-		 * update the page's hint bit about whether it has free line pointers.
-		 */
-		tdeheap_page_prune_execute(prstate.rel, buffer,
-								prstate.redirected, prstate.nredirected,
-								prstate.nowdead, prstate.ndead,
-								prstate.nowunused, prstate.nunused);
-
-		/*
-		 * Update the page's pd_prune_xid field to either zero, or the lowest
-		 * XID of any soon-prunable tuple.
-		 */
-		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
-
-		/*
-		 * Also clear the "page is full" flag, since there's no point in
-		 * repeating the prune/defrag process until something else happens to
-		 * the page.
-		 */
-		PageClearFull(page);
-
-		MarkBufferDirty(buffer);
-
-		/*
-		 * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
-		 */
-		if (RelationNeedsWAL(relation))
-		{
-			xl_tdeheap_prune xlrec;
-			XLogRecPtr	recptr;
-
-			xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation);
-			xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon;
-			xlrec.nredirected = prstate.nredirected;
-			xlrec.ndead = prstate.ndead;
-
-			XLogBeginInsert();
-			XLogRegisterData((char *) &xlrec, SizeOfHeapPrune);
-
-			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-
-			/*
-			 * The OffsetNumber arrays are not actually in the buffer, but we
-			 * pretend that they are.  When XLogInsert stores the whole
-			 * buffer, the offset arrays need not be stored too.
-			 */
-			if (prstate.nredirected > 0)
-				XLogRegisterBufData(0, (char *) prstate.redirected,
-									prstate.nredirected *
-									sizeof(OffsetNumber) * 2);
-
-			if (prstate.ndead > 0)
-				XLogRegisterBufData(0, (char *) prstate.nowdead,
-									prstate.ndead * sizeof(OffsetNumber));
-
-			if (prstate.nunused > 0)
-				XLogRegisterBufData(0, (char *) prstate.nowunused,
-									prstate.nunused * sizeof(OffsetNumber));
-
-			recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE);
-
-			PageSetLSN(BufferGetPage(buffer), recptr);
-		}
-	}
-	else
-	{
-		/*
-		 * If we didn't prune anything, but have found a new value for the
-		 * pd_prune_xid field, update it and mark the buffer dirty. This is
-		 * treated as a non-WAL-logged hint.
- * - * Also clear the "page is full" flag if it is set, since there's no - * point in repeating the prune/defrag process until something else - * happens to the page. - */ - if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || - PageIsFull(page)) - { - ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; - PageClearFull(page); - MarkBufferDirtyHint(buffer, true); - } - } - - END_CRIT_SECTION(); - - /* Record number of newly-set-LP_DEAD items for caller */ - *nnewlpdead = prstate.ndead; - - return ndeleted; -} - - -/* - * Perform visibility checks for heap pruning. - * - * This is more complicated than just using GlobalVisTestIsRemovableXid() - * because of old_snapshot_threshold. We only want to increase the threshold - * that triggers errors for old snapshots when we actually decide to remove a - * row based on the limited horizon. - * - * Due to its cost we also only want to call - * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have - * done so in tdeheap_page_prune_opt() if pd_prune_xid was old enough. But we - * still want to be able to remove rows that are too new to be removed - * according to prstate->vistest, but that can be removed based on - * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on - * demand in here, if appropriate. - */ -static HTSV_Result -tdeheap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) -{ - HTSV_Result res; - TransactionId dead_after; - - res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after); - - if (res != HEAPTUPLE_RECENTLY_DEAD) - return res; - - /* - * If we are already relying on the limited xmin, there is no need to - * delay doing so anymore. - */ - if (prstate->old_snap_used) - { - Assert(TransactionIdIsValid(prstate->old_snap_xmin)); - - if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) - res = HEAPTUPLE_DEAD; - return res; - } - - /* - * First check if GlobalVisTestIsRemovableXid() is sufficient to find the - * row dead. If not, and old_snapshot_threshold is enabled, try to use the - * lowered horizon. - */ - if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after)) - res = HEAPTUPLE_DEAD; - else if (OldSnapshotThresholdActive()) - { - /* haven't determined limited horizon yet, requests */ - if (!TransactionIdIsValid(prstate->old_snap_xmin)) - { - TransactionId horizon = - GlobalVisTestNonRemovableHorizon(prstate->vistest); - - TransactionIdLimitedForOldSnapshots(horizon, prstate->rel, - &prstate->old_snap_xmin, - &prstate->old_snap_ts); - } - - if (TransactionIdIsValid(prstate->old_snap_xmin) && - TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) - { - /* - * About to remove row based on snapshot_too_old. Need to raise - * the threshold so problematic accesses would error. - */ - Assert(!prstate->old_snap_used); - SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts, - prstate->old_snap_xmin); - prstate->old_snap_used = true; - res = HEAPTUPLE_DEAD; - } - } - - return res; -} - - -/* - * Prune specified line pointer or a HOT chain originating at line pointer. - * - * If the item is an index-referenced tuple (i.e. not a heap-only tuple), - * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT - * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. - * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really - * DEAD, our visibility test is just too coarse to detect it. 
- * - * In general, pruning must never leave behind a DEAD tuple that still has - * tuple storage. VACUUM isn't prepared to deal with that case. That's why - * VACUUM prunes the same heap page a second time (without dropping its lock - * in the interim) when it sees a newly DEAD tuple that we initially saw as - * in-progress. Retrying pruning like this can only happen when an inserting - * transaction concurrently aborts. - * - * The root line pointer is redirected to the tuple immediately after the - * latest DEAD tuple. If all tuples in the chain are DEAD, the root line - * pointer is marked LP_DEAD. (This includes the case of a DEAD simple - * tuple, which we treat as a chain of length 1.) - * - * We don't actually change the page here. We just add entries to the arrays in - * prstate showing the changes to be made. Items to be redirected are added - * to the redirected[] array (two entries per redirection); items to be set to - * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED - * state are added to nowunused[]. - * - * Returns the number of tuples (to be) deleted from the page. - */ -static int -tdeheap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) -{ - int ndeleted = 0; - Page dp = (Page) BufferGetPage(buffer); - TransactionId priorXmax = InvalidTransactionId; - ItemId rootlp; - HeapTupleHeader htup; - OffsetNumber latestdead = InvalidOffsetNumber, - maxoff = PageGetMaxOffsetNumber(dp), - offnum; - OffsetNumber chainitems[MaxHeapTuplesPerPage]; - int nchain = 0, - i; - - rootlp = PageGetItemId(dp, rootoffnum); - - /* - * If it's a heap-only tuple, then it is not the start of a HOT chain. - */ - if (ItemIdIsNormal(rootlp)) - { - Assert(prstate->htsv[rootoffnum] != -1); - htup = (HeapTupleHeader) PageGetItem(dp, rootlp); - - if (HeapTupleHeaderIsHeapOnly(htup)) - { - /* - * If the tuple is DEAD and doesn't chain to anything else, mark - * it unused immediately. (If it does chain, we can only remove - * it as part of pruning its chain.) - * - * We need this primarily to handle aborted HOT updates, that is, - * XMIN_INVALID heap-only tuples. Those might not be linked to by - * any chain, since the parent tuple might be re-updated before - * any pruning occurs. So we have to be able to reap them - * separately from chain-pruning. (Note that - * HeapTupleHeaderIsHotUpdated will never return true for an - * XMIN_INVALID tuple, so this code will work even when there were - * sequential updates within the aborted transaction.) - * - * Note that we might first arrive at a dead heap-only tuple - * either here or while following a chain below. Whichever path - * gets there first will mark the tuple unused. 
- */ - if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD && - !HeapTupleHeaderIsHotUpdated(htup)) - { - tdeheap_prune_record_unused(prstate, rootoffnum); - HeapTupleHeaderAdvanceConflictHorizon(htup, - &prstate->snapshotConflictHorizon); - ndeleted++; - } - - /* Nothing more to do */ - return ndeleted; - } - } - - /* Start from the root tuple */ - offnum = rootoffnum; - - /* while not end of the chain */ - for (;;) - { - ItemId lp; - bool tupdead, - recent_dead; - - /* Sanity check (pure paranoia) */ - if (offnum < FirstOffsetNumber) - break; - - /* - * An offset past the end of page's line pointer array is possible - * when the array was truncated (original item must have been unused) - */ - if (offnum > maxoff) - break; - - /* If item is already processed, stop --- it must not be same chain */ - if (prstate->marked[offnum]) - break; - - lp = PageGetItemId(dp, offnum); - - /* Unused item obviously isn't part of the chain */ - if (!ItemIdIsUsed(lp)) - break; - - /* - * If we are looking at the redirected root line pointer, jump to the - * first normal tuple in the chain. If we find a redirect somewhere - * else, stop --- it must not be same chain. - */ - if (ItemIdIsRedirected(lp)) - { - if (nchain > 0) - break; /* not at start of chain */ - chainitems[nchain++] = offnum; - offnum = ItemIdGetRedirect(rootlp); - continue; - } - - /* - * Likewise, a dead line pointer can't be part of the chain. (We - * already eliminated the case of dead root tuple outside this - * function.) - */ - if (ItemIdIsDead(lp)) - break; - - Assert(ItemIdIsNormal(lp)); - Assert(prstate->htsv[offnum] != -1); - htup = (HeapTupleHeader) PageGetItem(dp, lp); - - /* - * Check the tuple XMIN against prior XMAX, if any - */ - if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) - break; - - /* - * OK, this tuple is indeed a member of the chain. - */ - chainitems[nchain++] = offnum; - - /* - * Check tuple's visibility status. - */ - tupdead = recent_dead = false; - - switch ((HTSV_Result) prstate->htsv[offnum]) - { - case HEAPTUPLE_DEAD: - tupdead = true; - break; - - case HEAPTUPLE_RECENTLY_DEAD: - recent_dead = true; - - /* - * This tuple may soon become DEAD. Update the hint field so - * that the page is reconsidered for pruning in future. - */ - tdeheap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); - break; - - case HEAPTUPLE_DELETE_IN_PROGRESS: - - /* - * This tuple may soon become DEAD. Update the hint field so - * that the page is reconsidered for pruning in future. - */ - tdeheap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); - break; - - case HEAPTUPLE_LIVE: - case HEAPTUPLE_INSERT_IN_PROGRESS: - - /* - * If we wanted to optimize for aborts, we might consider - * marking the page prunable when we see INSERT_IN_PROGRESS. - * But we don't. See related decisions about when to mark the - * page prunable in heapam.c. - */ - break; - - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - - /* - * Remember the last DEAD tuple seen. We will advance past - * RECENTLY_DEAD tuples just in case there's a DEAD one after them; - * but we can't advance past anything else. We have to make sure that - * we don't miss any DEAD tuples, since DEAD tuples that still have - * tuple storage after pruning will confuse VACUUM. 
- */ - if (tupdead) - { - latestdead = offnum; - HeapTupleHeaderAdvanceConflictHorizon(htup, - &prstate->snapshotConflictHorizon); - } - else if (!recent_dead) - break; - - /* - * If the tuple is not HOT-updated, then we are at the end of this - * HOT-update chain. - */ - if (!HeapTupleHeaderIsHotUpdated(htup)) - break; - - /* HOT implies it can't have moved to different partition */ - Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); - - /* - * Advance to next chain member. - */ - Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == - BufferGetBlockNumber(buffer)); - offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); - } - - /* - * If we found a DEAD tuple in the chain, adjust the HOT chain so that all - * the DEAD tuples at the start of the chain are removed and the root line - * pointer is appropriately redirected. - */ - if (OffsetNumberIsValid(latestdead)) - { - /* - * Mark as unused each intermediate item that we are able to remove - * from the chain. - * - * When the previous item is the last dead tuple seen, we are at the - * right candidate for redirection. - */ - for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) - { - tdeheap_prune_record_unused(prstate, chainitems[i]); - ndeleted++; - } - - /* - * If the root entry had been a normal tuple, we are deleting it, so - * count it in the result. But changing a redirect (even to DEAD - * state) doesn't count. - */ - if (ItemIdIsNormal(rootlp)) - ndeleted++; - - /* - * If the DEAD tuple is at the end of the chain, the entire chain is - * dead and the root line pointer can be marked dead. Otherwise just - * redirect the root to the correct chain member. - */ - if (i >= nchain) - tdeheap_prune_record_dead(prstate, rootoffnum); - else - tdeheap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); - } - else if (nchain < 2 && ItemIdIsRedirected(rootlp)) - { - /* - * We found a redirect item that doesn't point to a valid follow-on - * item. This can happen if the loop in tdeheap_page_prune caused us to - * visit the dead successor of a redirect item before visiting the - * redirect item. We can clean up by setting the redirect item to - * DEAD state. - */ - tdeheap_prune_record_dead(prstate, rootoffnum); - } - - return ndeleted; -} - -/* Record lowest soon-prunable XID */ -static void -tdeheap_prune_record_prunable(PruneState *prstate, TransactionId xid) -{ - /* - * This should exactly match the PageSetPrunable macro. We can't store - * directly into the page header yet, so we update working state. 
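To make the "exactly match" claim concrete: applied directly to a writable page header, the same lowest-XID rule would look roughly like the following (a paraphrase of what the PageSetPrunable() macro does, not a quotation of bufpage.h; "page" stands for the heap page once it may be written):

    /* Sketch only: keep the lowest soon-prunable XID in the page header. */
    Assert(TransactionIdIsNormal(xid));
    if (!TransactionIdIsValid(((PageHeader) page)->pd_prune_xid) ||
        TransactionIdPrecedes(xid, ((PageHeader) page)->pd_prune_xid))
        ((PageHeader) page)->pd_prune_xid = xid;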
- */ - Assert(TransactionIdIsNormal(xid)); - if (!TransactionIdIsValid(prstate->new_prune_xid) || - TransactionIdPrecedes(xid, prstate->new_prune_xid)) - prstate->new_prune_xid = xid; -} - -/* Record line pointer to be redirected */ -static void -tdeheap_prune_record_redirect(PruneState *prstate, - OffsetNumber offnum, OffsetNumber rdoffnum) -{ - Assert(prstate->nredirected < MaxHeapTuplesPerPage); - prstate->redirected[prstate->nredirected * 2] = offnum; - prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; - prstate->nredirected++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; - Assert(!prstate->marked[rdoffnum]); - prstate->marked[rdoffnum] = true; -} - -/* Record line pointer to be marked dead */ -static void -tdeheap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) -{ - Assert(prstate->ndead < MaxHeapTuplesPerPage); - prstate->nowdead[prstate->ndead] = offnum; - prstate->ndead++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; -} - -/* Record line pointer to be marked unused */ -static void -tdeheap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) -{ - Assert(prstate->nunused < MaxHeapTuplesPerPage); - prstate->nowunused[prstate->nunused] = offnum; - prstate->nunused++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; -} - -void TdePageRepairFragmentation(Relation rel, Buffer buffer, Page page); - -/* - * Perform the actual page changes needed by tdeheap_page_prune. - * It is expected that the caller has a full cleanup lock on the - * buffer. - */ -void -tdeheap_page_prune_execute(Relation rel, Buffer buffer, - OffsetNumber *redirected, int nredirected, - OffsetNumber *nowdead, int ndead, - OffsetNumber *nowunused, int nunused) -{ - Page page = (Page) BufferGetPage(buffer); - OffsetNumber *offnum; - HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY; - - /* Shouldn't be called unless there's something to do */ - Assert(nredirected > 0 || ndead > 0 || nunused > 0); - - /* Update all redirected line pointers */ - offnum = redirected; - for (int i = 0; i < nredirected; i++) - { - OffsetNumber fromoff = *offnum++; - OffsetNumber tooff = *offnum++; - ItemId fromlp = PageGetItemId(page, fromoff); - ItemId tolp PG_USED_FOR_ASSERTS_ONLY; - -#ifdef USE_ASSERT_CHECKING - - /* - * Any existing item that we set as an LP_REDIRECT (any 'from' item) - * must be the first item from a HOT chain. If the item has tuple - * storage then it can't be a heap-only tuple. Otherwise we are just - * maintaining an existing LP_REDIRECT from an existing HOT chain that - * has been pruned at least once before now. - */ - if (!ItemIdIsRedirected(fromlp)) - { - Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp)); - - htup = (HeapTupleHeader) PageGetItem(page, fromlp); - Assert(!HeapTupleHeaderIsHeapOnly(htup)); - } - else - { - /* We shouldn't need to redundantly set the redirect */ - Assert(ItemIdGetRedirect(fromlp) != tooff); - } - - /* - * The item that we're about to set as an LP_REDIRECT (the 'from' - * item) will point to an existing item (the 'to' item) that is - * already a heap-only tuple. There can be at most one LP_REDIRECT - * item per HOT chain. - * - * We need to keep around an LP_REDIRECT item (after original - * non-heap-only root tuple gets pruned away) so that it's always - * possible for VACUUM to easily figure out what TID to delete from - * indexes when an entire HOT chain becomes dead. A heap-only tuple - * can never become LP_DEAD; an LP_REDIRECT item or a regular heap - * tuple can. 
- * - * This check may miss problems, e.g. the target of a redirect could - * be marked as unused subsequently. The page_verify_redirects() check - * below will catch such problems. - */ - tolp = PageGetItemId(page, tooff); - Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp)); - htup = (HeapTupleHeader) PageGetItem(page, tolp); - Assert(HeapTupleHeaderIsHeapOnly(htup)); -#endif - - ItemIdSetRedirect(fromlp, tooff); - } - - /* Update all now-dead line pointers */ - offnum = nowdead; - for (int i = 0; i < ndead; i++) - { - OffsetNumber off = *offnum++; - ItemId lp = PageGetItemId(page, off); - -#ifdef USE_ASSERT_CHECKING - - /* - * An LP_DEAD line pointer must be left behind when the original item - * (which is dead to everybody) could still be referenced by a TID in - * an index. This should never be necessary with any individual - * heap-only tuple item, though. (It's not clear how much of a problem - * that would be, but there is no reason to allow it.) - */ - if (ItemIdHasStorage(lp)) - { - Assert(ItemIdIsNormal(lp)); - htup = (HeapTupleHeader) PageGetItem(page, lp); - Assert(!HeapTupleHeaderIsHeapOnly(htup)); - } - else - { - /* Whole HOT chain becomes dead */ - Assert(ItemIdIsRedirected(lp)); - } -#endif - - ItemIdSetDead(lp); - } - - /* Update all now-unused line pointers */ - offnum = nowunused; - for (int i = 0; i < nunused; i++) - { - OffsetNumber off = *offnum++; - ItemId lp = PageGetItemId(page, off); - -#ifdef USE_ASSERT_CHECKING - - /* - * Only heap-only tuples can become LP_UNUSED during pruning. They - * don't need to be left in place as LP_DEAD items until VACUUM gets - * around to doing index vacuuming. - */ - Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp)); - htup = (HeapTupleHeader) PageGetItem(page, lp); - Assert(HeapTupleHeaderIsHeapOnly(htup)); -#endif - - ItemIdSetUnused(lp); - } - - /* - * Finally, repair any fragmentation, and update the page's hint bit about - * whether it has free pointers. - */ - TdePageRepairFragmentation(rel, buffer, page); - - /* - * Now that the page has been modified, assert that redirect items still - * point to valid targets. - */ - page_verify_redirects(page); -} - - -/* - * If built with assertions, verify that all LP_REDIRECT items point to a - * valid item. - * - * One way that bugs related to HOT pruning show is redirect items pointing to - * removed tuples. It's not trivial to reliably check that marking an item - * unused will not orphan a redirect item during tdeheap_prune_chain() / - * tdeheap_page_prune_execute(), so we additionally check the whole page after - * pruning. Without this check such bugs would typically only cause asserts - * later, potentially well after the corruption has been introduced. - * - * Also check comments in tdeheap_page_prune_execute()'s redirection loop. 
- */ -static void -page_verify_redirects(Page page) -{ -#ifdef USE_ASSERT_CHECKING - OffsetNumber offnum; - OffsetNumber maxoff; - - maxoff = PageGetMaxOffsetNumber(page); - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid = PageGetItemId(page, offnum); - OffsetNumber targoff; - ItemId targitem; - HeapTupleHeader htup; - - if (!ItemIdIsRedirected(itemid)) - continue; - - targoff = ItemIdGetRedirect(itemid); - targitem = PageGetItemId(page, targoff); - - Assert(ItemIdIsUsed(targitem)); - Assert(ItemIdIsNormal(targitem)); - Assert(ItemIdHasStorage(targitem)); - htup = (HeapTupleHeader) PageGetItem(page, targitem); - Assert(HeapTupleHeaderIsHeapOnly(htup)); - } -#endif -} - - -/* - * For all items in this page, find their respective root line pointers. - * If item k is part of a HOT-chain with root at item j, then we set - * root_offsets[k - 1] = j. - * - * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. - * Unused entries are filled with InvalidOffsetNumber (zero). - * - * The function must be called with at least share lock on the buffer, to - * prevent concurrent prune operations. - * - * Note: The information collected here is valid only as long as the caller - * holds a pin on the buffer. Once pin is released, a tuple might be pruned - * and reused by a completely unrelated tuple. - */ -void -tdeheap_get_root_tuples(Page page, OffsetNumber *root_offsets) -{ - OffsetNumber offnum, - maxoff; - - MemSet(root_offsets, InvalidOffsetNumber, - MaxHeapTuplesPerPage * sizeof(OffsetNumber)); - - maxoff = PageGetMaxOffsetNumber(page); - for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) - { - ItemId lp = PageGetItemId(page, offnum); - HeapTupleHeader htup; - OffsetNumber nextoffnum; - TransactionId priorXmax; - - /* skip unused and dead items */ - if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) - continue; - - if (ItemIdIsNormal(lp)) - { - htup = (HeapTupleHeader) PageGetItem(page, lp); - - /* - * Check if this tuple is part of a HOT-chain rooted at some other - * tuple. If so, skip it for now; we'll process it when we find - * its root. - */ - if (HeapTupleHeaderIsHeapOnly(htup)) - continue; - - /* - * This is either a plain tuple or the root of a HOT-chain. - * Remember it in the mapping. - */ - root_offsets[offnum - 1] = offnum; - - /* If it's not the start of a HOT-chain, we're done with it */ - if (!HeapTupleHeaderIsHotUpdated(htup)) - continue; - - /* Set up to scan the HOT-chain */ - nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); - } - else - { - /* Must be a redirect item. We do not set its root_offsets entry */ - Assert(ItemIdIsRedirected(lp)); - /* Set up to scan the HOT-chain */ - nextoffnum = ItemIdGetRedirect(lp); - priorXmax = InvalidTransactionId; - } - - /* - * Now follow the HOT-chain and collect other tuples in the chain. - * - * Note: Even though this is a nested loop, the complexity of the - * function is O(N) because a tuple in the page should be visited not - * more than twice, once in the outer loop and once in HOT-chain - * chases. 
- */ - for (;;) - { - /* Sanity check (pure paranoia) */ - if (offnum < FirstOffsetNumber) - break; - - /* - * An offset past the end of page's line pointer array is possible - * when the array was truncated - */ - if (offnum > maxoff) - break; - - lp = PageGetItemId(page, nextoffnum); - - /* Check for broken chains */ - if (!ItemIdIsNormal(lp)) - break; - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) - break; - - /* Remember the root line pointer for this item */ - root_offsets[nextoffnum - 1] = offnum; - - /* Advance to next chain member, if any */ - if (!HeapTupleHeaderIsHotUpdated(htup)) - break; - - /* HOT implies it can't have moved to different partition */ - Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); - - nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); - } - } -} - -// TODO: move to own file so it can be autoupdated -// FROM src/page/bufpage.c - -/* - * Tuple defrag support for PageRepairFragmentation and PageIndexMultiDelete - */ -typedef struct itemIdCompactData -{ - uint16 offsetindex; /* linp array index */ - int16 itemoff; /* page offset of item data */ - uint16 len; - uint16 alignedlen; /* MAXALIGN(item data len) */ -} itemIdCompactData; -typedef itemIdCompactData *itemIdCompact; - -/* - * After removing or marking some line pointers unused, move the tuples to - * remove the gaps caused by the removed items and reorder them back into - * reverse line pointer order in the page. - * - * This function can often be fairly hot, so it pays to take some measures to - * make it as optimal as possible. - * - * Callers may pass 'presorted' as true if the 'itemidbase' array is sorted in - * descending order of itemoff. When this is true we can just memmove() - * tuples towards the end of the page. This is quite a common case as it's - * the order that tuples are initially inserted into pages. When we call this - * function to defragment the tuples in the page then any new line pointers - * added to the page will keep that presorted order, so hitting this case is - * still very common for tables that are commonly updated. - * - * When the 'itemidbase' array is not presorted then we're unable to just - * memmove() tuples around freely. Doing so could cause us to overwrite the - * memory belonging to a tuple we've not moved yet. In this case, we copy all - * the tuples that need to be moved into a temporary buffer. We can then - * simply memcpy() out of that temp buffer back into the page at the correct - * location. Tuples are copied back into the page in the same order as the - * 'itemidbase' array, so we end up reordering the tuples back into reverse - * line pointer order. This will increase the chances of hitting the - * presorted case the next time around. - * - * Callers must ensure that nitems is > 0 - */ -static void // this is where it happens! -pgtde_compactify_tuples(Relation rel, Buffer buffer, itemIdCompact itemidbase, int nitems, Page page, bool presorted) -{ - PageHeader phdr = (PageHeader) page; - Offset upper; - Offset copy_tail; - Offset copy_head; - itemIdCompact itemidptr; - int i; - - /* Code within will not work correctly if nitems == 0 */ - Assert(nitems > 0); - - if (presorted) - { - -#ifdef USE_ASSERT_CHECKING - { - /* - * Verify we've not gotten any new callers that are incorrectly - * passing a true presorted value. 
- */ - Offset lastoff = phdr->pd_special; - - for (i = 0; i < nitems; i++) - { - itemidptr = &itemidbase[i]; - - Assert(lastoff > itemidptr->itemoff); - - lastoff = itemidptr->itemoff; - } - } -#endif /* USE_ASSERT_CHECKING */ - - /* - * 'itemidbase' is already in the optimal order, i.e, lower item - * pointers have a higher offset. This allows us to memmove() the - * tuples up to the end of the page without having to worry about - * overwriting other tuples that have not been moved yet. - * - * There's a good chance that there are tuples already right at the - * end of the page that we can simply skip over because they're - * already in the correct location within the page. We'll do that - * first... - */ - upper = phdr->pd_special; - i = 0; - do - { - itemidptr = &itemidbase[i]; - if (upper != itemidptr->itemoff + itemidptr->alignedlen) - break; - upper -= itemidptr->alignedlen; - - i++; - } while (i < nitems); - - /* - * Now that we've found the first tuple that needs to be moved, we can - * do the tuple compactification. We try and make the least number of - * memmove() calls and only call memmove() when there's a gap. When - * we see a gap we just move all tuples after the gap up until the - * point of the last move operation. - */ - copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen; - for (; i < nitems; i++) - { - ItemId lp; - - itemidptr = &itemidbase[i]; - - lp = PageGetItemId(page, itemidptr->offsetindex + 1); - - if (copy_head != itemidptr->itemoff + itemidptr->alignedlen && copy_head < copy_tail) - { - memmove((char *) page + upper, - page + copy_head, - copy_tail - copy_head); - - /* - * We've now moved all tuples already seen, but not the - * current tuple, so we set the copy_tail to the end of this - * tuple so it can be moved in another iteration of the loop. - */ - copy_tail = itemidptr->itemoff + itemidptr->alignedlen; - } - /* shift the target offset down by the length of this tuple */ - upper -= itemidptr->alignedlen; - /* point the copy_head to the start of this tuple */ - copy_head = itemidptr->itemoff; - - /* update the line pointer to reference the new offset */ - lp->lp_off = upper; - } - - /* move the remaining tuples. */ - memmove((char *) page + upper, - page + copy_head, - copy_tail - copy_head); - } - else - { - PGAlignedBlock scratch; - char *scratchptr = scratch.data; - - /* - * Non-presorted case: The tuples in the itemidbase array may be in - * any order. So, in order to move these to the end of the page we - * must make a temp copy of each tuple that needs to be moved before - * we copy them back into the page at the new offset. - * - * If a large percentage of tuples have been pruned (>75%) then we'll - * copy these into the temp buffer tuple-by-tuple, otherwise, we'll - * just do a single memcpy() for all tuples that need to be moved. - * When so many tuples have been removed there's likely to be a lot of - * gaps and it's unlikely that many non-movable tuples remain at the - * end of the page. - */ - if (nitems < PageGetMaxOffsetNumber(page) / 4) - { - i = 0; - do - { - itemidptr = &itemidbase[i]; - memcpy(scratchptr + itemidptr->itemoff, page + itemidptr->itemoff, - itemidptr->alignedlen); - i++; - } while (i < nitems); - - /* Set things up for the compactification code below */ - i = 0; - itemidptr = &itemidbase[0]; - upper = phdr->pd_special; - } - else - { - upper = phdr->pd_special; - - /* - * Many tuples are likely to already be in the correct location. - * There's no need to copy these into the temp buffer. 
Instead - * we'll just skip forward in the itemidbase array to the position - * that we do need to move tuples from so that the code below just - * leaves these ones alone. - */ - i = 0; - do - { - itemidptr = &itemidbase[i]; - if (upper != itemidptr->itemoff + itemidptr->alignedlen) - break; - upper -= itemidptr->alignedlen; - - i++; - } while (i < nitems); - - /* Copy all tuples that need to be moved into the temp buffer */ - memcpy(scratchptr + phdr->pd_upper, - page + phdr->pd_upper, - upper - phdr->pd_upper); - } - - /* - * Do the tuple compactification. itemidptr is already pointing to - * the first tuple that we're going to move. Here we collapse the - * memcpy calls for adjacent tuples into a single call. This is done - * by delaying the memcpy call until we find a gap that needs to be - * closed. - */ - copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen; - for (; i < nitems; i++) - { - ItemId lp; - - itemidptr = &itemidbase[i]; - - lp = PageGetItemId(page, itemidptr->offsetindex + 1); - - /* copy pending tuples when we detect a gap */ - if (copy_head != itemidptr->itemoff + itemidptr->alignedlen) - { - memcpy((char *) page + upper, - scratchptr + copy_head, - copy_tail - copy_head); - - /* - * We've now copied all tuples already seen, but not the - * current tuple, so we set the copy_tail to the end of this - * tuple. - */ - copy_tail = itemidptr->itemoff + itemidptr->alignedlen; - } - /* shift the target offset down by the length of this tuple */ - upper -= itemidptr->alignedlen; - /* point the copy_head to the start of this tuple */ - copy_head = itemidptr->itemoff; - - /* update the line pointer to reference the new offset */ - lp->lp_off = upper; - } - - /* Copy the remaining chunk */ - memcpy((char *) page + upper, - scratchptr + copy_head, - copy_tail - copy_head); - } - - phdr->pd_upper = upper; -} - -/* - * PageRepairFragmentation - * - * Frees fragmented space on a heap page following pruning. - * - * This routine is usable for heap pages only, but see PageIndexMultiDelete. - * - * This routine removes unused line pointers from the end of the line pointer - * array. This is possible when dead heap-only tuples get removed by pruning, - * especially when there were HOT chains with several tuples each beforehand. - * - * Caller had better have a full cleanup lock on page's buffer. As a side - * effect the page's PD_HAS_FREE_LINES hint bit will be set or unset as - * needed. Caller might also need to account for a reduction in the length of - * the line pointer array following array truncation. - */ -void -TdePageRepairFragmentation(Relation rel, Buffer buffer, Page page) -{ - Offset pd_lower = ((PageHeader) page)->pd_lower; - Offset pd_upper = ((PageHeader) page)->pd_upper; - Offset pd_special = ((PageHeader) page)->pd_special; - Offset last_offset; - itemIdCompactData itemidbase[MaxHeapTuplesPerPage]; - itemIdCompact itemidptr; - ItemId lp; - int nline, - nstorage, - nunused; - OffsetNumber finalusedlp = InvalidOffsetNumber; - int i; - Size totallen; - bool presorted = true; /* For now */ - - /* - * It's worth the trouble to be more paranoid here than in most places, - * because we are about to reshuffle data in (what is usually) a shared - * disk buffer. If we aren't careful then corrupted pointers, lengths, - * etc could cause us to clobber adjacent disk buffers, spreading the data - * loss further. So, check everything. 
- */ - if (pd_lower < SizeOfPageHeaderData || - pd_lower > pd_upper || - pd_upper > pd_special || - pd_special > BLCKSZ || - pd_special != MAXALIGN(pd_special)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", - pd_lower, pd_upper, pd_special))); - - /* - * Run through the line pointer array and collect data about live items. - */ - nline = PageGetMaxOffsetNumber(page); - itemidptr = itemidbase; - nunused = totallen = 0; - last_offset = pd_special; - for (i = FirstOffsetNumber; i <= nline; i++) - { - lp = PageGetItemId(page, i); - if (ItemIdIsUsed(lp)) - { - if (ItemIdHasStorage(lp)) - { - itemidptr->offsetindex = i - 1; - itemidptr->itemoff = ItemIdGetOffset(lp); - - if (last_offset > itemidptr->itemoff) - last_offset = itemidptr->itemoff; - else - presorted = false; - - if (unlikely(itemidptr->itemoff < (int) pd_upper || - itemidptr->itemoff >= (int) pd_special)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted line pointer: %u", - itemidptr->itemoff))); - itemidptr->len = ItemIdGetLength(lp); - itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); - totallen += itemidptr->alignedlen; - itemidptr++; - } - - finalusedlp = i; /* Could be the final non-LP_UNUSED item */ - } - else - { - /* Unused entries should have lp_len = 0, but make sure */ - Assert(!ItemIdHasStorage(lp)); - ItemIdSetUnused(lp); - nunused++; - } - } - - nstorage = itemidptr - itemidbase; - if (nstorage == 0) - { - /* Page is completely empty, so just reset it quickly */ - ((PageHeader) page)->pd_upper = pd_special; - } - else - { - /* Need to compact the page the hard way */ - if (totallen > (Size) (pd_special - pd_lower)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted item lengths: total %u, available space %u", - (unsigned int) totallen, pd_special - pd_lower))); - - pgtde_compactify_tuples(rel, buffer, itemidbase, nstorage, page, presorted); - } - - if (finalusedlp != nline) - { - /* The last line pointer is not the last used line pointer */ - int nunusedend = nline - finalusedlp; - - Assert(nunused >= nunusedend && nunusedend > 0); - - /* remove trailing unused line pointers from the count */ - nunused -= nunusedend; - /* truncate the line pointer array */ - ((PageHeader) page)->pd_lower -= (sizeof(ItemIdData) * nunusedend); - } - - /* Set hint bit for PageAddItemExtended */ - if (nunused > 0) - PageSetHasFreeLinePointers(page); - else - PageClearHasFreeLinePointers(page); -} diff --git a/src/access/pg_tde_rewrite.c b/src/access/pg_tde_rewrite.c deleted file mode 100644 index 964082a0..00000000 --- a/src/access/pg_tde_rewrite.c +++ /dev/null @@ -1,1291 +0,0 @@ -/*------------------------------------------------------------------------- - * - * rewriteheap.c - * Support functions to rewrite tables. - * - * These functions provide a facility to completely rewrite a heap, while - * preserving visibility information and update chains. - * - * INTERFACE - * - * The caller is responsible for creating the new heap, all catalog - * changes, supplying the tuples to be written to the new heap, and - * rebuilding indexes. The caller must hold AccessExclusiveLock on the - * target table, because we assume no one else is writing into it. 
- * - * To use the facility: - * - * begin_tdeheap_rewrite - * while (fetch next tuple) - * { - * if (tuple is dead) - * rewrite_tdeheap_dead_tuple - * else - * { - * // do any transformations here if required - * rewrite_tdeheap_tuple - * } - * } - * end_tdeheap_rewrite - * - * The contents of the new relation shouldn't be relied on until after - * end_tdeheap_rewrite is called. - * - * - * IMPLEMENTATION - * - * This would be a fairly trivial affair, except that we need to maintain - * the ctid chains that link versions of an updated tuple together. - * Since the newly stored tuples will have tids different from the original - * ones, if we just copied t_ctid fields to the new table the links would - * be wrong. When we are required to copy a (presumably recently-dead or - * delete-in-progress) tuple whose ctid doesn't point to itself, we have - * to substitute the correct ctid instead. - * - * For each ctid reference from A -> B, we might encounter either A first - * or B first. (Note that a tuple in the middle of a chain is both A and B - * of different pairs.) - * - * If we encounter A first, we'll store the tuple in the unresolved_tups - * hash table. When we later encounter B, we remove A from the hash table, - * fix the ctid to point to the new location of B, and insert both A and B - * to the new heap. - * - * If we encounter B first, we can insert B to the new heap right away. - * We then add an entry to the old_new_tid_map hash table showing B's - * original tid (in the old heap) and new tid (in the new heap). - * When we later encounter A, we get the new location of B from the table, - * and can write A immediately with the correct ctid. - * - * Entries in the hash tables can be removed as soon as the later tuple - * is encountered. That helps to keep the memory usage down. At the end, - * both tables are usually empty; we should have encountered both A and B - * of each pair. However, it's possible for A to be RECENTLY_DEAD and B - * entirely DEAD according to HeapTupleSatisfiesVacuum, because the test - * for deadness using OldestXmin is not exact. In such a case we might - * encounter B first, and skip it, and find A later. Then A would be added - * to unresolved_tups, and stay there until end of the rewrite. Since - * this case is very unusual, we don't worry about the memory usage. - * - * Using in-memory hash tables means that we use some memory for each live - * update chain in the table, from the time we find one end of the - * reference until we find the other end. That shouldn't be a problem in - * practice, but if you do something like an UPDATE without a where-clause - * on a large table, and then run CLUSTER in the same transaction, you - * could run out of memory. It doesn't seem worthwhile to add support for - * spill-to-disk, as there shouldn't be that many RECENTLY_DEAD tuples in a - * table under normal circumstances. Furthermore, in the typical scenario - * of CLUSTERing on an unchanging key column, we'll see all the versions - * of a given tuple together anyway, and so the peak memory usage is only - * proportional to the number of RECENTLY_DEAD versions of a single row, not - * in the whole table. Note that if we do fail halfway through a CLUSTER, - * the old table is still valid, so failure is not catastrophic. - * - * We can't use the normal tdeheap_insert function to insert into the new - * heap, because tdeheap_insert overwrites the visibility information. 
- * We use a special-purpose raw_tdeheap_insert function instead, which - * is optimized for bulk inserting a lot of tuples, knowing that we have - * exclusive access to the heap. raw_tdeheap_insert builds new pages in - * local storage. When a page is full, or at the end of the process, - * we insert it to WAL as a single record and then write it to disk - * directly through smgr. Note, however, that any data sent to the new - * heap's TOAST table will go through the normal bufmgr. - * - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994-5, Regents of the University of California - * - * IDENTIFICATION - * src/backend/access/heap/rewriteheap.c - * - *------------------------------------------------------------------------- - */ -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include - -#include "access/pg_tdeam.h" -#include "access/pg_tdeam_xlog.h" -#include "access/pg_tdetoast.h" -#include "access/pg_tde_rewrite.h" -#include "encryption/enc_tde.h" - -#include "access/transam.h" -#include "access/xact.h" -#include "access/xloginsert.h" -#include "catalog/catalog.h" -#include "common/file_utils.h" -#include "lib/ilist.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "replication/logical.h" -#include "replication/slot.h" -#include "storage/bufmgr.h" -#include "storage/fd.h" -#include "storage/procarray.h" -#include "storage/smgr.h" -#include "utils/memutils.h" -#include "utils/rel.h" - -/* - * State associated with a rewrite operation. This is opaque to the user - * of the rewrite facility. - */ -typedef struct RewriteStateData -{ - Relation rs_old_rel; /* source heap */ - Relation rs_new_rel; /* destination heap */ - Page rs_buffer; /* page currently being built */ - BlockNumber rs_blockno; /* block where page will go */ - bool rs_buffer_valid; /* T if any tuples in buffer */ - bool rs_logical_rewrite; /* do we need to do logical rewriting */ - TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine - * tuple visibility */ - TransactionId rs_freeze_xid; /* Xid that will be used as freeze cutoff - * point */ - TransactionId rs_logical_xmin; /* Xid that will be used as cutoff point - * for logical rewrites */ - MultiXactId rs_cutoff_multi; /* MultiXactId that will be used as cutoff - * point for multixacts */ - MemoryContext rs_cxt; /* for hash tables and entries and tuples in - * them */ - XLogRecPtr rs_begin_lsn; /* XLogInsertLsn when starting the rewrite */ - HTAB *rs_unresolved_tups; /* unmatched A tuples */ - HTAB *rs_old_new_tid_map; /* unmatched B tuples */ - HTAB *rs_logical_mappings; /* logical remapping files */ - uint32 rs_num_rewrite_mappings; /* # in memory mappings */ -} RewriteStateData; - -/* - * The lookup keys for the hash tables are tuple TID and xmin (we must check - * both to avoid false matches from dead tuples). Beware that there is - * probably some padding space in this struct; it must be zeroed out for - * correct hashtable operation. 
- */ -typedef struct -{ - TransactionId xmin; /* tuple xmin */ - ItemPointerData tid; /* tuple location in old heap */ -} TidHashKey; - -/* - * Entry structures for the hash tables - */ -typedef struct -{ - TidHashKey key; /* expected xmin/old location of B tuple */ - ItemPointerData old_tid; /* A's location in the old heap */ - HeapTuple tuple; /* A's tuple contents */ -} UnresolvedTupData; - -typedef UnresolvedTupData *UnresolvedTup; - -typedef struct -{ - TidHashKey key; /* actual xmin/old location of B tuple */ - ItemPointerData new_tid; /* where we put it in the new heap */ -} OldToNewMappingData; - -typedef OldToNewMappingData *OldToNewMapping; - -/* - * In-Memory data for an xid that might need logical remapping entries - * to be logged. - */ -typedef struct RewriteMappingFile -{ - TransactionId xid; /* xid that might need to see the row */ - int vfd; /* fd of mappings file */ - off_t off; /* how far have we written yet */ - dclist_head mappings; /* list of in-memory mappings */ - char path[MAXPGPATH]; /* path, for error messages */ -} RewriteMappingFile; - -/* - * A single In-Memory logical rewrite mapping, hanging off - * RewriteMappingFile->mappings. - */ -typedef struct RewriteMappingDataEntry -{ - LogicalRewriteMappingData map; /* map between old and new location of the - * tuple */ - dlist_node node; -} RewriteMappingDataEntry; - - -/* prototypes for internal functions */ -static void raw_tdeheap_insert(RewriteState state, HeapTuple tup); - -/* internal logical remapping prototypes */ -static void logical_begin_tdeheap_rewrite(RewriteState state); -static void logical_rewrite_tdeheap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple); -static void logical_end_tdeheap_rewrite(RewriteState state); - - -/* - * Begin a rewrite of a table - * - * old_heap old, locked heap relation tuples will be read from - * new_heap new, locked heap relation to insert tuples to - * oldest_xmin xid used by the caller to determine which tuples are dead - * freeze_xid xid before which tuples will be frozen - * cutoff_multi multixact before which multis will be removed - * - * Returns an opaque RewriteState, allocated in current memory context, - * to be used in subsequent calls to the other functions. - */ -RewriteState -begin_tdeheap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, - TransactionId freeze_xid, MultiXactId cutoff_multi) -{ - RewriteState state; - MemoryContext rw_cxt; - MemoryContext old_cxt; - HASHCTL hash_ctl; - - /* - * To ease cleanup, make a separate context that will contain the - * RewriteState struct itself plus all subsidiary data. 
- */ - rw_cxt = AllocSetContextCreate(CurrentMemoryContext, - "Table rewrite", - ALLOCSET_DEFAULT_SIZES); - old_cxt = MemoryContextSwitchTo(rw_cxt); - - /* Create and fill in the state struct */ - state = palloc0(sizeof(RewriteStateData)); - - state->rs_old_rel = old_heap; - state->rs_new_rel = new_heap; - state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); - /* new_heap needn't be empty, just locked */ - state->rs_blockno = RelationGetNumberOfBlocks(new_heap); - state->rs_buffer_valid = false; - state->rs_oldest_xmin = oldest_xmin; - state->rs_freeze_xid = freeze_xid; - state->rs_cutoff_multi = cutoff_multi; - state->rs_cxt = rw_cxt; - - /* Initialize hash tables used to track update chains */ - hash_ctl.keysize = sizeof(TidHashKey); - hash_ctl.entrysize = sizeof(UnresolvedTupData); - hash_ctl.hcxt = state->rs_cxt; - - state->rs_unresolved_tups = - hash_create("Rewrite / Unresolved ctids", - 128, /* arbitrary initial size */ - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - hash_ctl.entrysize = sizeof(OldToNewMappingData); - - state->rs_old_new_tid_map = - hash_create("Rewrite / Old to new tid map", - 128, /* arbitrary initial size */ - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - MemoryContextSwitchTo(old_cxt); - - logical_begin_tdeheap_rewrite(state); - - return state; -} - -/* - * End a rewrite. - * - * state and any other resources are freed. - */ -void -end_tdeheap_rewrite(RewriteState state) -{ - HASH_SEQ_STATUS seq_status; - UnresolvedTup unresolved; - - /* - * Write any remaining tuples in the UnresolvedTups table. If we have any - * left, they should in fact be dead, but let's err on the safe side. - */ - hash_seq_init(&seq_status, state->rs_unresolved_tups); - - while ((unresolved = hash_seq_search(&seq_status)) != NULL) - { - ItemPointerSetInvalid(&unresolved->tuple->t_data->t_ctid); - raw_tdeheap_insert(state, unresolved->tuple); - } - - /* Write the last page, if any */ - if (state->rs_buffer_valid) - { - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_locator, - MAIN_FORKNUM, - state->rs_blockno, - state->rs_buffer, - true); - - PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, state->rs_buffer, true); - } - - /* - * When we WAL-logged rel pages, we must nonetheless fsync them. The - * reason is the same as in storage.c's RelationCopyStorage(): we're - * writing data that's not in shared buffers, and so a CHECKPOINT - * occurring during the rewriteheap operation won't have fsync'd data we - * wrote before the checkpoint. - */ - if (RelationNeedsWAL(state->rs_new_rel)) - smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM); - - logical_end_tdeheap_rewrite(state); - - /* Deleting the context frees everything */ - MemoryContextDelete(state->rs_cxt); -} - -/* - * Add a tuple to the new heap. - * - * Visibility information is copied from the original tuple, except that - * we "freeze" very-old tuples. Note that since we scribble on new_tuple, - * it had better be temp storage not a pointer to the original tuple. 
- * - * state opaque state as returned by begin_tdeheap_rewrite - * old_tuple original tuple in the old heap - * new_tuple new, rewritten tuple to be inserted to new heap - */ -void -rewrite_tdeheap_tuple(RewriteState state, - HeapTuple old_tuple, HeapTuple new_tuple) -{ - MemoryContext old_cxt; - ItemPointerData old_tid; - TidHashKey hashkey; - bool found; - bool free_new; - - old_cxt = MemoryContextSwitchTo(state->rs_cxt); - - /* - * Copy the original tuple's visibility information into new_tuple. - * - * XXX we might later need to copy some t_infomask2 bits, too? Right now, - * we intentionally clear the HOT status bits. - */ - memcpy(&new_tuple->t_data->t_choice.t_heap, - &old_tuple->t_data->t_choice.t_heap, - sizeof(HeapTupleFields)); - - new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; - new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; - new_tuple->t_data->t_infomask |= - old_tuple->t_data->t_infomask & HEAP_XACT_MASK; - - /* - * While we have our hands on the tuple, we may as well freeze any - * eligible xmin or xmax, so that future VACUUM effort can be saved. - */ - tdeheap_freeze_tuple(new_tuple->t_data, - state->rs_old_rel->rd_rel->relfrozenxid, - state->rs_old_rel->rd_rel->relminmxid, - state->rs_freeze_xid, - state->rs_cutoff_multi); - - /* - * Invalid ctid means that ctid should point to the tuple itself. We'll - * override it later if the tuple is part of an update chain. - */ - ItemPointerSetInvalid(&new_tuple->t_data->t_ctid); - - /* - * If the tuple has been updated, check the old-to-new mapping hash table. - */ - if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || - HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && - !HeapTupleHeaderIndicatesMovedPartitions(old_tuple->t_data) && - !(ItemPointerEquals(&(old_tuple->t_self), - &(old_tuple->t_data->t_ctid)))) - { - OldToNewMapping mapping; - - memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); - hashkey.tid = old_tuple->t_data->t_ctid; - - mapping = (OldToNewMapping) - hash_search(state->rs_old_new_tid_map, &hashkey, - HASH_FIND, NULL); - - if (mapping != NULL) - { - /* - * We've already copied the tuple that t_ctid points to, so we can - * set the ctid of this tuple to point to the new location, and - * insert it right away. - */ - new_tuple->t_data->t_ctid = mapping->new_tid; - - /* We don't need the mapping entry anymore */ - hash_search(state->rs_old_new_tid_map, &hashkey, - HASH_REMOVE, &found); - Assert(found); - } - else - { - /* - * We haven't seen the tuple t_ctid points to yet. Stash this - * tuple into unresolved_tups to be written later. - */ - UnresolvedTup unresolved; - - unresolved = hash_search(state->rs_unresolved_tups, &hashkey, - HASH_ENTER, &found); - Assert(!found); - - unresolved->old_tid = old_tuple->t_self; - unresolved->tuple = tdeheap_copytuple(new_tuple); - - /* - * We can't do anything more now, since we don't know where the - * tuple will be written. - */ - MemoryContextSwitchTo(old_cxt); - return; - } - } - - /* - * Now we will write the tuple, and then check to see if it is the B tuple - * in any new or known pair. When we resolve a known pair, we will be - * able to write that pair's A tuple, and then we have to check if it - * resolves some other pair. Hence, we need a loop here. 
- */ - old_tid = old_tuple->t_self; - free_new = false; - - for (;;) - { - ItemPointerData new_tid; - - /* Insert the tuple and find out where it's put in new_heap */ - raw_tdeheap_insert(state, new_tuple); - new_tid = new_tuple->t_self; - - logical_rewrite_tdeheap_tuple(state, old_tid, new_tuple); - - /* - * If the tuple is the updated version of a row, and the prior version - * wouldn't be DEAD yet, then we need to either resolve the prior - * version (if it's waiting in rs_unresolved_tups), or make an entry - * in rs_old_new_tid_map (so we can resolve it when we do see it). The - * previous tuple's xmax would equal this one's xmin, so it's - * RECENTLY_DEAD if and only if the xmin is not before OldestXmin. - */ - if ((new_tuple->t_data->t_infomask & HEAP_UPDATED) && - !TransactionIdPrecedes(HeapTupleHeaderGetXmin(new_tuple->t_data), - state->rs_oldest_xmin)) - { - /* - * Okay, this is B in an update pair. See if we've seen A. - */ - UnresolvedTup unresolved; - - memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); - hashkey.tid = old_tid; - - unresolved = hash_search(state->rs_unresolved_tups, &hashkey, - HASH_FIND, NULL); - - if (unresolved != NULL) - { - /* - * We have seen and memorized the previous tuple already. Now - * that we know where we inserted the tuple its t_ctid points - * to, fix its t_ctid and insert it to the new heap. - */ - if (free_new) - tdeheap_freetuple(new_tuple); - new_tuple = unresolved->tuple; - free_new = true; - old_tid = unresolved->old_tid; - new_tuple->t_data->t_ctid = new_tid; - - /* - * We don't need the hash entry anymore, but don't free its - * tuple just yet. - */ - hash_search(state->rs_unresolved_tups, &hashkey, - HASH_REMOVE, &found); - Assert(found); - - /* loop back to insert the previous tuple in the chain */ - continue; - } - else - { - /* - * Remember the new tid of this tuple. We'll use it to set the - * ctid when we find the previous tuple in the chain. - */ - OldToNewMapping mapping; - - mapping = hash_search(state->rs_old_new_tid_map, &hashkey, - HASH_ENTER, &found); - Assert(!found); - - mapping->new_tid = new_tid; - } - } - - /* Done with this (chain of) tuples, for now */ - if (free_new) - tdeheap_freetuple(new_tuple); - break; - } - - MemoryContextSwitchTo(old_cxt); -} - -/* - * Register a dead tuple with an ongoing rewrite. Dead tuples are not - * copied to the new table, but we still make note of them so that we - * can release some resources earlier. - * - * Returns true if a tuple was removed from the unresolved_tups table. - * This indicates that that tuple, previously thought to be "recently dead", - * is now known really dead and won't be written to the output. - */ -bool -rewrite_tdeheap_dead_tuple(RewriteState state, HeapTuple old_tuple) -{ - /* - * If we have already seen an earlier tuple in the update chain that - * points to this tuple, let's forget about that earlier tuple. It's in - * fact dead as well, our simple xmax < OldestXmin test in - * HeapTupleSatisfiesVacuum just wasn't enough to detect it. It happens - * when xmin of a tuple is greater than xmax, which sounds - * counter-intuitive but is perfectly valid. - * - * We don't bother to try to detect the situation the other way round, - * when we encounter the dead tuple first and then the recently dead one - * that points to it. If that happens, we'll have some unmatched entries - * in the UnresolvedTups hash table at the end. 
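The RECENTLY_DEAD test above ("the xmin is not before OldestXmin") relies on circular XID comparison. Below is a minimal standalone sketch of that comparison, modelled on how TransactionIdPrecedes orders normal XIDs; the special permanent XIDs that the real function exempts are ignored here, and toy_xid_precedes is an illustrative name only.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Circular comparison of 32-bit transaction IDs: a precedes b if the signed
 * distance from a to b is positive.  This is the usual modulo-2^32 ordering;
 * the special XIDs that the server exempts from it are not handled here. */
static bool
toy_xid_precedes(uint32_t a, uint32_t b)
{
    return (int32_t) (a - b) < 0;
}

int
main(void)
{
    uint32_t oldest_xmin = 1000;

    /* xmin >= OldestXmin: the prior version is only RECENTLY_DEAD. */
    printf("xmin 1500 recently dead: %d\n", !toy_xid_precedes(1500, oldest_xmin));
    /* xmin < OldestXmin: the prior version is safely DEAD. */
    printf("xmin  900 recently dead: %d\n", !toy_xid_precedes(900, oldest_xmin));
    /* Wraparound case: an XID just past the wrap point still precedes. */
    printf("0xFFFFFFF0 precedes 16: %d\n", toy_xid_precedes(0xFFFFFFF0u, 16));
    return 0;
}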
That can happen anyway, - * because a vacuum might have removed the dead tuple in the chain before - * us. - */ - UnresolvedTup unresolved; - TidHashKey hashkey; - bool found; - - memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetXmin(old_tuple->t_data); - hashkey.tid = old_tuple->t_self; - - unresolved = hash_search(state->rs_unresolved_tups, &hashkey, - HASH_FIND, NULL); - - if (unresolved != NULL) - { - /* Need to free the contained tuple as well as the hashtable entry */ - tdeheap_freetuple(unresolved->tuple); - hash_search(state->rs_unresolved_tups, &hashkey, - HASH_REMOVE, &found); - Assert(found); - return true; - } - - return false; -} - -/* - * Insert a tuple to the new relation. This has to track tdeheap_insert - * and its subsidiary functions! - * - * t_self of the tuple is set to the new TID of the tuple. If t_ctid of the - * tuple is invalid on entry, it's replaced with the new TID as well (in - * the inserted data only, not in the caller's copy). - */ -static void -raw_tdeheap_insert(RewriteState state, HeapTuple tup) -{ - Page page = state->rs_buffer; - Size pageFreeSpace, - saveFreeSpace; - Size len; - OffsetNumber newoff; - HeapTuple heaptup; - - /* - * If the new tuple is too big for storage or contains already toasted - * out-of-line attributes from some other relation, invoke the toaster. - * - * Note: below this point, heaptup is the data we actually intend to store - * into the relation; tup is the caller's original untoasted data. - */ - if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE) - { - /* toast table entries should never be recursively toasted */ - Assert(!HeapTupleHasExternal(tup)); - heaptup = tup; - } - else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) - { - int options = HEAP_INSERT_SKIP_FSM; - - /* - * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data - * for the TOAST table are not logically decoded. The main heap is - * WAL-logged as XLOG FPI records, which are not logically decoded. - */ - options |= HEAP_INSERT_NO_LOGICAL; - - heaptup = tdeheap_toast_insert_or_update(state->rs_new_rel, tup, NULL, - options); - } - else - heaptup = tup; - - len = MAXALIGN(heaptup->t_len); /* be conservative */ - - /* - * If we're gonna fail for oversize tuple, do it right away - */ - if (len > MaxHeapTupleSize) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("row is too big: size %zu, maximum size %zu", - len, MaxHeapTupleSize))); - - /* Compute desired extra freespace due to fillfactor option */ - saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel, - HEAP_DEFAULT_FILLFACTOR); - - /* Now we can check to see if there's enough free space already. */ - if (state->rs_buffer_valid) - { - pageFreeSpace = PageGetHeapFreeSpace(page); - - if (len + saveFreeSpace > pageFreeSpace) - { - /* - * Doesn't fit, so write out the existing page. It always - * contains a tuple. Hence, unlike tdeheap_RelationGetBufferForTuple(), - * enforce saveFreeSpace unconditionally. - */ - - /* XLOG stuff */ - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_locator, - MAIN_FORKNUM, - state->rs_blockno, - page, - true); - - /* - * Now write the page. We say skipFsync = true because there's no - * need for smgr to schedule an fsync for this write; we'll do it - * ourselves in end_tdeheap_rewrite. 
- */ - PageSetChecksumInplace(page, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, page, true); - - state->rs_blockno++; - state->rs_buffer_valid = false; - } - } - - if (!state->rs_buffer_valid) - { - /* Initialize a new empty page */ - PageInit(page, BLCKSZ, 0); - state->rs_buffer_valid = true; - } - - /* And now we can insert the tuple into the page */ - newoff = TDE_PageAddItem(state->rs_new_rel->rd_locator, heaptup->t_tableOid, state->rs_blockno, page, (Item) heaptup->t_data, heaptup->t_len, - InvalidOffsetNumber, false, true); - if (newoff == InvalidOffsetNumber) - elog(ERROR, "failed to add tuple"); - - /* Update caller's t_self to the actual position where it was stored */ - ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff); - - /* - * Insert the correct position into CTID of the stored tuple, too, if the - * caller didn't supply a valid CTID. - */ - if (!ItemPointerIsValid(&tup->t_data->t_ctid)) - { - ItemId newitemid; - HeapTupleHeader onpage_tup; - - newitemid = PageGetItemId(page, newoff); - onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid); - - onpage_tup->t_ctid = tup->t_self; - } - - /* If heaptup is a private copy, release it. */ - if (heaptup != tup) - tdeheap_freetuple(heaptup); -} - -/* ------------------------------------------------------------------------ - * Logical rewrite support - * - * When doing logical decoding - which relies on using cmin/cmax of catalog - * tuples, via xl_tdeheap_new_cid records - heap rewrites have to log enough - * information to allow the decoding backend to update its internal mapping - * of (relfilelocator,ctid) => (cmin, cmax) to be correct for the rewritten heap. - * - * For that, every time we find a tuple that's been modified in a catalog - * relation within the xmin horizon of any decoding slot, we log a mapping - * from the old to the new location. - * - * To deal with rewrites that abort the filename of a mapping file contains - * the xid of the transaction performing the rewrite, which then can be - * checked before being read in. - * - * For efficiency we don't immediately spill every single map mapping for a - * row to disk but only do so in batches when we've collected several of them - * in memory or when end_tdeheap_rewrite() has been called. - * - * Crash-Safety: This module diverts from the usual patterns of doing WAL - * since it cannot rely on checkpoint flushing out all buffers and thus - * waiting for exclusive locks on buffers. Usually the XLogInsert() covering - * buffer modifications is performed while the buffer(s) that are being - * modified are exclusively locked guaranteeing that both the WAL record and - * the modified heap are on either side of the checkpoint. But since the - * mapping files we log aren't in shared_buffers that interlock doesn't work. - * - * Instead we simply write the mapping files out to disk, *before* the - * XLogInsert() is performed. That guarantees that either the XLogInsert() is - * inserted after the checkpoint's redo pointer or that the checkpoint (via - * CheckPointLogicalRewriteHeap()) has flushed the (partial) mapping file to - * disk. That leaves the tail end that has not yet been flushed open to - * corruption, which is solved by including the current offset in the - * xl_tdeheap_rewrite_mapping records and truncating the mapping file to it - * during replay. Every time a rewrite is finished all generated mapping files - * are synced to disk. 
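The page-filling decision inside raw_tdeheap_insert above can be modelled on its own. The sketch below replaces the real Page, line-pointer and WAL machinery with plain byte counters; the overhead and fillfactor figures are illustrative assumptions rather than values taken from the code.

#include <stdio.h>
#include <stddef.h>

#define TOY_BLCKSZ        8192
#define TOY_PAGE_OVERHEAD 64      /* stand-in for page header + line pointers */
#define TOY_MAXALIGN(len) (((len) + 7) & ~(size_t) 7)

/* Track how much of the current page is still free. */
static size_t page_free;
static int    blockno;

static void
start_new_page(void)
{
    page_free = TOY_BLCKSZ - TOY_PAGE_OVERHEAD;
}

static void
flush_page(void)
{
    printf("flush block %d (%zu bytes left unused)\n", blockno, page_free);
    blockno++;
    start_new_page();
}

/* Mirror of the bulk-insert decision: reserve the fillfactor slack
 * unconditionally and flush when the aligned tuple would not fit. */
static void
toy_insert(size_t tuple_len, size_t save_free_space)
{
    size_t len = TOY_MAXALIGN(tuple_len);   /* be conservative, like the original */

    if (len + save_free_space > page_free)
        flush_page();

    page_free -= len;
}

int
main(void)
{
    size_t fillfactor_reserve = 819;        /* roughly 10% of the block, as an example */

    start_new_page();
    for (int i = 0; i < 40; i++)
        toy_insert(500, fillfactor_reserve);
    flush_page();                           /* write out the final partial page */
    return 0;
}

With these numbers each toy block takes 14 tuples and is flushed while it still has more free space than the reserved slack, which is the point of enforcing saveFreeSpace unconditionally in this path.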
- * - * Note that if we were only concerned about crash safety we wouldn't have to - * deal with WAL logging at all - an fsync() at the end of a rewrite would be - * sufficient for crash safety. Any mapping that hasn't been safely flushed to - * disk has to be by an aborted (explicitly or via a crash) transaction and is - * ignored by virtue of the xid in its name being subject to a - * TransactionDidCommit() check. But we want to support having standbys via - * physical replication, both for availability and to do logical decoding - * there. - * ------------------------------------------------------------------------ - */ - -/* - * Do preparations for logging logical mappings during a rewrite if - * necessary. If we detect that we don't need to log anything we'll prevent - * any further action by the various logical rewrite functions. - */ -static void -logical_begin_tdeheap_rewrite(RewriteState state) -{ - HASHCTL hash_ctl; - TransactionId logical_xmin; - - /* - * We only need to persist these mappings if the rewritten table can be - * accessed during logical decoding, if not, we can skip doing any - * additional work. - */ - state->rs_logical_rewrite = - RelationIsAccessibleInLogicalDecoding(state->rs_old_rel); - - if (!state->rs_logical_rewrite) - return; - - ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin); - - /* - * If there are no logical slots in progress we don't need to do anything, - * there cannot be any remappings for relevant rows yet. The relation's - * lock protects us against races. - */ - if (logical_xmin == InvalidTransactionId) - { - state->rs_logical_rewrite = false; - return; - } - - state->rs_logical_xmin = logical_xmin; - state->rs_begin_lsn = GetXLogInsertRecPtr(); - state->rs_num_rewrite_mappings = 0; - - hash_ctl.keysize = sizeof(TransactionId); - hash_ctl.entrysize = sizeof(RewriteMappingFile); - hash_ctl.hcxt = state->rs_cxt; - - state->rs_logical_mappings = - hash_create("Logical rewrite mapping", - 128, /* arbitrary initial size */ - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); -} - -/* - * Flush all logical in-memory mappings to disk, but don't fsync them yet. 
- */ -static void -logical_tdeheap_rewrite_flush_mappings(RewriteState state) -{ - HASH_SEQ_STATUS seq_status; - RewriteMappingFile *src; - dlist_mutable_iter iter; - - Assert(state->rs_logical_rewrite); - - /* no logical rewrite in progress, no need to iterate over mappings */ - if (state->rs_num_rewrite_mappings == 0) - return; - - elog(DEBUG1, "flushing %u logical rewrite mapping entries", - state->rs_num_rewrite_mappings); - - hash_seq_init(&seq_status, state->rs_logical_mappings); - while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) - { - char *waldata; - char *waldata_start; - xl_tdeheap_rewrite_mapping xlrec; - Oid dboid; - uint32 len; - int written; - uint32 num_mappings = dclist_count(&src->mappings); - - /* this file hasn't got any new mappings */ - if (num_mappings == 0) - continue; - - if (state->rs_old_rel->rd_rel->relisshared) - dboid = InvalidOid; - else - dboid = MyDatabaseId; - - xlrec.num_mappings = num_mappings; - xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel); - xlrec.mapped_xid = src->xid; - xlrec.mapped_db = dboid; - xlrec.offset = src->off; - xlrec.start_lsn = state->rs_begin_lsn; - - /* write all mappings consecutively */ - len = num_mappings * sizeof(LogicalRewriteMappingData); - waldata_start = waldata = palloc(len); - - /* - * collect data we need to write out, but don't modify ondisk data yet - */ - dclist_foreach_modify(iter, &src->mappings) - { - RewriteMappingDataEntry *pmap; - - pmap = dclist_container(RewriteMappingDataEntry, node, iter.cur); - - memcpy(waldata, &pmap->map, sizeof(pmap->map)); - waldata += sizeof(pmap->map); - - /* remove from the list and free */ - dclist_delete_from(&src->mappings, &pmap->node); - pfree(pmap); - - /* update bookkeeping */ - state->rs_num_rewrite_mappings--; - } - - Assert(dclist_count(&src->mappings) == 0); - Assert(waldata == waldata_start + len); - - /* - * Note that we deviate from the usual WAL coding practices here, - * check the above "Logical rewrite support" comment for reasoning. - */ - written = FileWrite(src->vfd, waldata_start, len, src->off, - WAIT_EVENT_LOGICAL_REWRITE_WRITE); - if (written != len) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, - written, len))); - src->off += len; - - XLogBeginInsert(); - XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); - XLogRegisterData(waldata_start, len); - - /* write xlog record */ - XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE); - - pfree(waldata_start); - } - Assert(state->rs_num_rewrite_mappings == 0); -} - -/* - * Logical remapping part of end_tdeheap_rewrite(). - */ -static void -logical_end_tdeheap_rewrite(RewriteState state) -{ - HASH_SEQ_STATUS seq_status; - RewriteMappingFile *src; - - /* done, no logical rewrite in progress */ - if (!state->rs_logical_rewrite) - return; - - /* writeout remaining in-memory entries */ - if (state->rs_num_rewrite_mappings > 0) - logical_tdeheap_rewrite_flush_mappings(state); - - /* Iterate over all mappings we have written and fsync the files. */ - hash_seq_init(&seq_status, state->rs_logical_mappings); - while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) - { - if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0) - ereport(data_sync_elevel(ERROR), - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", src->path))); - FileClose(src->vfd); - } - /* memory context cleanup will deal with the rest */ -} - -/* - * Log a single (old->new) mapping for 'xid'. 
- */ -static void -logical_rewrite_log_mapping(RewriteState state, TransactionId xid, - LogicalRewriteMappingData *map) -{ - RewriteMappingFile *src; - RewriteMappingDataEntry *pmap; - Oid relid; - bool found; - - relid = RelationGetRelid(state->rs_old_rel); - - /* look for existing mappings for this 'mapped' xid */ - src = hash_search(state->rs_logical_mappings, &xid, - HASH_ENTER, &found); - - /* - * We haven't yet had the need to map anything for this xid, create - * per-xid data structures. - */ - if (!found) - { - char path[MAXPGPATH]; - Oid dboid; - - if (state->rs_old_rel->rd_rel->relisshared) - dboid = InvalidOid; - else - dboid = MyDatabaseId; - - snprintf(path, MAXPGPATH, - "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, - dboid, relid, - LSN_FORMAT_ARGS(state->rs_begin_lsn), - xid, GetCurrentTransactionId()); - - dclist_init(&src->mappings); - src->off = 0; - memcpy(src->path, path, sizeof(path)); - src->vfd = PathNameOpenFile(path, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); - if (src->vfd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", path))); - } - - pmap = MemoryContextAlloc(state->rs_cxt, - sizeof(RewriteMappingDataEntry)); - memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData)); - dclist_push_tail(&src->mappings, &pmap->node); - state->rs_num_rewrite_mappings++; - - /* - * Write out buffer every time we've too many in-memory entries across all - * mapping files. - */ - if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */ ) - logical_tdeheap_rewrite_flush_mappings(state); -} - -/* - * Perform logical remapping for a tuple that's mapped from old_tid to - * new_tuple->t_self by rewrite_tdeheap_tuple() if necessary for the tuple. - */ -static void -logical_rewrite_tdeheap_tuple(RewriteState state, ItemPointerData old_tid, - HeapTuple new_tuple) -{ - ItemPointerData new_tid = new_tuple->t_self; - TransactionId cutoff = state->rs_logical_xmin; - TransactionId xmin; - TransactionId xmax; - bool do_log_xmin = false; - bool do_log_xmax = false; - LogicalRewriteMappingData map; - - /* no logical rewrite in progress, we don't need to log anything */ - if (!state->rs_logical_rewrite) - return; - - xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); - /* use *GetUpdateXid to correctly deal with multixacts */ - xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data); - - /* - * Log the mapping iff the tuple has been created recently. - */ - if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff)) - do_log_xmin = true; - - if (!TransactionIdIsNormal(xmax)) - { - /* - * no xmax is set, can't have any permanent ones, so this check is - * sufficient - */ - } - else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask)) - { - /* only locked, we don't care */ - } - else if (!TransactionIdPrecedes(xmax, cutoff)) - { - /* tuple has been deleted recently, log */ - do_log_xmax = true; - } - - /* if neither needs to be logged, we're done */ - if (!do_log_xmin && !do_log_xmax) - return; - - /* fill out mapping information */ - map.old_locator = state->rs_old_rel->rd_locator; - map.old_tid = old_tid; - map.new_locator = state->rs_new_rel->rd_locator; - map.new_tid = new_tid; - - /* --- - * Now persist the mapping for the individual xids that are affected. We - * need to log for both xmin and xmax if they aren't the same transaction - * since the mapping files are per "affected" xid. 
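The accumulate-then-flush pattern used for the in-memory mappings (write them out in batches once roughly a thousand have piled up, plus a final flush when the rewrite ends) reduces to a small standalone sketch. Everything named Toy* below is hypothetical; only the threshold mirrors the arbitrary 1000 used above, and the real code writes per-xid files and WAL records instead of printing.

#include <stdio.h>

#define TOY_FLUSH_THRESHOLD 1000   /* the original uses an arbitrary 1000 as well */

typedef struct
{
    unsigned int pending;          /* in-memory entries not yet written out */
    unsigned int flushes;          /* how many batch writes we have done */
} ToyMappingState;

/* Stand-in for writing all accumulated entries to their per-xid files. */
static void
toy_flush_mappings(ToyMappingState *state)
{
    if (state->pending == 0)
        return;
    printf("flush #%u: writing %u entries\n", ++state->flushes, state->pending);
    state->pending = 0;
}

/* Stand-in for logging one old->new mapping: enqueue, then flush once the
 * global in-memory count crosses the threshold. */
static void
toy_log_mapping(ToyMappingState *state)
{
    state->pending++;
    if (state->pending >= TOY_FLUSH_THRESHOLD)
        toy_flush_mappings(state);
}

int
main(void)
{
    ToyMappingState state = {0, 0};

    for (int i = 0; i < 2500; i++)
        toy_log_mapping(&state);

    /* Final partial batch is written out at the end of the rewrite. */
    toy_flush_mappings(&state);
    return 0;
}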
- * We don't muster all that much effort detecting whether xmin and xmax - * are actually the same transaction, we just check whether the xid is the - * same disregarding subtransactions. Logging too much is relatively - * harmless and we could never do the check fully since subtransaction - * data is thrown away during restarts. - * --- - */ - if (do_log_xmin) - logical_rewrite_log_mapping(state, xmin, &map); - /* separately log mapping for xmax unless it'd be redundant */ - if (do_log_xmax && !TransactionIdEquals(xmin, xmax)) - logical_rewrite_log_mapping(state, xmax, &map); -} - -/* - * Replay XLOG_HEAP2_REWRITE records - */ -void -tdeheap_xlog_logical_rewrite(XLogReaderState *r) -{ - char path[MAXPGPATH]; - int fd; - xl_tdeheap_rewrite_mapping *xlrec; - uint32 len; - char *data; - - xlrec = (xl_tdeheap_rewrite_mapping *) XLogRecGetData(r); - - snprintf(path, MAXPGPATH, - "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, - xlrec->mapped_db, xlrec->mapped_rel, - LSN_FORMAT_ARGS(xlrec->start_lsn), - xlrec->mapped_xid, XLogRecGetXid(r)); - - fd = OpenTransientFile(path, - O_CREAT | O_WRONLY | PG_BINARY); - if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", path))); - - /* - * Truncate all data that's not guaranteed to have been safely fsynced (by - * previous record or by the last checkpoint). - */ - pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE); - if (ftruncate(fd, xlrec->offset) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not truncate file \"%s\" to %u: %m", - path, (uint32) xlrec->offset))); - pgstat_report_wait_end(); - - data = XLogRecGetData(r) + sizeof(*xlrec); - - len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData); - - /* write out tail end of mapping file (again) */ - errno = 0; - pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE); - if (pg_pwrite(fd, data, len, xlrec->offset) != len) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", path))); - } - pgstat_report_wait_end(); - - /* - * Now fsync all previously written data. We could improve things and only - * do this for the last write to a file, but the required bookkeeping - * doesn't seem worth the trouble. - */ - pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC); - if (pg_fsync(fd) != 0) - ereport(data_sync_elevel(ERROR), - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", path))); - pgstat_report_wait_end(); - - if (CloseTransientFile(fd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", path))); -} - -/* --- - * Perform a checkpoint for logical rewrite mappings - * - * This serves two tasks: - * 1) Remove all mappings not needed anymore based on the logical restart LSN - * 2) Flush all remaining mappings to disk, so that replay after a checkpoint - * only has to deal with the parts of a mapping that have been written out - * after the checkpoint started. - * --- - */ -void -CheckPointLogicalRewriteHeap(void) -{ - XLogRecPtr cutoff; - XLogRecPtr redo; - DIR *mappings_dir; - struct dirent *mapping_de; - char path[MAXPGPATH + 20]; - - /* - * We start of with a minimum of the last redo pointer. No new decoding - * slot will start before that, so that's a safe upper bound for removal. 
- */ - redo = GetRedoRecPtr(); - - /* now check for the restart ptrs from existing slots */ - cutoff = ReplicationSlotsComputeLogicalRestartLSN(); - - /* don't start earlier than the restart lsn */ - if (cutoff != InvalidXLogRecPtr && redo < cutoff) - cutoff = redo; - - mappings_dir = AllocateDir("pg_logical/mappings"); - while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL) - { - Oid dboid; - Oid relid; - XLogRecPtr lsn; - TransactionId rewrite_xid; - TransactionId create_xid; - uint32 hi, - lo; - PGFileType de_type; - - if (strcmp(mapping_de->d_name, ".") == 0 || - strcmp(mapping_de->d_name, "..") == 0) - continue; - - snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name); - de_type = get_dirent_type(path, mapping_de, false, DEBUG1); - - if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_REG) - continue; - - /* Skip over files that cannot be ours. */ - if (strncmp(mapping_de->d_name, "map-", 4) != 0) - continue; - - if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, - &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6) - elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); - - lsn = ((uint64) hi) << 32 | lo; - - if (lsn < cutoff || cutoff == InvalidXLogRecPtr) - { - elog(DEBUG1, "removing logical rewrite file \"%s\"", path); - if (unlink(path) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - } - else - { - /* on some operating systems fsyncing a file requires O_RDWR */ - int fd = OpenTransientFile(path, O_RDWR | PG_BINARY); - - /* - * The file cannot vanish due to concurrency since this function - * is the only one removing logical mappings and only one - * checkpoint can be in progress at a time. - */ - if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); - - /* - * We could try to avoid fsyncing files that either haven't - * changed or have only been created since the checkpoint's start, - * but it's currently not deemed worth the effort. - */ - pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC); - if (pg_fsync(fd) != 0) - ereport(data_sync_elevel(ERROR), - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", path))); - pgstat_report_wait_end(); - - if (CloseTransientFile(fd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", path))); - } - } - FreeDir(mappings_dir); - - /* persist directory entries to disk */ - fsync_fname("pg_logical/mappings", true); -} diff --git a/src/access/pg_tde_vacuumlazy.c b/src/access/pg_tde_vacuumlazy.c deleted file mode 100644 index 8a3f49ef..00000000 --- a/src/access/pg_tde_vacuumlazy.c +++ /dev/null @@ -1,3476 +0,0 @@ -/*------------------------------------------------------------------------- - * - * vacuumlazy.c - * Concurrent ("lazy") vacuuming. - * - * The major space usage for vacuuming is storage for the array of dead TIDs - * that are to be removed from indexes. We want to ensure we can vacuum even - * the very largest relations with finite memory space usage. To do that, we - * set upper bounds on the number of TIDs we can keep track of at once. - * - * We are willing to use at most maintenance_work_mem (or perhaps - * autovacuum_work_mem) memory space to keep track of dead TIDs. We initially - * allocate an array of TIDs of that size, with an upper limit that depends on - * table size (this limit ensures we don't allocate a huge area uselessly for - * vacuuming small tables). 
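The sizing rule this header comment describes (bound the dead-TID array both by the memory budget and by what the table could possibly contain) can be sketched as follows. The constants are assumptions for 8 kB pages, ItemPointerData at 6 bytes and MaxHeapTuplesPerPage at 291, and 64 MB is only the usual maintenance_work_mem default; the actual allocation code applies further clamps.

#include <stdio.h>
#include <stdint.h>

/* Rough stand-ins for the server constants involved (8 kB pages assumed). */
#define TOY_ITEMPOINTER_SIZE     6       /* sizeof(ItemPointerData) */
#define TOY_MAX_TUPLES_PER_PAGE  291     /* MaxHeapTuplesPerPage for BLCKSZ = 8192 */

/*
 * Sketch of the sizing rule: the dead-TID array is bounded by the memory
 * budget, and additionally by how many items the table could possibly hold,
 * so small tables never allocate a huge array.
 */
static uint64_t
toy_max_dead_items(uint64_t work_mem_bytes, uint64_t rel_pages)
{
    uint64_t by_memory = work_mem_bytes / TOY_ITEMPOINTER_SIZE;
    uint64_t by_table  = rel_pages * TOY_MAX_TUPLES_PER_PAGE;

    return by_table < by_memory ? by_table : by_memory;
}

int
main(void)
{
    uint64_t work_mem = 64ULL * 1024 * 1024;    /* 64 MB budget, as an example */

    printf("small table (100 pages): %llu TIDs\n",
           (unsigned long long) toy_max_dead_items(work_mem, 100));
    printf("large table (10M pages): %llu TIDs\n",
           (unsigned long long) toy_max_dead_items(work_mem, 10 * 1000 * 1000));
    return 0;
}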
If the array threatens to overflow, we must call - * lazy_vacuum to vacuum indexes (and to vacuum the pages that we've pruned). - * This frees up the memory space dedicated to storing dead TIDs. - * - * In practice VACUUM will often complete its initial pass over the target - * pg_tde relation without ever running out of space to store TIDs. This means - * that there only needs to be one call to lazy_vacuum, after the initial pass - * completes. - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/access/pg_tde/vacuumlazy.c - * - *------------------------------------------------------------------------- - */ -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include - -#include "access/pg_tdeam.h" -#include "access/pg_tdeam_xlog.h" -#include "access/pg_tde_visibilitymap.h" -#include "encryption/enc_tde.h" - -#include "access/amapi.h" -#include "access/genam.h" -#include "access/htup_details.h" -#include "access/multixact.h" -#include "access/transam.h" -#include "access/xact.h" -#include "access/xlog.h" -#include "access/xloginsert.h" -#include "catalog/index.h" -#include "catalog/storage.h" -#include "commands/dbcommands.h" -#include "commands/progress.h" -#include "commands/vacuum.h" -#include "executor/instrument.h" -#include "miscadmin.h" -#include "optimizer/paths.h" -#include "pgstat.h" -#include "portability/instr_time.h" -#include "postmaster/autovacuum.h" -#include "storage/bufmgr.h" -#include "storage/freespace.h" -#include "storage/lmgr.h" -#include "tcop/tcopprot.h" -#include "utils/lsyscache.h" -#include "utils/memutils.h" -#include "utils/pg_rusage.h" -#include "utils/timestamp.h" - - -/* - * Space/time tradeoff parameters: do these need to be user-tunable? - * - * To consider truncating the relation, we want there to be at least - * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever - * is less) potentially-freeable pages. - */ -#define REL_TRUNCATE_MINIMUM 1000 -#define REL_TRUNCATE_FRACTION 16 - -/* - * Timing parameters for truncate locking heuristics. - * - * These were not exposed as user tunable GUC values because it didn't seem - * that the potential for improvement was great enough to merit the cost of - * supporting them. - */ -#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */ -#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */ -#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ - -/* - * Threshold that controls whether we bypass index vacuuming and heap - * vacuuming as an optimization - */ -#define BYPASS_THRESHOLD_PAGES 0.02 /* i.e. 2% of rel_pages */ - -/* - * Perform a failsafe check each time we scan another 4GB of pages. - * (Note that this is deliberately kept to a power-of-two, usually 2^19.) - */ -#define FAILSAFE_EVERY_PAGES \ - ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ)) - -/* - * When a table has no indexes, vacuum the FSM after every 8GB, approximately - * (it won't be exact because we only vacuum FSM after processing a heap page - * that has some removable tuples). When there are indexes, this is ignored, - * and we vacuum FSM after each index/heap cleaning pass. - */ -#define VACUUM_FSM_EVERY_PAGES \ - ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) - -/* - * Before we consider skipping a page that's marked as clean in - * visibility map, we must've seen at least this many clean pages. 
- */ -#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) - -/* - * Size of the prefetch window for lazy vacuum backwards truncation scan. - * Needs to be a power of 2. - */ -#define PREFETCH_SIZE ((BlockNumber) 32) - -/* - * Macro to check if we are in a parallel vacuum. If true, we are in the - * parallel mode and the DSM segment is initialized. - */ -#define ParallelVacuumIsActive(vacrel) ((vacrel)->pvs != NULL) - -/* Phases of vacuum during which we report error context. */ -typedef enum -{ - VACUUM_ERRCB_PHASE_UNKNOWN, - VACUUM_ERRCB_PHASE_SCAN_HEAP, - VACUUM_ERRCB_PHASE_VACUUM_INDEX, - VACUUM_ERRCB_PHASE_VACUUM_HEAP, - VACUUM_ERRCB_PHASE_INDEX_CLEANUP, - VACUUM_ERRCB_PHASE_TRUNCATE -} VacErrPhase; - -typedef struct LVRelState -{ - /* Target heap relation and its indexes */ - Relation rel; - Relation *indrels; - int nindexes; - - /* Buffer access strategy and parallel vacuum state */ - BufferAccessStrategy bstrategy; - ParallelVacuumState *pvs; - - /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */ - bool aggressive; - /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */ - bool skipwithvm; - /* Consider index vacuuming bypass optimization? */ - bool consider_bypass_optimization; - - /* Doing index vacuuming, index cleanup, rel truncation? */ - bool do_index_vacuuming; - bool do_index_cleanup; - bool do_rel_truncate; - - /* VACUUM operation's cutoffs for freezing and pruning */ - struct VacuumCutoffs cutoffs; - GlobalVisState *vistest; - /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */ - TransactionId NewRelfrozenXid; - MultiXactId NewRelminMxid; - bool skippedallvis; - - /* Error reporting state */ - char *dbname; - char *relnamespace; - char *relname; - char *indname; /* Current index name */ - BlockNumber blkno; /* used only for heap operations */ - OffsetNumber offnum; /* used only for heap operations */ - VacErrPhase phase; - bool verbose; /* VACUUM VERBOSE? */ - - /* - * dead_items stores TIDs whose index tuples are deleted by index - * vacuuming. Each TID points to an LP_DEAD line pointer from a heap page - * that has been processed by lazy_scan_prune. Also needed by - * lazy_vacuum_tdeheap_rel, which marks the same LP_DEAD line pointers as - * LP_UNUSED during second heap pass. 
- */ - VacDeadItems *dead_items; /* TIDs whose index tuples we'll delete */ - BlockNumber rel_pages; /* total number of pages */ - BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ - BlockNumber removed_pages; /* # pages removed by relation truncation */ - BlockNumber frozen_pages; /* # pages with newly frozen tuples */ - BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */ - BlockNumber missed_dead_pages; /* # pages with missed dead tuples */ - BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ - - /* Statistics output by us, for table */ - double new_rel_tuples; /* new estimated total # of tuples */ - double new_live_tuples; /* new estimated total # of live tuples */ - /* Statistics output by index AMs */ - IndexBulkDeleteResult **indstats; - - /* Instrumentation counters */ - int num_index_scans; - /* Counters that follow are only for scanned_pages */ - int64 tuples_deleted; /* # deleted from table */ - int64 tuples_frozen; /* # newly frozen */ - int64 lpdead_items; /* # deleted from indexes */ - int64 live_tuples; /* # live tuples remaining */ - int64 recently_dead_tuples; /* # dead, but not yet removable */ - int64 missed_dead_tuples; /* # removable, but not removed */ -} LVRelState; - -/* - * State returned by lazy_scan_prune() - */ -typedef struct LVPagePruneState -{ - bool hastup; /* Page prevents rel truncation? */ - bool has_lpdead_items; /* includes existing LP_DEAD items */ - - /* - * State describes the proper VM bit states to set for the page following - * pruning and freezing. all_visible implies !has_lpdead_items, but don't - * trust all_frozen result unless all_visible is also set to true. - */ - bool all_visible; /* Every item visible to all? */ - bool all_frozen; /* provided all_visible is also true */ - TransactionId visibility_cutoff_xid; /* For recovery conflicts */ -} LVPagePruneState; - -/* Struct for saving and restoring vacuum error information. 
*/ -typedef struct LVSavedErrInfo -{ - BlockNumber blkno; - OffsetNumber offnum; - VacErrPhase phase; -} LVSavedErrInfo; - - -/* non-export function prototypes */ -static void lazy_scan_heap(LVRelState *vacrel); -static BlockNumber lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, - BlockNumber next_block, - bool *next_unskippable_allvis, - bool *skipping_current_range); -static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, - BlockNumber blkno, Page page, - bool sharelock, Buffer vmbuffer); -static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, - BlockNumber blkno, Page page, - LVPagePruneState *prunestate); -static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf, - BlockNumber blkno, Page page, - bool *hastup, bool *recordfreespace); -static void lazy_vacuum(LVRelState *vacrel); -static bool lazy_vacuum_all_indexes(LVRelState *vacrel); -static void lazy_vacuum_tdeheap_rel(LVRelState *vacrel); -static int lazy_vacuum_tdeheap_page(LVRelState *vacrel, BlockNumber blkno, - Buffer buffer, int index, Buffer vmbuffer); -static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); -static void lazy_cleanup_all_indexes(LVRelState *vacrel); -static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, - IndexBulkDeleteResult *istat, - double reltuples, - LVRelState *vacrel); -static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel, - IndexBulkDeleteResult *istat, - double reltuples, - bool estimated_count, - LVRelState *vacrel); -static bool should_attempt_truncation(LVRelState *vacrel); -static void lazy_truncate_heap(LVRelState *vacrel); -static BlockNumber count_nondeletable_pages(LVRelState *vacrel, - bool *lock_waiter_detected); -static void dead_items_alloc(LVRelState *vacrel, int nworkers); -static void dead_items_cleanup(LVRelState *vacrel); -static bool tdeheap_page_is_all_visible(LVRelState *vacrel, Buffer buf, - TransactionId *visibility_cutoff_xid, bool *all_frozen); -static void update_relstats_all_indexes(LVRelState *vacrel); -static void vacuum_error_callback(void *arg); -static void update_vacuum_error_info(LVRelState *vacrel, - LVSavedErrInfo *saved_vacrel, - int phase, BlockNumber blkno, - OffsetNumber offnum); -static void restore_vacuum_error_info(LVRelState *vacrel, - const LVSavedErrInfo *saved_vacrel); - - -/* - * tdeheap_vacuum_rel() -- perform VACUUM for one heap relation - * - * This routine sets things up for and then calls lazy_scan_heap, where - * almost all work actually takes place. Finalizes everything after call - * returns by managing relation truncation and updating rel's pg_class - * entry. (Also updates pg_class entries for any indexes that need it.) - * - * At entry, we have already established a transaction and opened - * and locked the relation. 
- */ -void -tdeheap_vacuum_rel(Relation rel, VacuumParams *params, - BufferAccessStrategy bstrategy) -{ - LVRelState *vacrel; - bool verbose, - instrument, - skipwithvm, - frozenxid_updated, - minmulti_updated; - BlockNumber orig_rel_pages, - new_rel_pages, - new_rel_allvisible; - PGRUsage ru0; - TimestampTz starttime = 0; - PgStat_Counter startreadtime = 0, - startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - ErrorContextCallback errcallback; - char **indnames = NULL; - - verbose = (params->options & VACOPT_VERBOSE) != 0; - instrument = (verbose || (IsAutoVacuumWorkerProcess() && - params->log_min_duration >= 0)); - if (instrument) - { - pg_rusage_init(&ru0); - starttime = GetCurrentTimestamp(); - if (track_io_timing) - { - startreadtime = pgStatBlockReadTime; - startwritetime = pgStatBlockWriteTime; - } - } - - pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, - RelationGetRelid(rel)); - - /* - * Setup error traceback support for ereport() first. The idea is to set - * up an error context callback to display additional information on any - * error during a vacuum. During different phases of vacuum, we update - * the state so that the error context callback always display current - * information. - * - * Copy the names of heap rel into local memory for error reporting - * purposes, too. It isn't always safe to assume that we can get the name - * of each rel. It's convenient for code in lazy_scan_heap to always use - * these temp copies. - */ - vacrel = (LVRelState *) palloc0(sizeof(LVRelState)); - vacrel->dbname = get_database_name(MyDatabaseId); - vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel)); - vacrel->relname = pstrdup(RelationGetRelationName(rel)); - vacrel->indname = NULL; - vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN; - vacrel->verbose = verbose; - errcallback.callback = vacuum_error_callback; - errcallback.arg = vacrel; - errcallback.previous = error_context_stack; - error_context_stack = &errcallback; - - /* Set up high level stuff about rel and its indexes */ - vacrel->rel = rel; - vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, - &vacrel->indrels); - vacrel->bstrategy = bstrategy; - if (instrument && vacrel->nindexes > 0) - { - /* Copy index names used by instrumentation (not error reporting) */ - indnames = palloc(sizeof(char *) * vacrel->nindexes); - for (int i = 0; i < vacrel->nindexes; i++) - indnames[i] = pstrdup(RelationGetRelationName(vacrel->indrels[i])); - } - - /* - * The index_cleanup param either disables index vacuuming and cleanup or - * forces it to go ahead when we would otherwise apply the index bypass - * optimization. The default is 'auto', which leaves the final decision - * up to lazy_vacuum(). - * - * The truncate param allows user to avoid attempting relation truncation, - * though it can't force truncation to happen. - */ - Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED); - Assert(params->truncate != VACOPTVALUE_UNSPECIFIED && - params->truncate != VACOPTVALUE_AUTO); - - /* - * While VacuumFailSafeActive is reset to false before calling this, we - * still need to reset it here due to recursive calls. 
- */ - VacuumFailsafeActive = false; - vacrel->consider_bypass_optimization = true; - vacrel->do_index_vacuuming = true; - vacrel->do_index_cleanup = true; - vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED); - if (params->index_cleanup == VACOPTVALUE_DISABLED) - { - /* Force disable index vacuuming up-front */ - vacrel->do_index_vacuuming = false; - vacrel->do_index_cleanup = false; - } - else if (params->index_cleanup == VACOPTVALUE_ENABLED) - { - /* Force index vacuuming. Note that failsafe can still bypass. */ - vacrel->consider_bypass_optimization = false; - } - else - { - /* Default/auto, make all decisions dynamically */ - Assert(params->index_cleanup == VACOPTVALUE_AUTO); - } - - /* Initialize page counters explicitly (be tidy) */ - vacrel->scanned_pages = 0; - vacrel->removed_pages = 0; - vacrel->frozen_pages = 0; - vacrel->lpdead_item_pages = 0; - vacrel->missed_dead_pages = 0; - vacrel->nonempty_pages = 0; - /* dead_items_alloc allocates vacrel->dead_items later on */ - - /* Allocate/initialize output statistics state */ - vacrel->new_rel_tuples = 0; - vacrel->new_live_tuples = 0; - vacrel->indstats = (IndexBulkDeleteResult **) - palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *)); - - /* Initialize remaining counters (be tidy) */ - vacrel->num_index_scans = 0; - vacrel->tuples_deleted = 0; - vacrel->tuples_frozen = 0; - vacrel->lpdead_items = 0; - vacrel->live_tuples = 0; - vacrel->recently_dead_tuples = 0; - vacrel->missed_dead_tuples = 0; - - /* - * Get cutoffs that determine which deleted tuples are considered DEAD, - * not just RECENTLY_DEAD, and which XIDs/MXIDs to freeze. Then determine - * the extent of the blocks that we'll scan in lazy_scan_heap. It has to - * happen in this order to ensure that the OldestXmin cutoff field works - * as an upper bound on the XIDs stored in the pages we'll actually scan - * (NewRelfrozenXid tracking must never be allowed to miss unfrozen XIDs). - * - * Next acquire vistest, a related cutoff that's used in tdeheap_page_prune. - * We expect vistest will always make tdeheap_page_prune remove any deleted - * tuple whose xmax is < OldestXmin. lazy_scan_prune must never become - * confused about whether a tuple should be frozen or removed. (In the - * future we might want to teach lazy_scan_prune to recompute vistest from - * time to time, to increase the number of dead tuples it can prune away.) - */ - vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs); - vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); - vacrel->vistest = GlobalVisTestFor(rel); - /* Initialize state used to track oldest extant XID/MXID */ - vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; - vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact; - vacrel->skippedallvis = false; - skipwithvm = true; - if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) - { - /* - * Force aggressive mode, and disable skipping blocks using the - * visibility map (even those set all-frozen) - */ - vacrel->aggressive = true; - skipwithvm = false; - } - - vacrel->skipwithvm = skipwithvm; - - if (verbose) - { - if (vacrel->aggressive) - ereport(INFO, - (errmsg("aggressively vacuuming \"%s.%s.%s\"", - vacrel->dbname, vacrel->relnamespace, - vacrel->relname))); - else - ereport(INFO, - (errmsg("vacuuming \"%s.%s.%s\"", - vacrel->dbname, vacrel->relnamespace, - vacrel->relname))); - } - - /* - * Allocate dead_items array memory using dead_items_alloc. 
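The index_cleanup handling earlier in tdeheap_vacuum_rel is a tri-state: DISABLED switches index vacuuming and cleanup off up front, ENABLED keeps them but forgoes the bypass optimization (the failsafe can still skip them later), and AUTO leaves the decision to lazy_vacuum(). A standalone model of just that mapping, with invented Toy* names:

#include <stdbool.h>
#include <stdio.h>

/* Toy version of the tri-state option. */
typedef enum
{
    TOY_CLEANUP_AUTO,
    TOY_CLEANUP_ENABLED,
    TOY_CLEANUP_DISABLED
} ToyIndexCleanup;

typedef struct
{
    bool do_index_vacuuming;
    bool do_index_cleanup;
    bool consider_bypass_optimization;
} ToyVacFlags;

/* Mirror of the setup logic: DISABLED turns both off up front, ENABLED keeps
 * them on but forbids the bypass optimization, AUTO defers the decision. */
static ToyVacFlags
toy_resolve_index_cleanup(ToyIndexCleanup opt)
{
    ToyVacFlags flags = { true, true, true };

    if (opt == TOY_CLEANUP_DISABLED)
    {
        flags.do_index_vacuuming = false;
        flags.do_index_cleanup = false;
    }
    else if (opt == TOY_CLEANUP_ENABLED)
        flags.consider_bypass_optimization = false;

    return flags;
}

int
main(void)
{
    const char *names[] = {"auto", "enabled", "disabled"};

    for (int opt = TOY_CLEANUP_AUTO; opt <= TOY_CLEANUP_DISABLED; opt++)
    {
        ToyVacFlags f = toy_resolve_index_cleanup((ToyIndexCleanup) opt);

        printf("%-8s vacuuming=%d cleanup=%d bypass-allowed=%d\n",
               names[opt], f.do_index_vacuuming, f.do_index_cleanup,
               f.consider_bypass_optimization);
    }
    return 0;
}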
This handles - * parallel VACUUM initialization as part of allocating shared memory - * space used for dead_items. (But do a failsafe precheck first, to - * ensure that parallel VACUUM won't be attempted at all when relfrozenxid - * is already dangerously old.) - */ - lazy_check_wraparound_failsafe(vacrel); - dead_items_alloc(vacrel, params->nworkers); - - /* - * Call lazy_scan_heap to perform all required heap pruning, index - * vacuuming, and heap vacuuming (plus related processing) - */ - lazy_scan_heap(vacrel); - - /* - * Free resources managed by dead_items_alloc. This ends parallel mode in - * passing when necessary. - */ - dead_items_cleanup(vacrel); - Assert(!IsInParallelMode()); - - /* - * Update pg_class entries for each of rel's indexes where appropriate. - * - * Unlike the later update to rel's pg_class entry, this is not critical. - * Maintains relpages/reltuples statistics used by the planner only. - */ - if (vacrel->do_index_cleanup) - update_relstats_all_indexes(vacrel); - - /* Done with rel's indexes */ - vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock); - - /* Optionally truncate rel */ - if (should_attempt_truncation(vacrel)) - lazy_truncate_heap(vacrel); - - /* Pop the error context stack */ - error_context_stack = errcallback.previous; - - /* Report that we are now doing final cleanup */ - pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, - PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); - - /* - * Prepare to update rel's pg_class entry. - * - * Aggressive VACUUMs must always be able to advance relfrozenxid to a - * value >= FreezeLimit, and relminmxid to a value >= MultiXactCutoff. - * Non-aggressive VACUUMs may advance them by any amount, or not at all. - */ - Assert(vacrel->NewRelfrozenXid == vacrel->cutoffs.OldestXmin || - TransactionIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.FreezeLimit : - vacrel->cutoffs.relfrozenxid, - vacrel->NewRelfrozenXid)); - Assert(vacrel->NewRelminMxid == vacrel->cutoffs.OldestMxact || - MultiXactIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.MultiXactCutoff : - vacrel->cutoffs.relminmxid, - vacrel->NewRelminMxid)); - if (vacrel->skippedallvis) - { - /* - * Must keep original relfrozenxid in a non-aggressive VACUUM that - * chose to skip an all-visible page range. The state that tracks new - * values will have missed unfrozen XIDs from the pages we skipped. - */ - Assert(!vacrel->aggressive); - vacrel->NewRelfrozenXid = InvalidTransactionId; - vacrel->NewRelminMxid = InvalidMultiXactId; - } - - /* - * For safety, clamp relallvisible to be not more than what we're setting - * pg_class.relpages to - */ - new_rel_pages = vacrel->rel_pages; /* After possible rel truncation */ - tdeheap_visibilitymap_count(rel, &new_rel_allvisible, NULL); - if (new_rel_allvisible > new_rel_pages) - new_rel_allvisible = new_rel_pages; - - /* - * Now actually update rel's pg_class entry. - * - * In principle new_live_tuples could be -1 indicating that we (still) - * don't know the tuple count. In practice that can't happen, since we - * scan every page that isn't skipped using the visibility map. - */ - vac_update_relstats(rel, new_rel_pages, vacrel->new_live_tuples, - new_rel_allvisible, vacrel->nindexes > 0, - vacrel->NewRelfrozenXid, vacrel->NewRelminMxid, - &frozenxid_updated, &minmulti_updated, false); - - /* - * Report results to the cumulative stats system, too. - * - * Deliberately avoid telling the stats system about LP_DEAD items that - * remain in the table due to VACUUM bypassing index and heap vacuuming. 
- * ANALYZE will consider the remaining LP_DEAD items to be dead "tuples". - * It seems like a good idea to err on the side of not vacuuming again too - * soon in cases where the failsafe prevented significant amounts of heap - * vacuuming. - */ - pgstat_report_vacuum(RelationGetRelid(rel), - rel->rd_rel->relisshared, - Max(vacrel->new_live_tuples, 0), - vacrel->recently_dead_tuples + - vacrel->missed_dead_tuples); - pgstat_progress_end_command(); - - if (instrument) - { - TimestampTz endtime = GetCurrentTimestamp(); - - if (verbose || params->log_min_duration == 0 || - TimestampDifferenceExceeds(starttime, endtime, - params->log_min_duration)) - { - long secs_dur; - int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; - StringInfoData buf; - char *msgfmt; - int32 diff; - double read_rate = 0, - write_rate = 0; - - TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - - initStringInfo(&buf); - if (verbose) - { - /* - * Aggressiveness already reported earlier, in dedicated - * VACUUM VERBOSE ereport - */ - Assert(!params->is_wraparound); - msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n"); - } - else if (params->is_wraparound) - { - /* - * While it's possible for a VACUUM to be both is_wraparound - * and !aggressive, that's just a corner-case -- is_wraparound - * implies aggressive. Produce distinct output for the corner - * case all the same, just in case. - */ - if (vacrel->aggressive) - msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); - else - msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); - } - else - { - if (vacrel->aggressive) - msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n"); - else - msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"); - } - appendStringInfo(&buf, msgfmt, - vacrel->dbname, - vacrel->relnamespace, - vacrel->relname, - vacrel->num_index_scans); - appendStringInfo(&buf, _("pages: %u removed, %u remain, %u scanned (%.2f%% of total)\n"), - vacrel->removed_pages, - new_rel_pages, - vacrel->scanned_pages, - orig_rel_pages == 0 ? 
100.0 : - 100.0 * vacrel->scanned_pages / orig_rel_pages); - appendStringInfo(&buf, - _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable\n"), - (long long) vacrel->tuples_deleted, - (long long) vacrel->new_rel_tuples, - (long long) vacrel->recently_dead_tuples); - if (vacrel->missed_dead_tuples > 0) - appendStringInfo(&buf, - _("tuples missed: %lld dead from %u pages not removed due to cleanup lock contention\n"), - (long long) vacrel->missed_dead_tuples, - vacrel->missed_dead_pages); - diff = (int32) (ReadNextTransactionId() - - vacrel->cutoffs.OldestXmin); - appendStringInfo(&buf, - _("removable cutoff: %u, which was %d XIDs old when operation ended\n"), - vacrel->cutoffs.OldestXmin, diff); - if (frozenxid_updated) - { - diff = (int32) (vacrel->NewRelfrozenXid - - vacrel->cutoffs.relfrozenxid); - appendStringInfo(&buf, - _("new relfrozenxid: %u, which is %d XIDs ahead of previous value\n"), - vacrel->NewRelfrozenXid, diff); - } - if (minmulti_updated) - { - diff = (int32) (vacrel->NewRelminMxid - - vacrel->cutoffs.relminmxid); - appendStringInfo(&buf, - _("new relminmxid: %u, which is %d MXIDs ahead of previous value\n"), - vacrel->NewRelminMxid, diff); - } - appendStringInfo(&buf, _("frozen: %u pages from table (%.2f%% of total) had %lld tuples frozen\n"), - vacrel->frozen_pages, - orig_rel_pages == 0 ? 100.0 : - 100.0 * vacrel->frozen_pages / orig_rel_pages, - (long long) vacrel->tuples_frozen); - if (vacrel->do_index_vacuuming) - { - if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0) - appendStringInfoString(&buf, _("index scan not needed: ")); - else - appendStringInfoString(&buf, _("index scan needed: ")); - - msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n"); - } - else - { - if (!VacuumFailsafeActive) - appendStringInfoString(&buf, _("index scan bypassed: ")); - else - appendStringInfoString(&buf, _("index scan bypassed by failsafe: ")); - - msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n"); - } - appendStringInfo(&buf, msgfmt, - vacrel->lpdead_item_pages, - orig_rel_pages == 0 ? 
100.0 : - 100.0 * vacrel->lpdead_item_pages / orig_rel_pages, - (long long) vacrel->lpdead_items); - for (int i = 0; i < vacrel->nindexes; i++) - { - IndexBulkDeleteResult *istat = vacrel->indstats[i]; - - if (!istat) - continue; - - appendStringInfo(&buf, - _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"), - indnames[i], - istat->num_pages, - istat->pages_newly_deleted, - istat->pages_deleted, - istat->pages_free); - } - if (track_io_timing) - { - double read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000; - double write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000; - - appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"), - read_ms, write_ms); - } - if (secs_dur > 0 || usecs_dur > 0) - { - read_rate = (double) BLCKSZ * (bufferusage.shared_blks_read + bufferusage.local_blks_read) / - (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0); - write_rate = (double) BLCKSZ * (bufferusage.shared_blks_dirtied + bufferusage.local_blks_dirtied) / - (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0); - } - appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), - read_rate, write_rate); - appendStringInfo(&buf, - _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"), - (long long) (bufferusage.shared_blks_hit + bufferusage.local_blks_hit), - (long long) (bufferusage.shared_blks_read + bufferusage.local_blks_read), - (long long) (bufferusage.shared_blks_dirtied + bufferusage.local_blks_dirtied)); - appendStringInfo(&buf, - _("WAL usage: %lld records, %lld full page images, %llu bytes\n"), - (long long) walusage.wal_records, - (long long) walusage.wal_fpi, - (unsigned long long) walusage.wal_bytes); - appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); - - ereport(verbose ? INFO : LOG, - (errmsg_internal("%s", buf.data))); - pfree(buf.data); - } - } - - /* Cleanup index statistics and index names */ - for (int i = 0; i < vacrel->nindexes; i++) - { - if (vacrel->indstats[i]) - pfree(vacrel->indstats[i]); - - if (instrument) - pfree(indnames[i]); - } -} - -/* - * lazy_scan_heap() -- workhorse function for VACUUM - * - * This routine prunes each page in the heap, and considers the need to - * freeze remaining tuples with storage (not including pages that can be - * skipped using the visibility map). Also performs related maintenance - * of the FSM and visibility map. These steps all take place during an - * initial pass over the target heap relation. - * - * Also invokes lazy_vacuum_all_indexes to vacuum indexes, which largely - * consists of deleting index tuples that point to LP_DEAD items left in - * heap pages following pruning. Earlier initial pass over the heap will - * have collected the TIDs whose index tuples need to be removed. - * - * Finally, invokes lazy_vacuum_tdeheap_rel to vacuum heap pages, which - * largely consists of marking LP_DEAD items (from collected TID array) - * as LP_UNUSED. This has to happen in a second, final pass over the - * heap, to preserve a basic invariant that all index AMs rely on: no - * extant index tuple can ever be allowed to contain a TID that points to - * an LP_UNUSED line pointer in the heap. We must disallow premature - * recycling of line pointers to avoid index scans that get confused - * about which TID points to which tuple immediately after recycling. 
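/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The "avg read rate"/"avg write rate" figures in the log output above are
 * just (blocks x block size) converted to MiB and divided by the elapsed
 * time.  Standalone version of that arithmetic, with an assumed 8 kB block
 * size standing in for BLCKSZ:
 */
#include <stdint.h>

#define MOCK_BLCKSZ 8192        /* assumed; the common default BLCKSZ */

static double
mock_io_rate_mb_per_sec(int64_t blocks, long secs, int usecs)
{
    double elapsed = secs + usecs / 1000000.0;

    if (elapsed <= 0)
        return 0.0;             /* the real code only computes rates when time elapsed */
    return (double) MOCK_BLCKSZ * blocks / (1024 * 1024) / elapsed;
}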
- * (Actually, this isn't a concern when target heap relation happens to - * have no indexes, which allows us to safely apply the one-pass strategy - * as an optimization). - * - * In practice we often have enough space to fit all TIDs, and so won't - * need to call lazy_vacuum more than once, after our initial pass over - * the heap has totally finished. Otherwise things are slightly more - * complicated: our "initial pass" over the heap applies only to those - * pages that were pruned before we needed to call lazy_vacuum, and our - * "final pass" over the heap only vacuums these same heap pages. - * However, we process indexes in full every time lazy_vacuum is called, - * which makes index processing very inefficient when memory is in short - * supply. - */ -static void -lazy_scan_heap(LVRelState *vacrel) -{ - BlockNumber rel_pages = vacrel->rel_pages, - blkno, - next_unskippable_block, - next_fsm_block_to_vacuum = 0; - VacDeadItems *dead_items = vacrel->dead_items; - Buffer vmbuffer = InvalidBuffer; - bool next_unskippable_allvis, - skipping_current_range; - const int initprog_index[] = { - PROGRESS_VACUUM_PHASE, - PROGRESS_VACUUM_TOTAL_HEAP_BLKS, - PROGRESS_VACUUM_MAX_DEAD_TUPLES - }; - int64 initprog_val[3]; - - /* Report that we're scanning the heap, advertising total # of blocks */ - initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; - initprog_val[1] = rel_pages; - initprog_val[2] = dead_items->max_items; - pgstat_progress_update_multi_param(3, initprog_index, initprog_val); - - /* Set up an initial range of skippable blocks using the visibility map */ - next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, - &next_unskippable_allvis, - &skipping_current_range); - for (blkno = 0; blkno < rel_pages; blkno++) - { - Buffer buf; - Page page; - bool all_visible_according_to_vm; - LVPagePruneState prunestate; - - if (blkno == next_unskippable_block) - { - /* - * Can't skip this page safely. Must scan the page. But - * determine the next skippable range after the page first. - */ - all_visible_according_to_vm = next_unskippable_allvis; - next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, - blkno + 1, - &next_unskippable_allvis, - &skipping_current_range); - - Assert(next_unskippable_block >= blkno + 1); - } - else - { - /* Last page always scanned (may need to set nonempty_pages) */ - Assert(blkno < rel_pages - 1); - - if (skipping_current_range) - continue; - - /* Current range is too small to skip -- just scan the page */ - all_visible_according_to_vm = true; - } - - vacrel->scanned_pages++; - - /* Report as block scanned, update error traceback information */ - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); - update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP, - blkno, InvalidOffsetNumber); - - vacuum_delay_point(); - - /* - * Regularly check if wraparound failsafe should trigger. - * - * There is a similar check inside lazy_vacuum_all_indexes(), but - * relfrozenxid might start to look dangerously old before we reach - * that point. This check also provides failsafe coverage for the - * one-pass strategy, and the two-pass strategy with the index_cleanup - * param set to 'off'. - */ - if (vacrel->scanned_pages % FAILSAFE_EVERY_PAGES == 0) - lazy_check_wraparound_failsafe(vacrel); - - /* - * Consider if we definitely have enough space to process TIDs on page - * already. If we are close to overrunning the available space for - * dead_items TIDs, pause and do a cycle of vacuuming before we tackle - * this page. 
- */ - Assert(dead_items->max_items >= MaxHeapTuplesPerPage); - if (dead_items->max_items - dead_items->num_items < MaxHeapTuplesPerPage) - { - /* - * Before beginning index vacuuming, we release any pin we may - * hold on the visibility map page. This isn't necessary for - * correctness, but we do it anyway to avoid holding the pin - * across a lengthy, unrelated operation. - */ - if (BufferIsValid(vmbuffer)) - { - ReleaseBuffer(vmbuffer); - vmbuffer = InvalidBuffer; - } - - /* Perform a round of index and heap vacuuming */ - vacrel->consider_bypass_optimization = false; - lazy_vacuum(vacrel); - - /* - * Vacuum the Free Space Map to make newly-freed space visible on - * upper-level FSM pages. Note we have not yet processed blkno. - */ - FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, - blkno); - next_fsm_block_to_vacuum = blkno; - - /* Report that we are once again scanning the heap */ - pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, - PROGRESS_VACUUM_PHASE_SCAN_HEAP); - } - - /* - * Pin the visibility map page in case we need to mark the page - * all-visible. In most cases this will be very cheap, because we'll - * already have the correct page pinned anyway. - */ - tdeheap_visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); - - /* - * We need a buffer cleanup lock to prune HOT chains and defragment - * the page in lazy_scan_prune. But when it's not possible to acquire - * a cleanup lock right away, we may be able to settle for reduced - * processing using lazy_scan_noprune. - */ - buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, - vacrel->bstrategy); - page = BufferGetPage(buf); - if (!ConditionalLockBufferForCleanup(buf)) - { - bool hastup, - recordfreespace; - - LockBuffer(buf, BUFFER_LOCK_SHARE); - - /* Check for new or empty pages before lazy_scan_noprune call */ - if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, true, - vmbuffer)) - { - /* Processed as new/empty page (lock and pin released) */ - continue; - } - - /* Collect LP_DEAD items in dead_items array, count tuples */ - if (lazy_scan_noprune(vacrel, buf, blkno, page, &hastup, - &recordfreespace)) - { - Size freespace = 0; - - /* - * Processed page successfully (without cleanup lock) -- just - * need to perform rel truncation and FSM steps, much like the - * lazy_scan_prune case. Don't bother trying to match its - * visibility map setting steps, though. - */ - if (hastup) - vacrel->nonempty_pages = blkno + 1; - if (recordfreespace) - freespace = PageGetHeapFreeSpace(page); - UnlockReleaseBuffer(buf); - if (recordfreespace) - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); - continue; - } - - /* - * lazy_scan_noprune could not do all required processing. Wait - * for a cleanup lock, and call lazy_scan_prune in the usual way. - */ - Assert(vacrel->aggressive); - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockBufferForCleanup(buf); - } - - /* Check for new or empty pages before lazy_scan_prune call */ - if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, false, vmbuffer)) - { - /* Processed as new/empty page (lock and pin released) */ - continue; - } - - /* - * Prune, freeze, and count tuples. - * - * Accumulates details of remaining LP_DEAD line pointers on page in - * dead_items array. This includes LP_DEAD line pointers that we - * pruned ourselves, as well as existing LP_DEAD line pointers that - * were pruned some time earlier. Also considers freezing XIDs in the - * tuple headers of remaining items with storage. 
- */ - lazy_scan_prune(vacrel, buf, blkno, page, &prunestate); - - Assert(!prunestate.all_visible || !prunestate.has_lpdead_items); - - /* Remember the location of the last page with nonremovable tuples */ - if (prunestate.hastup) - vacrel->nonempty_pages = blkno + 1; - - if (vacrel->nindexes == 0) - { - /* - * Consider the need to do page-at-a-time heap vacuuming when - * using the one-pass strategy now. - * - * The one-pass strategy will never call lazy_vacuum(). The steps - * performed here can be thought of as the one-pass equivalent of - * a call to lazy_vacuum(). - */ - if (prunestate.has_lpdead_items) - { - Size freespace; - - lazy_vacuum_tdeheap_page(vacrel, blkno, buf, 0, vmbuffer); - - /* Forget the LP_DEAD items that we just vacuumed */ - dead_items->num_items = 0; - - /* - * Periodically perform FSM vacuuming to make newly-freed - * space visible on upper FSM pages. Note we have not yet - * performed FSM processing for blkno. - */ - if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) - { - FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, - blkno); - next_fsm_block_to_vacuum = blkno; - } - - /* - * Now perform FSM processing for blkno, and move on to next - * page. - * - * Our call to lazy_vacuum_tdeheap_page() will have considered if - * it's possible to set all_visible/all_frozen independently - * of lazy_scan_prune(). Note that prunestate was invalidated - * by lazy_vacuum_tdeheap_page() call. - */ - freespace = PageGetHeapFreeSpace(page); - - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); - continue; - } - - /* - * There was no call to lazy_vacuum_tdeheap_page() because pruning - * didn't encounter/create any LP_DEAD items that needed to be - * vacuumed. Prune state has not been invalidated, so proceed - * with prunestate-driven visibility map and FSM steps (just like - * the two-pass strategy). - */ - Assert(dead_items->num_items == 0); - } - - /* - * Handle setting visibility map bit based on information from the VM - * (as of last lazy_scan_skip() call), and from prunestate - */ - if (!all_visible_according_to_vm && prunestate.all_visible) - { - uint8 flags = VISIBILITYMAP_ALL_VISIBLE; - - if (prunestate.all_frozen) - { - Assert(!TransactionIdIsValid(prunestate.visibility_cutoff_xid)); - flags |= VISIBILITYMAP_ALL_FROZEN; - } - - /* - * It should never be the case that the visibility map page is set - * while the page-level bit is clear, but the reverse is allowed - * (if checksums are not enabled). Regardless, set both bits so - * that we get back in sync. - * - * NB: If the heap page is all-visible but the VM bit is not set, - * we don't need to dirty the heap page. However, if checksums - * are enabled, we do need to make sure that the heap page is - * dirtied before passing it to tdeheap_visibilitymap_set(), because it - * may be logged. Given that this situation should only happen in - * rare cases after a crash, it is not worth optimizing. - */ - PageSetAllVisible(page); - MarkBufferDirty(buf); - tdeheap_visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, prunestate.visibility_cutoff_xid, - flags); - } - - /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if - * the page-level bit is clear. However, it's possible that the bit - * got cleared after lazy_scan_skip() was called, so we must recheck - * with buffer lock before concluding that the VM is corrupt. 
- */ - else if (all_visible_according_to_vm && !PageIsAllVisible(page) && - tdeheap_visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) - { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); - tdeheap_visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } - - /* - * It's possible for the value returned by - * GetOldestNonRemovableTransactionId() to move backwards, so it's not - * wrong for us to see tuples that appear to not be visible to - * everyone yet, while PD_ALL_VISIBLE is already set. The real safe - * xmin value never moves backwards, but - * GetOldestNonRemovableTransactionId() is conservative and sometimes - * returns a value that's unnecessarily small, so if we see that - * contradiction it just means that the tuples that we think are not - * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag - * is correct. - * - * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE - * set, however. - */ - else if (prunestate.has_lpdead_items && PageIsAllVisible(page)) - { - elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - vacrel->relname, blkno); - PageClearAllVisible(page); - MarkBufferDirty(buf); - tdeheap_visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } - - /* - * If the all-visible page is all-frozen but not marked as such yet, - * mark it as all-frozen. Note that all_frozen is only valid if - * all_visible is true, so we must check both prunestate fields. - */ - else if (all_visible_according_to_vm && prunestate.all_visible && - prunestate.all_frozen && - !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) - { - /* - * Avoid relying on all_visible_according_to_vm as a proxy for the - * page-level PD_ALL_VISIBLE bit being set, since it might have - * become stale -- even when all_visible is set in prunestate - */ - if (!PageIsAllVisible(page)) - { - PageSetAllVisible(page); - MarkBufferDirty(buf); - } - - /* - * Set the page all-frozen (and all-visible) in the VM. - * - * We can pass InvalidTransactionId as our visibility_cutoff_xid, - * since a snapshotConflictHorizon sufficient to make everything - * safe for REDO was logged when the page's tuples were frozen. - */ - Assert(!TransactionIdIsValid(prunestate.visibility_cutoff_xid)); - tdeheap_visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | - VISIBILITYMAP_ALL_FROZEN); - } - - /* - * Final steps for block: drop cleanup lock, record free space in the - * FSM - */ - if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming) - { - /* - * Wait until lazy_vacuum_tdeheap_rel() to save free space. This - * doesn't just save us some cycles; it also allows us to record - * any additional free space that lazy_vacuum_tdeheap_page() will - * make available in cases where it's possible to truncate the - * page's line pointer array. - * - * Note: It's not in fact 100% certain that we really will call - * lazy_vacuum_tdeheap_rel() -- lazy_vacuum() might yet opt to skip - * index vacuuming (and so must skip heap vacuuming). This is - * deemed okay because it only happens in emergencies, or when - * there is very little free space anyway. (Besides, we start - * recording free space in the FSM once index vacuuming has been - * abandoned.) 
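/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The "final steps for block" logic above reduces to one rule: defer FSM
 * reporting for a page only when it still has LP_DEAD items *and* index
 * vacuuming (and therefore a second heap pass) is still planned; otherwise
 * record its free space immediately.  Standalone restatement:
 */
#include <stdbool.h>

static bool
mock_record_freespace_now(bool has_lpdead_items, bool do_index_vacuuming)
{
    /*
     * Pages with LP_DEAD items are revisited by the second heap pass, which
     * may free additional space by truncating the line pointer array, so
     * their free space is reported then rather than now.
     */
    return !(has_lpdead_items && do_index_vacuuming);
}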
- * - * Note: The one-pass (no indexes) case is only supposed to make - * it this far when there were no LP_DEAD items during pruning. - */ - Assert(vacrel->nindexes > 0); - UnlockReleaseBuffer(buf); - } - else - { - Size freespace = PageGetHeapFreeSpace(page); - - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); - } - } - - vacrel->blkno = InvalidBlockNumber; - if (BufferIsValid(vmbuffer)) - ReleaseBuffer(vmbuffer); - - /* report that everything is now scanned */ - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); - - /* now we can compute the new value for pg_class.reltuples */ - vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages, - vacrel->scanned_pages, - vacrel->live_tuples); - - /* - * Also compute the total number of surviving heap entries. In the - * (unlikely) scenario that new_live_tuples is -1, take it as zero. - */ - vacrel->new_rel_tuples = - Max(vacrel->new_live_tuples, 0) + vacrel->recently_dead_tuples + - vacrel->missed_dead_tuples; - - /* - * Do index vacuuming (call each index's ambulkdelete routine), then do - * related heap vacuuming - */ - if (dead_items->num_items > 0) - lazy_vacuum(vacrel); - - /* - * Vacuum the remainder of the Free Space Map. We must do this whether or - * not there were indexes, and whether or not we bypassed index vacuuming. - */ - if (blkno > next_fsm_block_to_vacuum) - FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno); - - /* report all blocks vacuumed */ - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); - - /* Do final index cleanup (call each index's amvacuumcleanup routine) */ - if (vacrel->nindexes > 0 && vacrel->do_index_cleanup) - lazy_cleanup_all_indexes(vacrel); -} - -/* - * lazy_scan_skip() -- set up range of skippable blocks using visibility map. - * - * lazy_scan_heap() calls here every time it needs to set up a new range of - * blocks to skip via the visibility map. Caller passes the next block in - * line. We return a next_unskippable_block for this range. When there are - * no skippable blocks we just return caller's next_block. The all-visible - * status of the returned block is set in *next_unskippable_allvis for caller, - * too. Block usually won't be all-visible (since it's unskippable), but it - * can be during aggressive VACUUMs (as well as in certain edge cases). - * - * Sets *skipping_current_range to indicate if caller should skip this range. - * Costs and benefits drive our decision. Very small ranges won't be skipped. - * - * Note: our opinion of which blocks can be skipped can go stale immediately. - * It's okay if caller "misses" a page whose all-visible or all-frozen marking - * was concurrently cleared, though. All that matters is that caller scan all - * pages whose tuples might contain XIDs < OldestXmin, or MXIDs < OldestMxact. - * (Actually, non-aggressive VACUUMs can choose to skip all-visible pages with - * older XIDs/MXIDs. The vacrel->skippedallvis flag will be set here when the - * choice to skip such a range is actually made, making everything safe.) 
- */ -static BlockNumber -lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block, - bool *next_unskippable_allvis, bool *skipping_current_range) -{ - BlockNumber rel_pages = vacrel->rel_pages, - next_unskippable_block = next_block, - nskippable_blocks = 0; - bool skipsallvis = false; - - *next_unskippable_allvis = true; - while (next_unskippable_block < rel_pages) - { - uint8 mapbits = tdeheap_visibilitymap_get_status(vacrel->rel, - next_unskippable_block, - vmbuffer); - - if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0) - { - Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0); - *next_unskippable_allvis = false; - break; - } - - /* - * Caller must scan the last page to determine whether it has tuples - * (caller must have the opportunity to set vacrel->nonempty_pages). - * This rule avoids having lazy_truncate_heap() take access-exclusive - * lock on rel to attempt a truncation that fails anyway, just because - * there are tuples on the last page (it is likely that there will be - * tuples on other nearby pages as well, but those can be skipped). - * - * Implement this by always treating the last block as unsafe to skip. - */ - if (next_unskippable_block == rel_pages - 1) - break; - - /* DISABLE_PAGE_SKIPPING makes all skipping unsafe */ - if (!vacrel->skipwithvm) - break; - - /* - * Aggressive VACUUM caller can't skip pages just because they are - * all-visible. They may still skip all-frozen pages, which can't - * contain XIDs < OldestXmin (XIDs that aren't already frozen by now). - */ - if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0) - { - if (vacrel->aggressive) - break; - - /* - * All-visible block is safe to skip in non-aggressive case. But - * remember that the final range contains such a block for later. - */ - skipsallvis = true; - } - - vacuum_delay_point(); - next_unskippable_block++; - nskippable_blocks++; - } - - /* - * We only skip a range with at least SKIP_PAGES_THRESHOLD consecutive - * pages. Since we're reading sequentially, the OS should be doing - * readahead for us, so there's no gain in skipping a page now and then. - * Skipping such a range might even discourage sequential detection. - * - * This test also enables more frequent relfrozenxid advancement during - * non-aggressive VACUUMs. If the range has any all-visible pages then - * skipping makes updating relfrozenxid unsafe, which is a real downside. - */ - if (nskippable_blocks < SKIP_PAGES_THRESHOLD) - *skipping_current_range = false; - else - { - *skipping_current_range = true; - if (skipsallvis) - vacrel->skippedallvis = true; - } - - return next_unskippable_block; -} - -/* - * lazy_scan_new_or_empty() -- lazy_scan_heap() new/empty page handling. - * - * Must call here to handle both new and empty pages before calling - * lazy_scan_prune or lazy_scan_noprune, since they're not prepared to deal - * with new or empty pages. - * - * It's necessary to consider new pages as a special case, since the rules for - * maintaining the visibility map and FSM with empty pages are a little - * different (though new pages can be truncated away during rel truncation). - * - * Empty pages are not really a special case -- they're just heap pages that - * have no allocated tuples (including even LP_UNUSED items). You might - * wonder why we need to handle them here all the same. It's only necessary - * because of a corner-case involving a hard crash during heap relation - * extension. 
If we ever make relation-extension crash safe, then it should - * no longer be necessary to deal with empty pages here (or new pages, for - * that matter). - * - * Caller must hold at least a shared lock. We might need to escalate the - * lock in that case, so the type of lock caller holds needs to be specified - * using 'sharelock' argument. - * - * Returns false in common case where caller should go on to call - * lazy_scan_prune (or lazy_scan_noprune). Otherwise returns true, indicating - * that lazy_scan_heap is done processing the page, releasing lock on caller's - * behalf. - */ -static bool -lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, - Page page, bool sharelock, Buffer vmbuffer) -{ - Size freespace; - - if (PageIsNew(page)) - { - /* - * All-zeroes pages can be left over if either a backend extends the - * relation by a single page, but crashes before the newly initialized - * page has been written out, or when bulk-extending the relation - * (which creates a number of empty pages at the tail end of the - * relation), and then enters them into the FSM. - * - * Note we do not enter the page into the visibilitymap. That has the - * downside that we repeatedly visit this page in subsequent vacuums, - * but otherwise we'll never discover the space on a promoted standby. - * The harm of repeated checking ought to normally not be too bad. The - * space usually should be used at some point, otherwise there - * wouldn't be any regular vacuums. - * - * Make sure these pages are in the FSM, to ensure they can be reused. - * Do that by testing if there's any space recorded for the page. If - * not, enter it. We do so after releasing the lock on the heap page, - * the FSM is approximate, after all. - */ - UnlockReleaseBuffer(buf); - - if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0) - { - freespace = BLCKSZ - SizeOfPageHeaderData; - - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); - } - - return true; - } - - if (PageIsEmpty(page)) - { - /* - * It seems likely that caller will always be able to get a cleanup - * lock on an empty page. But don't take any chances -- escalate to - * an exclusive lock (still don't need a cleanup lock, though). - */ - if (sharelock) - { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - - if (!PageIsEmpty(page)) - { - /* page isn't new or empty -- keep lock and pin for now */ - return false; - } - } - else - { - /* Already have a full cleanup lock (which is more than enough) */ - } - - /* - * Unlike new pages, empty pages are always set all-visible and - * all-frozen. - */ - if (!PageIsAllVisible(page)) - { - START_CRIT_SECTION(); - - /* mark buffer dirty before writing a WAL record */ - MarkBufferDirty(buf); - - /* - * It's possible that another backend has extended the heap, - * initialized the page, and then failed to WAL-log the page due - * to an ERROR. Since heap extension is not WAL-logged, recovery - * might try to replay our record setting the page all-visible and - * find that the page isn't initialized, which will cause a PANIC. - * To prevent that, check whether the page has been previously - * WAL-logged, and if not, do that now. 
- */ - if (RelationNeedsWAL(vacrel->rel) && - PageGetLSN(page) == InvalidXLogRecPtr) - log_newpage_buffer(buf, true); - - PageSetAllVisible(page); - tdeheap_visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); - END_CRIT_SECTION(); - } - - freespace = PageGetHeapFreeSpace(page); - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); - return true; - } - - /* page isn't new or empty -- keep lock and pin */ - return false; -} - -/* - * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing. - * - * Caller must hold pin and buffer cleanup lock on the buffer. - * - * Prior to PostgreSQL 14 there were very rare cases where tdeheap_page_prune() - * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about - * whether or not a tuple should be considered DEAD. This happened when an - * inserting transaction concurrently aborted (after our tdeheap_page_prune() - * call, before our HeapTupleSatisfiesVacuum() call). There was rather a lot - * of complexity just so we could deal with tuples that were DEAD to VACUUM, - * but nevertheless were left with storage after pruning. - * - * The approach we take now is to restart pruning when the race condition is - * detected. This allows tdeheap_page_prune() to prune the tuples inserted by - * the now-aborted transaction. This is a little crude, but it guarantees - * that any items that make it into the dead_items array are simple LP_DEAD - * line pointers, and that every remaining item with tuple storage is - * considered as a candidate for freezing. - */ -static void -lazy_scan_prune(LVRelState *vacrel, - Buffer buf, - BlockNumber blkno, - Page page, - LVPagePruneState *prunestate) -{ - Relation rel = vacrel->rel; - OffsetNumber offnum, - maxoff; - ItemId itemid; - HeapTupleData tuple; - HTSV_Result res; - int tuples_deleted, - tuples_frozen, - lpdead_items, - live_tuples, - recently_dead_tuples; - int nnewlpdead; - HeapPageFreeze pagefrz; - int64 fpi_before = pgWalUsage.wal_fpi; - OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; - HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; - - Assert(BufferGetBlockNumber(buf) == blkno); - - /* - * maxoff might be reduced following line pointer array truncation in - * tdeheap_page_prune. That's safe for us to ignore, since the reclaimed - * space will continue to look like LP_UNUSED items below. - */ - maxoff = PageGetMaxOffsetNumber(page); - -retry: - - /* Initialize (or reset) page-level state */ - pagefrz.freeze_required = false; - pagefrz.FreezePageRelfrozenXid = vacrel->NewRelfrozenXid; - pagefrz.FreezePageRelminMxid = vacrel->NewRelminMxid; - pagefrz.NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid; - pagefrz.NoFreezePageRelminMxid = vacrel->NewRelminMxid; - tuples_deleted = 0; - tuples_frozen = 0; - lpdead_items = 0; - live_tuples = 0; - recently_dead_tuples = 0; - - /* - * Prune all HOT-update chains in this page. - * - * We count tuples removed by the pruning step as tuples_deleted. Its - * final value can be thought of as the number of tuples that have been - * deleted from the table. It should not be confused with lpdead_items; - * lpdead_items's final value can be thought of as the number of tuples - * that were deleted from indexes. 
- */ - tuples_deleted = tdeheap_page_prune(rel, buf, vacrel->vistest, - InvalidTransactionId, 0, &nnewlpdead, - &vacrel->offnum); - - /* - * Now scan the page to collect LP_DEAD items and check for tuples - * requiring freezing among remaining tuples with storage - */ - prunestate->hastup = false; - prunestate->has_lpdead_items = false; - prunestate->all_visible = true; - prunestate->all_frozen = true; - prunestate->visibility_cutoff_xid = InvalidTransactionId; - - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - bool totally_frozen; - - /* - * Set the offset number so that we can display it along with any - * error that occurred while processing this tuple. - */ - vacrel->offnum = offnum; - itemid = PageGetItemId(page, offnum); - - if (!ItemIdIsUsed(itemid)) - continue; - - /* Redirect items mustn't be touched */ - if (ItemIdIsRedirected(itemid)) - { - /* page makes rel truncation unsafe */ - prunestate->hastup = true; - continue; - } - - if (ItemIdIsDead(itemid)) - { - /* - * Deliberately don't set hastup for LP_DEAD items. We make the - * soft assumption that any LP_DEAD items encountered here will - * become LP_UNUSED later on, before count_nondeletable_pages is - * reached. If we don't make this assumption then rel truncation - * will only happen every other VACUUM, at most. Besides, VACUUM - * must treat hastup/nonempty_pages as provisional no matter how - * LP_DEAD items are handled (handled here, or handled later on). - * - * Also deliberately delay unsetting all_visible until just before - * we return to lazy_scan_heap caller, as explained in full below. - * (This is another case where it's useful to anticipate that any - * LP_DEAD items will become LP_UNUSED during the ongoing VACUUM.) - */ - deadoffsets[lpdead_items++] = offnum; - continue; - } - - Assert(ItemIdIsNormal(itemid)); - - ItemPointerSet(&(tuple.t_self), blkno, offnum); - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(rel); - - /* - * DEAD tuples are almost always pruned into LP_DEAD line pointers by - * tdeheap_page_prune(), but it's possible that the tuple state changed - * since tdeheap_page_prune() looked. Handle that here by restarting. - * (See comments at the top of function for a full explanation.) - */ - res = HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, - buf); - - if (unlikely(res == HEAPTUPLE_DEAD)) - goto retry; - - /* - * The criteria for counting a tuple as live in this block need to - * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM - * and ANALYZE may produce wildly different reltuples values, e.g. - * when there are many recently-dead tuples. - * - * The logic here is a bit simpler than acquire_sample_rows(), as - * VACUUM can't run inside a transaction block, which makes some cases - * impossible (e.g. in-progress insert from the same transaction). - * - * We treat LP_DEAD items (which are the closest thing to DEAD tuples - * that might be seen here) differently, too: we assume that they'll - * become LP_UNUSED before VACUUM finishes. This difference is only - * superficial. VACUUM effectively agrees with ANALYZE about DEAD - * items, in the end. VACUUM won't remember LP_DEAD items, but only - * because they're not supposed to be left behind when it is done. - * (Cases where we bypass index vacuuming will violate this optimistic - * assumption, but the overall impact of that should be negligible.) 
- */ - switch (res) - { - case HEAPTUPLE_LIVE: - - /* - * Count it as live. Not only is this natural, but it's also - * what acquire_sample_rows() does. - */ - live_tuples++; - - /* - * Is the tuple definitely visible to all transactions? - * - * NB: Like with per-tuple hint bits, we can't set the - * PD_ALL_VISIBLE flag if the inserter committed - * asynchronously. See SetHintBits for more info. Check that - * the tuple is hinted xmin-committed because of that. - */ - if (prunestate->all_visible) - { - TransactionId xmin; - - if (!HeapTupleHeaderXminCommitted(tuple.t_data)) - { - prunestate->all_visible = false; - break; - } - - /* - * The inserter definitely committed. But is it old enough - * that everyone sees it as committed? - */ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, - vacrel->cutoffs.OldestXmin)) - { - prunestate->all_visible = false; - break; - } - - /* Track newest xmin on page. */ - if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid) && - TransactionIdIsNormal(xmin)) - prunestate->visibility_cutoff_xid = xmin; - } - break; - case HEAPTUPLE_RECENTLY_DEAD: - - /* - * If tuple is recently dead then we must not remove it from - * the relation. (We only remove items that are LP_DEAD from - * pruning.) - */ - recently_dead_tuples++; - prunestate->all_visible = false; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - - /* - * We do not count these rows as live, because we expect the - * inserting transaction to update the counters at commit, and - * we assume that will happen only after we report our - * results. This assumption is a bit shaky, but it is what - * acquire_sample_rows() does, so be consistent. - */ - prunestate->all_visible = false; - break; - case HEAPTUPLE_DELETE_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ - prunestate->all_visible = false; - - /* - * Count such rows as live. As above, we assume the deleting - * transaction will commit and update the counters after we - * report. - */ - live_tuples++; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - - prunestate->hastup = true; /* page makes rel truncation unsafe */ - - /* Tuple with storage -- consider need to freeze */ - if (tdeheap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs, &pagefrz, - &frozen[tuples_frozen], &totally_frozen)) - { - /* Save prepared freeze plan for later */ - frozen[tuples_frozen++].offset = offnum; - } - - /* - * If any tuple isn't either totally frozen already or eligible to - * become totally frozen (according to its freeze plan), then the page - * definitely cannot be set all-frozen in the visibility map later on - */ - if (!totally_frozen) - prunestate->all_frozen = false; - } - - /* - * We have now divided every item on the page into either an LP_DEAD item - * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple - * that remains and needs to be considered for freezing now (LP_UNUSED and - * LP_REDIRECT items also remain, but are of no further interest to us). - */ - vacrel->offnum = InvalidOffsetNumber; - - /* - * Freeze the page when tdeheap_prepare_freeze_tuple indicates that at least - * one XID/MXID from before FreezeLimit/MultiXactCutoff is present. Also - * freeze when pruning generated an FPI, if doing so means that we set the - * page all-frozen afterwards (might not happen until final heap pass). 
- */ - if (pagefrz.freeze_required || tuples_frozen == 0 || - (prunestate->all_visible && prunestate->all_frozen && - fpi_before != pgWalUsage.wal_fpi)) - { - /* - * We're freezing the page. Our final NewRelfrozenXid doesn't need to - * be affected by the XIDs that are just about to be frozen anyway. - */ - vacrel->NewRelfrozenXid = pagefrz.FreezePageRelfrozenXid; - vacrel->NewRelminMxid = pagefrz.FreezePageRelminMxid; - - if (tuples_frozen == 0) - { - /* - * We have no freeze plans to execute, so there's no added cost - * from following the freeze path. That's why it was chosen. This - * is important in the case where the page only contains totally - * frozen tuples at this point (perhaps only following pruning). - * Such pages can be marked all-frozen in the VM by our caller, - * even though none of its tuples were newly frozen here (note - * that the "no freeze" path never sets pages all-frozen). - * - * We never increment the frozen_pages instrumentation counter - * here, since it only counts pages with newly frozen tuples - * (don't confuse that with pages newly set all-frozen in VM). - */ - } - else - { - TransactionId snapshotConflictHorizon; - - vacrel->frozen_pages++; - - /* - * We can use visibility_cutoff_xid as our cutoff for conflicts - * when the whole page is eligible to become all-frozen in the VM - * once we're done with it. Otherwise we generate a conservative - * cutoff by stepping back from OldestXmin. - */ - if (prunestate->all_visible && prunestate->all_frozen) - { - /* Using same cutoff when setting VM is now unnecessary */ - snapshotConflictHorizon = prunestate->visibility_cutoff_xid; - prunestate->visibility_cutoff_xid = InvalidTransactionId; - } - else - { - /* Avoids false conflicts when hot_standby_feedback in use */ - snapshotConflictHorizon = vacrel->cutoffs.OldestXmin; - TransactionIdRetreat(snapshotConflictHorizon); - } - - /* Execute all freeze plans for page as a single atomic action */ - tdeheap_freeze_execute_prepared(vacrel->rel, buf, - snapshotConflictHorizon, - frozen, tuples_frozen); - } - } - else - { - /* - * Page requires "no freeze" processing. It might be set all-visible - * in the visibility map, but it can never be set all-frozen. - */ - vacrel->NewRelfrozenXid = pagefrz.NoFreezePageRelfrozenXid; - vacrel->NewRelminMxid = pagefrz.NoFreezePageRelminMxid; - prunestate->all_frozen = false; - tuples_frozen = 0; /* avoid miscounts in instrumentation */ - } - - /* - * VACUUM will call tdeheap_page_is_all_visible() during the second pass over - * the heap to determine all_visible and all_frozen for the page -- this - * is a specialized version of the logic from this function. Now that - * we've finished pruning and freezing, make sure that we're in total - * agreement with tdeheap_page_is_all_visible() using an assertion. 
- */ -#ifdef USE_ASSERT_CHECKING - /* Note that all_frozen value does not matter when !all_visible */ - if (prunestate->all_visible && lpdead_items == 0) - { - TransactionId cutoff; - bool all_frozen; - - if (!tdeheap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen)) - Assert(false); - - Assert(!TransactionIdIsValid(cutoff) || - cutoff == prunestate->visibility_cutoff_xid); - } -#endif - - /* - * Now save details of the LP_DEAD items from the page in vacrel - */ - if (lpdead_items > 0) - { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; - - vacrel->lpdead_item_pages++; - prunestate->has_lpdead_items = true; - - ItemPointerSetBlockNumber(&tmp, blkno); - - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } - - Assert(dead_items->num_items <= dead_items->max_items); - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); - - /* - * It was convenient to ignore LP_DEAD items in all_visible earlier on - * to make the choice of whether or not to freeze the page unaffected - * by the short-term presence of LP_DEAD items. These LP_DEAD items - * were effectively assumed to be LP_UNUSED items in the making. It - * doesn't matter which heap pass (initial pass or final pass) ends up - * setting the page all-frozen, as long as the ongoing VACUUM does it. - * - * Now that freezing has been finalized, unset all_visible. It needs - * to reflect the present state of things, as expected by our caller. - */ - prunestate->all_visible = false; - } - - /* Finally, add page-local counts to whole-VACUUM counts */ - vacrel->tuples_deleted += tuples_deleted; - vacrel->tuples_frozen += tuples_frozen; - vacrel->lpdead_items += lpdead_items; - vacrel->live_tuples += live_tuples; - vacrel->recently_dead_tuples += recently_dead_tuples; -} - -/* - * lazy_scan_noprune() -- lazy_scan_prune() without pruning or freezing - * - * Caller need only hold a pin and share lock on the buffer, unlike - * lazy_scan_prune, which requires a full cleanup lock. While pruning isn't - * performed here, it's quite possible that an earlier opportunistic pruning - * operation left LP_DEAD items behind. We'll at least collect any such items - * in the dead_items array for removal from indexes. - * - * For aggressive VACUUM callers, we may return false to indicate that a full - * cleanup lock is required for processing by lazy_scan_prune. This is only - * necessary when the aggressive VACUUM needs to freeze some tuple XIDs from - * one or more tuples on the page. We always return true for non-aggressive - * callers. - * - * See lazy_scan_prune for an explanation of hastup return flag. - * recordfreespace flag instructs caller on whether or not it should do - * generic FSM processing for page. 
- */ -static bool -lazy_scan_noprune(LVRelState *vacrel, - Buffer buf, - BlockNumber blkno, - Page page, - bool *hastup, - bool *recordfreespace) -{ - OffsetNumber offnum, - maxoff; - int lpdead_items, - live_tuples, - recently_dead_tuples, - missed_dead_tuples; - HeapTupleHeader tupleheader; - TransactionId NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid; - MultiXactId NoFreezePageRelminMxid = vacrel->NewRelminMxid; - OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; - - Assert(BufferGetBlockNumber(buf) == blkno); - - *hastup = false; /* for now */ - *recordfreespace = false; /* for now */ - - lpdead_items = 0; - live_tuples = 0; - recently_dead_tuples = 0; - missed_dead_tuples = 0; - - maxoff = PageGetMaxOffsetNumber(page); - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid; - HeapTupleData tuple; - - vacrel->offnum = offnum; - itemid = PageGetItemId(page, offnum); - - if (!ItemIdIsUsed(itemid)) - continue; - - if (ItemIdIsRedirected(itemid)) - { - *hastup = true; - continue; - } - - if (ItemIdIsDead(itemid)) - { - /* - * Deliberately don't set hastup=true here. See same point in - * lazy_scan_prune for an explanation. - */ - deadoffsets[lpdead_items++] = offnum; - continue; - } - - *hastup = true; /* page prevents rel truncation */ - tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); - if (tdeheap_tuple_should_freeze(tupleheader, &vacrel->cutoffs, - &NoFreezePageRelfrozenXid, - &NoFreezePageRelminMxid)) - { - /* Tuple with XID < FreezeLimit (or MXID < MultiXactCutoff) */ - if (vacrel->aggressive) - { - /* - * Aggressive VACUUMs must always be able to advance rel's - * relfrozenxid to a value >= FreezeLimit (and be able to - * advance rel's relminmxid to a value >= MultiXactCutoff). - * The ongoing aggressive VACUUM won't be able to do that - * unless it can freeze an XID (or MXID) from this tuple now. - * - * The only safe option is to have caller perform processing - * of this page using lazy_scan_prune. Caller might have to - * wait a while for a cleanup lock, but it can't be helped. - */ - vacrel->offnum = InvalidOffsetNumber; - return false; - } - - /* - * Non-aggressive VACUUMs are under no obligation to advance - * relfrozenxid (even by one XID). We can be much laxer here. - * - * Currently we always just accept an older final relfrozenxid - * and/or relminmxid value. We never make caller wait or work a - * little harder, even when it likely makes sense to do so. - */ - } - - ItemPointerSet(&(tuple.t_self), blkno, offnum); - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(vacrel->rel); - - switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, - buf)) - { - case HEAPTUPLE_DELETE_IN_PROGRESS: - case HEAPTUPLE_LIVE: - - /* - * Count both cases as live, just like lazy_scan_prune - */ - live_tuples++; - - break; - case HEAPTUPLE_DEAD: - - /* - * There is some useful work for pruning to do, that won't be - * done due to failure to get a cleanup lock. 
- */ - missed_dead_tuples++; - break; - case HEAPTUPLE_RECENTLY_DEAD: - - /* - * Count in recently_dead_tuples, just like lazy_scan_prune - */ - recently_dead_tuples++; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - - /* - * Do not count these rows as live, just like lazy_scan_prune - */ - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - } - - vacrel->offnum = InvalidOffsetNumber; - - /* - * By here we know for sure that caller can put off freezing and pruning - * this particular page until the next VACUUM. Remember its details now. - * (lazy_scan_prune expects a clean slate, so we have to do this last.) - */ - vacrel->NewRelfrozenXid = NoFreezePageRelfrozenXid; - vacrel->NewRelminMxid = NoFreezePageRelminMxid; - - /* Save any LP_DEAD items found on the page in dead_items array */ - if (vacrel->nindexes == 0) - { - /* Using one-pass strategy (since table has no indexes) */ - if (lpdead_items > 0) - { - /* - * Perfunctory handling for the corner case where a single pass - * strategy VACUUM cannot get a cleanup lock, and it turns out - * that there is one or more LP_DEAD items: just count the LP_DEAD - * items as missed_dead_tuples instead. (This is a bit dishonest, - * but it beats having to maintain specialized heap vacuuming code - * forever, for vanishingly little benefit.) - */ - *hastup = true; - missed_dead_tuples += lpdead_items; - } - - *recordfreespace = true; - } - else if (lpdead_items == 0) - { - /* - * Won't be vacuuming this page later, so record page's freespace in - * the FSM now - */ - *recordfreespace = true; - } - else - { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; - - /* - * Page has LP_DEAD items, and so any references/TIDs that remain in - * indexes will be deleted during index vacuuming (and then marked - * LP_UNUSED in the heap) - */ - vacrel->lpdead_item_pages++; - - ItemPointerSetBlockNumber(&tmp, blkno); - - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } - - Assert(dead_items->num_items <= dead_items->max_items); - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); - - vacrel->lpdead_items += lpdead_items; - - /* - * Assume that we'll go on to vacuum this heap page during final pass - * over the heap. Don't record free space until then. - */ - *recordfreespace = false; - } - - /* - * Finally, add relevant page-local counts to whole-VACUUM counts - */ - vacrel->live_tuples += live_tuples; - vacrel->recently_dead_tuples += recently_dead_tuples; - vacrel->missed_dead_tuples += missed_dead_tuples; - if (missed_dead_tuples > 0) - vacrel->missed_dead_pages++; - - /* Caller won't need to call lazy_scan_prune with same page */ - return true; -} - -/* - * Main entry point for index vacuuming and heap vacuuming. - * - * Removes items collected in dead_items from table's indexes, then marks the - * same items LP_UNUSED in the heap. See the comments above lazy_scan_heap - * for full details. - * - * Also empties dead_items, freeing up space for later TIDs. - * - * We may choose to bypass index vacuuming at this point, though only when the - * ongoing VACUUM operation will definitely only have one index scan/round of - * index vacuuming. 
- */ -static void -lazy_vacuum(LVRelState *vacrel) -{ - bool bypass; - - /* Should not end up here with no indexes */ - Assert(vacrel->nindexes > 0); - Assert(vacrel->lpdead_item_pages > 0); - - if (!vacrel->do_index_vacuuming) - { - Assert(!vacrel->do_index_cleanup); - vacrel->dead_items->num_items = 0; - return; - } - - /* - * Consider bypassing index vacuuming (and heap vacuuming) entirely. - * - * We currently only do this in cases where the number of LP_DEAD items - * for the entire VACUUM operation is close to zero. This avoids sharp - * discontinuities in the duration and overhead of successive VACUUM - * operations that run against the same table with a fixed workload. - * Ideally, successive VACUUM operations will behave as if there are - * exactly zero LP_DEAD items in cases where there are close to zero. - * - * This is likely to be helpful with a table that is continually affected - * by UPDATEs that can mostly apply the HOT optimization, but occasionally - * have small aberrations that lead to just a few heap pages retaining - * only one or two LP_DEAD items. This is pretty common; even when the - * DBA goes out of their way to make UPDATEs use HOT, it is practically - * impossible to predict whether HOT will be applied in 100% of cases. - * It's far easier to ensure that 99%+ of all UPDATEs against a table use - * HOT through careful tuning. - */ - bypass = false; - if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0) - { - BlockNumber threshold; - - Assert(vacrel->num_index_scans == 0); - Assert(vacrel->lpdead_items == vacrel->dead_items->num_items); - Assert(vacrel->do_index_vacuuming); - Assert(vacrel->do_index_cleanup); - - /* - * This crossover point at which we'll start to do index vacuuming is - * expressed as a percentage of the total number of heap pages in the - * table that are known to have at least one LP_DEAD item. This is - * much more important than the total number of LP_DEAD items, since - * it's a proxy for the number of heap pages whose visibility map bits - * cannot be set on account of bypassing index and heap vacuuming. - * - * We apply one further precautionary test: the space currently used - * to store the TIDs (TIDs that now all point to LP_DEAD items) must - * not exceed 32MB. This limits the risk that we will bypass index - * vacuuming again and again until eventually there is a VACUUM whose - * dead_items space is not CPU cache resident. - * - * We don't take any special steps to remember the LP_DEAD items (such - * as counting them in our final update to the stats system) when the - * optimization is applied. Though the accounting used in analyze.c's - * acquire_sample_rows() will recognize the same LP_DEAD items as dead - * rows in its own stats report, that's okay. The discrepancy should - * be negligible. If this optimization is ever expanded to cover more - * cases then this may need to be reconsidered. - */ - threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES; - bypass = (vacrel->lpdead_item_pages < threshold && - vacrel->lpdead_items < MAXDEADITEMS(32L * 1024L * 1024L)); - } - - if (bypass) - { - /* - * There are almost zero TIDs. Behave as if there were precisely - * zero: bypass index vacuuming, but do index cleanup. - * - * We expect that the ongoing VACUUM operation will finish very - * quickly, so there is no point in considering speeding up as a - * failsafe against wraparound failure. (Index cleanup is expected to - * finish very quickly in cases where there were no ambulkdelete() - * calls.) 
- */ - vacrel->do_index_vacuuming = false; - } - else if (lazy_vacuum_all_indexes(vacrel)) - { - /* - * We successfully completed a round of index vacuuming. Do related - * heap vacuuming now. - */ - lazy_vacuum_tdeheap_rel(vacrel); - } - else - { - /* - * Failsafe case. - * - * We attempted index vacuuming, but didn't finish a full round/full - * index scan. This happens when relfrozenxid or relminmxid is too - * far in the past. - * - * From this point on the VACUUM operation will do no further index - * vacuuming or heap vacuuming. This VACUUM operation won't end up - * back here again. - */ - Assert(VacuumFailsafeActive); - } - - /* - * Forget the LP_DEAD items that we just vacuumed (or just decided to not - * vacuum) - */ - vacrel->dead_items->num_items = 0; -} - -/* - * lazy_vacuum_all_indexes() -- Main entry for index vacuuming - * - * Returns true in the common case when all indexes were successfully - * vacuumed. Returns false in rare cases where we determined that the ongoing - * VACUUM operation is at risk of taking too long to finish, leading to - * wraparound failure. - */ -static bool -lazy_vacuum_all_indexes(LVRelState *vacrel) -{ - bool allindexes = true; - double old_live_tuples = vacrel->rel->rd_rel->reltuples; - - Assert(vacrel->nindexes > 0); - Assert(vacrel->do_index_vacuuming); - Assert(vacrel->do_index_cleanup); - - /* Precheck for XID wraparound emergencies */ - if (lazy_check_wraparound_failsafe(vacrel)) - { - /* Wraparound emergency -- don't even start an index scan */ - return false; - } - - /* Report that we are now vacuuming indexes */ - pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, - PROGRESS_VACUUM_PHASE_VACUUM_INDEX); - - if (!ParallelVacuumIsActive(vacrel)) - { - for (int idx = 0; idx < vacrel->nindexes; idx++) - { - Relation indrel = vacrel->indrels[idx]; - IndexBulkDeleteResult *istat = vacrel->indstats[idx]; - - vacrel->indstats[idx] = lazy_vacuum_one_index(indrel, istat, - old_live_tuples, - vacrel); - - if (lazy_check_wraparound_failsafe(vacrel)) - { - /* Wraparound emergency -- end current index scan */ - allindexes = false; - break; - } - } - } - else - { - /* Outsource everything to parallel variant */ - parallel_vacuum_bulkdel_all_indexes(vacrel->pvs, old_live_tuples, - vacrel->num_index_scans); - - /* - * Do a postcheck to consider applying wraparound failsafe now. Note - * that parallel VACUUM only gets the precheck and this postcheck. - */ - if (lazy_check_wraparound_failsafe(vacrel)) - allindexes = false; - } - - /* - * We delete all LP_DEAD items from the first heap pass in all indexes on - * each call here (except calls where we choose to do the failsafe). This - * makes the next call to lazy_vacuum_tdeheap_rel() safe (except in the event - * of the failsafe triggering, which prevents the next call from taking - * place). - */ - Assert(vacrel->num_index_scans > 0 || - vacrel->dead_items->num_items == vacrel->lpdead_items); - Assert(allindexes || VacuumFailsafeActive); - - /* - * Increase and report the number of index scans. - * - * We deliberately include the case where we started a round of bulk - * deletes that we weren't able to finish due to the failsafe triggering. - */ - vacrel->num_index_scans++; - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS, - vacrel->num_index_scans); - - return allindexes; -} - -/* - * lazy_vacuum_tdeheap_rel() -- second pass over the heap for two pass strategy - * - * This routine marks LP_DEAD items in vacrel->dead_items array as LP_UNUSED. 
- * Pages that never had lazy_scan_prune record LP_DEAD items are not visited - * at all. - * - * We may also be able to truncate the line pointer array of the heap pages we - * visit. If there is a contiguous group of LP_UNUSED items at the end of the - * array, it can be reclaimed as free space. These LP_UNUSED items usually - * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from - * each page to LP_UNUSED, and then consider if it's possible to truncate the - * page's line pointer array). - * - * Note: the reason for doing this as a second pass is we cannot remove the - * tuples until we've removed their index entries, and we want to process - * index entry removal in batches as large as possible. - */ -static void -lazy_vacuum_tdeheap_rel(LVRelState *vacrel) -{ - int index = 0; - BlockNumber vacuumed_pages = 0; - Buffer vmbuffer = InvalidBuffer; - LVSavedErrInfo saved_err_info; - - Assert(vacrel->do_index_vacuuming); - Assert(vacrel->do_index_cleanup); - Assert(vacrel->num_index_scans > 0); - - /* Report that we are now vacuuming the heap */ - pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, - PROGRESS_VACUUM_PHASE_VACUUM_HEAP); - - /* Update error traceback information */ - update_vacuum_error_info(vacrel, &saved_err_info, - VACUUM_ERRCB_PHASE_VACUUM_HEAP, - InvalidBlockNumber, InvalidOffsetNumber); - - while (index < vacrel->dead_items->num_items) - { - BlockNumber blkno; - Buffer buf; - Page page; - Size freespace; - - vacuum_delay_point(); - - blkno = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); - vacrel->blkno = blkno; - - /* - * Pin the visibility map page in case we need to mark the page - * all-visible. In most cases this will be very cheap, because we'll - * already have the correct page pinned anyway. - */ - tdeheap_visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); - - /* We need a non-cleanup exclusive lock to mark dead_items unused */ - buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, - vacrel->bstrategy); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - index = lazy_vacuum_tdeheap_page(vacrel, blkno, buf, index, vmbuffer); - - /* Now that we've vacuumed the page, record its available space */ - page = BufferGetPage(buf); - freespace = PageGetHeapFreeSpace(page); - - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); - vacuumed_pages++; - } - - vacrel->blkno = InvalidBlockNumber; - if (BufferIsValid(vmbuffer)) - ReleaseBuffer(vmbuffer); - - /* - * We set all LP_DEAD items from the first heap pass to LP_UNUSED during - * the second heap pass. No more, no less. - */ - Assert(index > 0); - Assert(vacrel->num_index_scans > 1 || - (index == vacrel->lpdead_items && - vacuumed_pages == vacrel->lpdead_item_pages)); - - ereport(DEBUG2, - (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages", - vacrel->relname, (long long) index, vacuumed_pages))); - - /* Revert to the previous phase information for error traceback */ - restore_vacuum_error_info(vacrel, &saved_err_info); -} - -/* - * lazy_vacuum_tdeheap_page() -- free page's LP_DEAD items listed in the - * vacrel->dead_items array. - * - * Caller must have an exclusive buffer lock on the buffer (though a full - * cleanup lock is also acceptable). vmbuffer must be valid and already have - * a pin on blkno's visibility map page. - * - * index is an offset into the vacrel->dead_items array for the first listed - * LP_DEAD item on the page. 
The return value is the first index immediately - * after all LP_DEAD items for the same page in the array. - */ -static int -lazy_vacuum_tdeheap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, - int index, Buffer vmbuffer) -{ - VacDeadItems *dead_items = vacrel->dead_items; - Page page = BufferGetPage(buffer); - OffsetNumber unused[MaxHeapTuplesPerPage]; - int nunused = 0; - TransactionId visibility_cutoff_xid; - bool all_frozen; - LVSavedErrInfo saved_err_info; - - Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming); - - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); - - /* Update error traceback information */ - update_vacuum_error_info(vacrel, &saved_err_info, - VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno, - InvalidOffsetNumber); - - START_CRIT_SECTION(); - - for (; index < dead_items->num_items; index++) - { - BlockNumber tblk; - OffsetNumber toff; - ItemId itemid; - - tblk = ItemPointerGetBlockNumber(&dead_items->items[index]); - if (tblk != blkno) - break; /* past end of tuples for this block */ - toff = ItemPointerGetOffsetNumber(&dead_items->items[index]); - itemid = PageGetItemId(page, toff); - - Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid)); - ItemIdSetUnused(itemid); - unused[nunused++] = toff; - } - - Assert(nunused > 0); - - /* Attempt to truncate line pointer array now */ - PageTruncateLinePointerArray(page); - - /* - * Mark buffer dirty before we write WAL. - */ - MarkBufferDirty(buffer); - - /* XLOG stuff */ - if (RelationNeedsWAL(vacrel->rel)) - { - xl_tdeheap_vacuum xlrec; - XLogRecPtr recptr; - - xlrec.nunused = nunused; - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum); - - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) unused, nunused * sizeof(OffsetNumber)); - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM); - - PageSetLSN(page, recptr); - } - - /* - * End critical section, so we safely can do visibility tests (which - * possibly need to perform IO and allocate memory!). If we crash now the - * page (including the corresponding vm bit) might not be marked all - * visible, but that's fine. A later vacuum will fix that. - */ - END_CRIT_SECTION(); - - /* - * Now that we have removed the LP_DEAD items from the page, once again - * check if the page has become all-visible. The page is already marked - * dirty, exclusively locked, and, if needed, a full page image has been - * emitted. - */ - Assert(!PageIsAllVisible(page)); - if (tdeheap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid, - &all_frozen)) - { - uint8 flags = VISIBILITYMAP_ALL_VISIBLE; - - if (all_frozen) - { - Assert(!TransactionIdIsValid(visibility_cutoff_xid)); - flags |= VISIBILITYMAP_ALL_FROZEN; - } - - PageSetAllVisible(page); - tdeheap_visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr, - vmbuffer, visibility_cutoff_xid, flags); - } - - /* Revert to the previous phase information for error traceback */ - restore_vacuum_error_info(vacrel, &saved_err_info); - return index; -} - -/* - * Trigger the failsafe to avoid wraparound failure when vacrel table has a - * relfrozenxid and/or relminmxid that is dangerously far in the past. - * Triggering the failsafe makes the ongoing VACUUM bypass any further index - * vacuuming and heap vacuuming. Truncating the heap is also bypassed. - * - * Any remaining work (work that VACUUM cannot just bypass) is typically sped - * up when the failsafe triggers. VACUUM stops applying any cost-based delay - * that it started out with. 
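lazy_vacuum_tdeheap_page() relies on the dead_items array being sorted in TID order: it consumes the contiguous run of entries for one block and hands back the index of the first entry belonging to the next block. A small sketch of that consumption pattern, with a hypothetical flattened TID type standing in for ItemPointerData:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for ItemPointerData: TIDs sorted by (block, offset). */
typedef struct
{
    uint32_t block;
    uint16_t offset;
} DeadTid;

/*
 * Process every entry for 'blkno' starting at 'index' and return the first
 * index past that block, the way lazy_vacuum_tdeheap_page() walks
 * vacrel->dead_items.
 */
static int
vacuum_one_block(const DeadTid *items, int nitems, int index, uint32_t blkno)
{
    for (; index < nitems && items[index].block == blkno; index++)
    {
        /* the real code marks the line pointer LP_UNUSED here */
        printf("block %u: offset %u -> LP_UNUSED\n",
               (unsigned) blkno, (unsigned) items[index].offset);
    }
    return index;
}

int
main(void)
{
    DeadTid items[] = {{7, 2}, {7, 5}, {9, 1}};
    int     index = 0;

    index = vacuum_one_block(items, 3, index, 7);   /* handles two entries   */
    index = vacuum_one_block(items, 3, index, 9);   /* handles the remaining */
    return 0;
}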
- * - * Returns true when failsafe has been triggered. - */ -static bool -lazy_check_wraparound_failsafe(LVRelState *vacrel) -{ - /* Don't warn more than once per VACUUM */ - if (VacuumFailsafeActive) - return true; - - if (unlikely(vacuum_xid_failsafe_check(&vacrel->cutoffs))) - { - VacuumFailsafeActive = true; - - /* - * Abandon use of a buffer access strategy to allow use of all of - * shared buffers. We assume the caller who allocated the memory for - * the BufferAccessStrategy will free it. - */ - vacrel->bstrategy = NULL; - - /* Disable index vacuuming, index cleanup, and heap rel truncation */ - vacrel->do_index_vacuuming = false; - vacrel->do_index_cleanup = false; - vacrel->do_rel_truncate = false; - - ereport(WARNING, - (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans", - vacrel->dbname, vacrel->relnamespace, vacrel->relname, - vacrel->num_index_scans), - errdetail("The table's relfrozenxid or relminmxid is too far in the past."), - errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n" - "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs."))); - - /* Stop applying cost limits from this point on */ - VacuumCostActive = false; - VacuumCostBalance = 0; - - return true; - } - - return false; -} - -/* - * lazy_cleanup_all_indexes() -- cleanup all indexes of relation. - */ -static void -lazy_cleanup_all_indexes(LVRelState *vacrel) -{ - double reltuples = vacrel->new_rel_tuples; - bool estimated_count = vacrel->scanned_pages < vacrel->rel_pages; - - Assert(vacrel->do_index_cleanup); - Assert(vacrel->nindexes > 0); - - /* Report that we are now cleaning up indexes */ - pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, - PROGRESS_VACUUM_PHASE_INDEX_CLEANUP); - - if (!ParallelVacuumIsActive(vacrel)) - { - for (int idx = 0; idx < vacrel->nindexes; idx++) - { - Relation indrel = vacrel->indrels[idx]; - IndexBulkDeleteResult *istat = vacrel->indstats[idx]; - - vacrel->indstats[idx] = - lazy_cleanup_one_index(indrel, istat, reltuples, - estimated_count, vacrel); - } - } - else - { - /* Outsource everything to parallel variant */ - parallel_vacuum_cleanup_all_indexes(vacrel->pvs, reltuples, - vacrel->num_index_scans, - estimated_count); - } -} - -/* - * lazy_vacuum_one_index() -- vacuum index relation. - * - * Delete all the index tuples containing a TID collected in - * vacrel->dead_items array. Also update running statistics. - * Exact details depend on index AM's ambulkdelete routine. - * - * reltuples is the number of heap tuples to be passed to the - * bulkdelete callback. It's always assumed to be estimated. - * See indexam.sgml for more info. - * - * Returns bulk delete stats derived from input stats - */ -static IndexBulkDeleteResult * -lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat, - double reltuples, LVRelState *vacrel) -{ - IndexVacuumInfo ivinfo; - LVSavedErrInfo saved_err_info; - - ivinfo.index = indrel; - ivinfo.heaprel = vacrel->rel; - ivinfo.analyze_only = false; - ivinfo.report_progress = false; - ivinfo.estimated_count = true; - ivinfo.message_level = DEBUG2; - ivinfo.num_heap_tuples = reltuples; - ivinfo.strategy = vacrel->bstrategy; - - /* - * Update error traceback information. - * - * The index name is saved during this phase and restored immediately - * after this phase. See vacuum_error_callback. 
- */ - Assert(vacrel->indname == NULL); - vacrel->indname = pstrdup(RelationGetRelationName(indrel)); - update_vacuum_error_info(vacrel, &saved_err_info, - VACUUM_ERRCB_PHASE_VACUUM_INDEX, - InvalidBlockNumber, InvalidOffsetNumber); - - /* Do bulk deletion */ - istat = vac_bulkdel_one_index(&ivinfo, istat, (void *) vacrel->dead_items); - - /* Revert to the previous phase information for error traceback */ - restore_vacuum_error_info(vacrel, &saved_err_info); - pfree(vacrel->indname); - vacrel->indname = NULL; - - return istat; -} - -/* - * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation. - * - * Calls index AM's amvacuumcleanup routine. reltuples is the number - * of heap tuples and estimated_count is true if reltuples is an - * estimated value. See indexam.sgml for more info. - * - * Returns bulk delete stats derived from input stats - */ -static IndexBulkDeleteResult * -lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat, - double reltuples, bool estimated_count, - LVRelState *vacrel) -{ - IndexVacuumInfo ivinfo; - LVSavedErrInfo saved_err_info; - - ivinfo.index = indrel; - ivinfo.heaprel = vacrel->rel; - ivinfo.analyze_only = false; - ivinfo.report_progress = false; - ivinfo.estimated_count = estimated_count; - ivinfo.message_level = DEBUG2; - - ivinfo.num_heap_tuples = reltuples; - ivinfo.strategy = vacrel->bstrategy; - - /* - * Update error traceback information. - * - * The index name is saved during this phase and restored immediately - * after this phase. See vacuum_error_callback. - */ - Assert(vacrel->indname == NULL); - vacrel->indname = pstrdup(RelationGetRelationName(indrel)); - update_vacuum_error_info(vacrel, &saved_err_info, - VACUUM_ERRCB_PHASE_INDEX_CLEANUP, - InvalidBlockNumber, InvalidOffsetNumber); - - istat = vac_cleanup_one_index(&ivinfo, istat); - - /* Revert to the previous phase information for error traceback */ - restore_vacuum_error_info(vacrel, &saved_err_info); - pfree(vacrel->indname); - vacrel->indname = NULL; - - return istat; -} - -/* - * should_attempt_truncation - should we attempt to truncate the heap? - * - * Don't even think about it unless we have a shot at releasing a goodly - * number of pages. Otherwise, the time taken isn't worth it, mainly because - * an AccessExclusive lock must be replayed on any hot standby, where it can - * be particularly disruptive. - * - * Also don't attempt it if wraparound failsafe is in effect. The entire - * system might be refusing to allocate new XIDs at this point. The system - * definitely won't return to normal unless and until VACUUM actually advances - * the oldest relfrozenxid -- which hasn't happened for target rel just yet. - * If lazy_truncate_heap attempted to acquire an AccessExclusiveLock to - * truncate the table under these circumstances, an XID exhaustion error might - * make it impossible for VACUUM to fix the underlying XID exhaustion problem. - * There is very little chance of truncation working out when the failsafe is - * in effect in any case. lazy_scan_prune makes the optimistic assumption - * that any LP_DEAD items it encounters will always be LP_UNUSED by the time - * we're called. - * - * Also don't attempt it if we are doing early pruning/vacuuming, because a - * scan which cannot find a truncated heap page cannot determine that the - * snapshot is too old to read that page. 
- */ -static bool -should_attempt_truncation(LVRelState *vacrel) -{ - BlockNumber possibly_freeable; - - if (!vacrel->do_rel_truncate || VacuumFailsafeActive || - old_snapshot_threshold >= 0) - return false; - - possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; - if (possibly_freeable > 0 && - (possibly_freeable >= REL_TRUNCATE_MINIMUM || - possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION)) - return true; - - return false; -} - -/* - * lazy_truncate_heap - try to truncate off any empty pages at the end - */ -static void -lazy_truncate_heap(LVRelState *vacrel) -{ - BlockNumber orig_rel_pages = vacrel->rel_pages; - BlockNumber new_rel_pages; - bool lock_waiter_detected; - int lock_retry; - - /* Report that we are now truncating */ - pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, - PROGRESS_VACUUM_PHASE_TRUNCATE); - - /* Update error traceback information one last time */ - update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE, - vacrel->nonempty_pages, InvalidOffsetNumber); - - /* - * Loop until no more truncating can be done. - */ - do - { - /* - * We need full exclusive lock on the relation in order to do - * truncation. If we can't get it, give up rather than waiting --- we - * don't want to block other backends, and we don't want to deadlock - * (which is quite possible considering we already hold a lower-grade - * lock). - */ - lock_waiter_detected = false; - lock_retry = 0; - while (true) - { - if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock)) - break; - - /* - * Check for interrupts while trying to (re-)acquire the exclusive - * lock. - */ - CHECK_FOR_INTERRUPTS(); - - if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / - VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) - { - /* - * We failed to establish the lock in the specified number of - * retries. This means we give up truncating. - */ - ereport(vacrel->verbose ? INFO : DEBUG2, - (errmsg("\"%s\": stopping truncate due to conflicting lock request", - vacrel->relname))); - return; - } - - (void) WaitLatch(MyLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL, - WAIT_EVENT_VACUUM_TRUNCATE); - ResetLatch(MyLatch); - } - - /* - * Now that we have exclusive lock, look to see if the rel has grown - * whilst we were vacuuming with non-exclusive lock. If so, give up; - * the newly added pages presumably contain non-deletable tuples. - */ - new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel); - if (new_rel_pages != orig_rel_pages) - { - /* - * Note: we intentionally don't update vacrel->rel_pages with the - * new rel size here. If we did, it would amount to assuming that - * the new pages are empty, which is unlikely. Leaving the numbers - * alone amounts to assuming that the new pages have the same - * tuple density as existing ones, which is less unlikely. - */ - UnlockRelation(vacrel->rel, AccessExclusiveLock); - return; - } - - /* - * Scan backwards from the end to verify that the end pages actually - * contain no tuples. This is *necessary*, not optional, because - * other backends could have added tuples to these pages whilst we - * were vacuuming. - */ - new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected); - vacrel->blkno = new_rel_pages; - - if (new_rel_pages >= orig_rel_pages) - { - /* can't do anything after all */ - UnlockRelation(vacrel->rel, AccessExclusiveLock); - return; - } - - /* - * Okay to truncate. 
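The "goodly number of pages" test in should_attempt_truncation() is just an absolute-or-relative threshold on the empty tail. A sketch under the assumption that the constants keep their upstream values (REL_TRUNCATE_MINIMUM = 1000, REL_TRUNCATE_FRACTION = 16):

#include <stdbool.h>
#include <stdint.h>

/* Assumed upstream values for the truncation thresholds. */
#define REL_TRUNCATE_MINIMUM   1000
#define REL_TRUNCATE_FRACTION  16

static bool
tail_worth_truncating(uint32_t rel_pages, uint32_t nonempty_pages)
{
    uint32_t possibly_freeable = rel_pages - nonempty_pages;

    /* truncate only if the empty tail is large absolutely or relative to the table */
    return possibly_freeable > 0 &&
           (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
            possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION);
}

With these assumed values the smaller branch wins: a relation larger than 16000 pages truncates once 1000 trailing pages are empty, while a smaller relation only needs a sixteenth of its pages to be free.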
- */ - RelationTruncate(vacrel->rel, new_rel_pages); - - /* - * We can release the exclusive lock as soon as we have truncated. - * Other backends can't safely access the relation until they have - * processed the smgr invalidation that smgrtruncate sent out ... but - * that should happen as part of standard invalidation processing once - * they acquire lock on the relation. - */ - UnlockRelation(vacrel->rel, AccessExclusiveLock); - - /* - * Update statistics. Here, it *is* correct to adjust rel_pages - * without also touching reltuples, since the tuple count wasn't - * changed by the truncation. - */ - vacrel->removed_pages += orig_rel_pages - new_rel_pages; - vacrel->rel_pages = new_rel_pages; - - ereport(vacrel->verbose ? INFO : DEBUG2, - (errmsg("table \"%s\": truncated %u to %u pages", - vacrel->relname, - orig_rel_pages, new_rel_pages))); - orig_rel_pages = new_rel_pages; - } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected); -} - -/* - * Rescan end pages to verify that they are (still) empty of tuples. - * - * Returns number of nondeletable pages (last nonempty page + 1). - */ -static BlockNumber -count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) -{ - BlockNumber blkno; - BlockNumber prefetchedUntil; - instr_time starttime; - - /* Initialize the starttime if we check for conflicting lock requests */ - INSTR_TIME_SET_CURRENT(starttime); - - /* - * Start checking blocks at what we believe relation end to be and move - * backwards. (Strange coding of loop control is needed because blkno is - * unsigned.) To make the scan faster, we prefetch a few blocks at a time - * in forward direction, so that OS-level readahead can kick in. - */ - blkno = vacrel->rel_pages; - StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0, - "prefetch size must be power of 2"); - prefetchedUntil = InvalidBlockNumber; - while (blkno > vacrel->nonempty_pages) - { - Buffer buf; - Page page; - OffsetNumber offnum, - maxoff; - bool hastup; - - /* - * Check if another process requests a lock on our relation. We are - * holding an AccessExclusiveLock here, so they will be waiting. We - * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we - * only check if that interval has elapsed once every 32 blocks to - * keep the number of system calls and actual shared lock table - * lookups to a minimum. - */ - if ((blkno % 32) == 0) - { - instr_time currenttime; - instr_time elapsed; - - INSTR_TIME_SET_CURRENT(currenttime); - elapsed = currenttime; - INSTR_TIME_SUBTRACT(elapsed, starttime); - if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) - >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) - { - if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock)) - { - ereport(vacrel->verbose ? INFO : DEBUG2, - (errmsg("table \"%s\": suspending truncate due to conflicting lock request", - vacrel->relname))); - - *lock_waiter_detected = true; - return blkno; - } - starttime = currenttime; - } - } - - /* - * We don't insert a vacuum delay point here, because we have an - * exclusive lock on the table which we want to hold for as short a - * time as possible. We still need to check for interrupts however. - */ - CHECK_FOR_INTERRUPTS(); - - blkno--; - - /* If we haven't prefetched this lot yet, do so now. 
*/ - if (prefetchedUntil > blkno) - { - BlockNumber prefetchStart; - BlockNumber pblkno; - - prefetchStart = blkno & ~(PREFETCH_SIZE - 1); - for (pblkno = prefetchStart; pblkno <= blkno; pblkno++) - { - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno); - CHECK_FOR_INTERRUPTS(); - } - prefetchedUntil = prefetchStart; - } - - buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, - vacrel->bstrategy); - - /* In this phase we only need shared access to the buffer */ - LockBuffer(buf, BUFFER_LOCK_SHARE); - - page = BufferGetPage(buf); - - if (PageIsNew(page) || PageIsEmpty(page)) - { - UnlockReleaseBuffer(buf); - continue; - } - - hastup = false; - maxoff = PageGetMaxOffsetNumber(page); - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid; - - itemid = PageGetItemId(page, offnum); - - /* - * Note: any non-unused item should be taken as a reason to keep - * this page. Even an LP_DEAD item makes truncation unsafe, since - * we must not have cleaned out its index entries. - */ - if (ItemIdIsUsed(itemid)) - { - hastup = true; - break; /* can stop scanning */ - } - } /* scan along page */ - - UnlockReleaseBuffer(buf); - - /* Done scanning if we found a tuple here */ - if (hastup) - return blkno + 1; - } - - /* - * If we fall out of the loop, all the previously-thought-to-be-empty - * pages still are; we need not bother to look at the last known-nonempty - * page. - */ - return vacrel->nonempty_pages; -} - -/* - * Returns the number of dead TIDs that VACUUM should allocate space to - * store, given a heap rel of size vacrel->rel_pages, and given current - * maintenance_work_mem setting (or current autovacuum_work_mem setting, - * when applicable). - * - * See the comments at the head of this file for rationale. - */ -static int -dead_items_max_items(LVRelState *vacrel) -{ - int64 max_items; - int vac_work_mem = IsAutoVacuumWorkerProcess() && - autovacuum_work_mem != -1 ? - autovacuum_work_mem : maintenance_work_mem; - - if (vacrel->nindexes > 0) - { - BlockNumber rel_pages = vacrel->rel_pages; - - max_items = MAXDEADITEMS(vac_work_mem * 1024L); - max_items = Min(max_items, INT_MAX); - max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); - - /* curious coding here to ensure the multiplication can't overflow */ - if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages) - max_items = rel_pages * MaxHeapTuplesPerPage; - - /* stay sane if small maintenance_work_mem */ - max_items = Max(max_items, MaxHeapTuplesPerPage); - } - else - { - /* One-pass case only stores a single heap page's TIDs at a time */ - max_items = MaxHeapTuplesPerPage; - } - - return (int) max_items; -} - -/* - * Allocate dead_items (either using palloc, or in dynamic shared memory). - * Sets dead_items in vacrel for caller. - * - * Also handles parallel initialization as part of allocating dead_items in - * DSM when required. - */ -static void -dead_items_alloc(LVRelState *vacrel, int nworkers) -{ - VacDeadItems *dead_items; - int max_items; - - max_items = dead_items_max_items(vacrel); - Assert(max_items >= MaxHeapTuplesPerPage); - - /* - * Initialize state for a parallel vacuum. As of now, only one worker can - * be used for an index, so we invoke parallelism only if there are at - * least two indexes on a table. - */ - if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming) - { - /* - * Since parallel workers cannot access data in temporary tables, we - * can't perform parallel vacuum on them. 
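dead_items_max_items() converts the maintenance_work_mem (or autovacuum_work_mem) budget into an array length, clamped so a small table never reserves more TIDs than it could possibly need. A simplified standalone sketch, assuming an 8 kB block size (so 291 tuples per page), a 6-byte ItemPointerData, and a roughly 1 GB single-allocation ceiling:

#include <stdint.h>
#include <limits.h>

/* Assumed values for an 8 kB block size build. */
#define TUPLES_PER_PAGE   291
#define TID_SIZE          6
#define MAX_ALLOC_SIZE    0x3fffffffL

static int64_t
min64(int64_t a, int64_t b)
{
    return a < b ? a : b;
}

/*
 * Simplified dead_items_max_items(): derive the TID array length from the
 * memory budget (in kB), but never larger than the table could fill and
 * never smaller than one page's worth of tuples.
 */
int
max_dead_items(int64_t work_mem_kb, uint32_t rel_pages, int nindexes)
{
    int64_t max_items;

    if (nindexes == 0)
        return TUPLES_PER_PAGE;     /* one-pass vacuum holds one page at a time */

    max_items = (work_mem_kb * 1024) / TID_SIZE;
    max_items = min64(max_items, INT_MAX);
    max_items = min64(max_items, MAX_ALLOC_SIZE / TID_SIZE);

    /* no point reserving more TIDs than the relation can contain */
    if (max_items / TUPLES_PER_PAGE > rel_pages)
        max_items = (int64_t) rel_pages * TUPLES_PER_PAGE;

    if (max_items < TUPLES_PER_PAGE)
        max_items = TUPLES_PER_PAGE;

    return (int) max_items;
}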
- */ - if (RelationUsesLocalBuffers(vacrel->rel)) - { - /* - * Give warning only if the user explicitly tries to perform a - * parallel vacuum on the temporary table. - */ - if (nworkers > 0) - ereport(WARNING, - (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel", - vacrel->relname))); - } - else - vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels, - vacrel->nindexes, nworkers, - max_items, - vacrel->verbose ? INFO : DEBUG2, - vacrel->bstrategy); - - /* If parallel mode started, dead_items space is allocated in DSM */ - if (ParallelVacuumIsActive(vacrel)) - { - vacrel->dead_items = parallel_vacuum_get_dead_items(vacrel->pvs); - return; - } - } - - /* Serial VACUUM case */ - dead_items = (VacDeadItems *) palloc(vac_max_items_to_alloc_size(max_items)); - dead_items->max_items = max_items; - dead_items->num_items = 0; - - vacrel->dead_items = dead_items; -} - -/* - * Perform cleanup for resources allocated in dead_items_alloc - */ -static void -dead_items_cleanup(LVRelState *vacrel) -{ - if (!ParallelVacuumIsActive(vacrel)) - { - /* Don't bother with pfree here */ - return; - } - - /* End parallel mode */ - parallel_vacuum_end(vacrel->pvs, vacrel->indstats); - vacrel->pvs = NULL; -} - -/* - * Check if every tuple in the given page is visible to all current and future - * transactions. Also return the visibility_cutoff_xid which is the highest - * xmin amongst the visible tuples. Set *all_frozen to true if every tuple - * on this page is frozen. - * - * This is a stripped down version of lazy_scan_prune(). If you change - * anything here, make sure that everything stays in sync. Note that an - * assertion calls us to verify that everybody still agrees. Be sure to avoid - * introducing new side-effects here. - */ -static bool -tdeheap_page_is_all_visible(LVRelState *vacrel, Buffer buf, - TransactionId *visibility_cutoff_xid, - bool *all_frozen) -{ - Page page = BufferGetPage(buf); - BlockNumber blockno = BufferGetBlockNumber(buf); - OffsetNumber offnum, - maxoff; - bool all_visible = true; - - *visibility_cutoff_xid = InvalidTransactionId; - *all_frozen = true; - - maxoff = PageGetMaxOffsetNumber(page); - for (offnum = FirstOffsetNumber; - offnum <= maxoff && all_visible; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid; - HeapTupleData tuple; - - /* - * Set the offset number so that we can display it along with any - * error that occurred while processing this tuple. - */ - vacrel->offnum = offnum; - itemid = PageGetItemId(page, offnum); - - /* Unused or redirect line pointers are of no interest */ - if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid)) - continue; - - ItemPointerSet(&(tuple.t_self), blockno, offnum); - - /* - * Dead line pointers can have index pointers pointing to them. So - * they can't be treated as visible - */ - if (ItemIdIsDead(itemid)) - { - all_visible = false; - *all_frozen = false; - break; - } - - Assert(ItemIdIsNormal(itemid)); - - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(vacrel->rel); - - switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, - buf)) - { - case HEAPTUPLE_LIVE: - { - TransactionId xmin; - - /* Check comments in lazy_scan_prune. */ - if (!HeapTupleHeaderXminCommitted(tuple.t_data)) - { - all_visible = false; - *all_frozen = false; - break; - } - - /* - * The inserter definitely committed. But is it old enough - * that everyone sees it as committed? 
- */ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, - vacrel->cutoffs.OldestXmin)) - { - all_visible = false; - *all_frozen = false; - break; - } - - /* Track newest xmin on page. */ - if (TransactionIdFollows(xmin, *visibility_cutoff_xid) && - TransactionIdIsNormal(xmin)) - *visibility_cutoff_xid = xmin; - - /* Check whether this tuple is already frozen or not */ - if (all_visible && *all_frozen && - tdeheap_tuple_needs_eventual_freeze(tuple.t_data)) - *all_frozen = false; - } - break; - - case HEAPTUPLE_DEAD: - case HEAPTUPLE_RECENTLY_DEAD: - case HEAPTUPLE_INSERT_IN_PROGRESS: - case HEAPTUPLE_DELETE_IN_PROGRESS: - { - all_visible = false; - *all_frozen = false; - break; - } - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - } /* scan along page */ - - /* Clear the offset information once we have processed the given page. */ - vacrel->offnum = InvalidOffsetNumber; - - return all_visible; -} - -/* - * Update index statistics in pg_class if the statistics are accurate. - */ -static void -update_relstats_all_indexes(LVRelState *vacrel) -{ - Relation *indrels = vacrel->indrels; - int nindexes = vacrel->nindexes; - IndexBulkDeleteResult **indstats = vacrel->indstats; - - Assert(vacrel->do_index_cleanup); - - for (int idx = 0; idx < nindexes; idx++) - { - Relation indrel = indrels[idx]; - IndexBulkDeleteResult *istat = indstats[idx]; - - if (istat == NULL || istat->estimated_count) - continue; - - /* Update index statistics */ - vac_update_relstats(indrel, - istat->num_pages, - istat->num_index_tuples, - 0, - false, - InvalidTransactionId, - InvalidMultiXactId, - NULL, NULL, false); - } -} - -/* - * Error context callback for errors occurring during vacuum. The error - * context messages for index phases should match the messages set in parallel - * vacuum. If you change this function for those phases, change - * parallel_vacuum_error_callback() as well. 
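The cutoff bookkeeping in tdeheap_page_is_all_visible() amounts to a max-of-visible-xmins scan: reject the page as soon as any xmin could still be invisible to someone, otherwise remember the newest xmin as the value later stored in the visibility map for hot-standby conflict resolution. A deliberately simplified sketch that uses plain integer XIDs instead of PostgreSQL's wraparound-aware comparisons:

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t Xid;   /* plain integers here; real XIDs compare circularly */

bool
page_all_visible(const Xid *xmins, int ntuples, Xid oldest_xmin, Xid *cutoff)
{
    *cutoff = 0;    /* stand-in for InvalidTransactionId */

    for (int i = 0; i < ntuples; i++)
    {
        if (xmins[i] >= oldest_xmin)    /* some snapshot may not see it yet */
            return false;
        if (xmins[i] > *cutoff)         /* track the newest visible xmin */
            *cutoff = xmins[i];
    }
    return true;
}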
- */ -static void -vacuum_error_callback(void *arg) -{ - LVRelState *errinfo = arg; - - switch (errinfo->phase) - { - case VACUUM_ERRCB_PHASE_SCAN_HEAP: - if (BlockNumberIsValid(errinfo->blkno)) - { - if (OffsetNumberIsValid(errinfo->offnum)) - errcontext("while scanning block %u offset %u of relation \"%s.%s\"", - errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname); - else - errcontext("while scanning block %u of relation \"%s.%s\"", - errinfo->blkno, errinfo->relnamespace, errinfo->relname); - } - else - errcontext("while scanning relation \"%s.%s\"", - errinfo->relnamespace, errinfo->relname); - break; - - case VACUUM_ERRCB_PHASE_VACUUM_HEAP: - if (BlockNumberIsValid(errinfo->blkno)) - { - if (OffsetNumberIsValid(errinfo->offnum)) - errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"", - errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname); - else - errcontext("while vacuuming block %u of relation \"%s.%s\"", - errinfo->blkno, errinfo->relnamespace, errinfo->relname); - } - else - errcontext("while vacuuming relation \"%s.%s\"", - errinfo->relnamespace, errinfo->relname); - break; - - case VACUUM_ERRCB_PHASE_VACUUM_INDEX: - errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"", - errinfo->indname, errinfo->relnamespace, errinfo->relname); - break; - - case VACUUM_ERRCB_PHASE_INDEX_CLEANUP: - errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"", - errinfo->indname, errinfo->relnamespace, errinfo->relname); - break; - - case VACUUM_ERRCB_PHASE_TRUNCATE: - if (BlockNumberIsValid(errinfo->blkno)) - errcontext("while truncating relation \"%s.%s\" to %u blocks", - errinfo->relnamespace, errinfo->relname, errinfo->blkno); - break; - - case VACUUM_ERRCB_PHASE_UNKNOWN: - default: - return; /* do nothing; the errinfo may not be - * initialized */ - } -} - -/* - * Updates the information required for vacuum error callback. This also saves - * the current information which can be later restored via restore_vacuum_error_info. - */ -static void -update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel, - int phase, BlockNumber blkno, OffsetNumber offnum) -{ - if (saved_vacrel) - { - saved_vacrel->offnum = vacrel->offnum; - saved_vacrel->blkno = vacrel->blkno; - saved_vacrel->phase = vacrel->phase; - } - - vacrel->blkno = blkno; - vacrel->offnum = offnum; - vacrel->phase = phase; -} - -/* - * Restores the vacuum information saved via a prior call to update_vacuum_error_info. 
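The error-traceback pair above is a plain save/overwrite/restore of three fields, so each phase can stamp its location into the state that vacuum_error_callback() reads and put the previous phase back when it finishes. A stripped-down sketch with an illustrative struct in place of LVRelState/LVSavedErrInfo:

#include <stdint.h>

/* Illustrative stand-in for the fields the error callback reports. */
typedef struct
{
    int      phase;
    uint32_t blkno;
    uint16_t offnum;
} ErrInfo;

void
update_err_info(ErrInfo *current, ErrInfo *saved,
                int phase, uint32_t blkno, uint16_t offnum)
{
    if (saved)
        *saved = *current;      /* remember where we were */

    current->phase = phase;     /* point the callback at the new phase */
    current->blkno = blkno;
    current->offnum = offnum;
}

void
restore_err_info(ErrInfo *current, const ErrInfo *saved)
{
    *current = *saved;          /* back to the caller's phase */
}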
- */ -static void -restore_vacuum_error_info(LVRelState *vacrel, - const LVSavedErrInfo *saved_vacrel) -{ - vacrel->blkno = saved_vacrel->blkno; - vacrel->offnum = saved_vacrel->offnum; - vacrel->phase = saved_vacrel->phase; -} diff --git a/src/access/pg_tde_visibilitymap.c b/src/access/pg_tde_visibilitymap.c deleted file mode 100644 index bef5bbff..00000000 --- a/src/access/pg_tde_visibilitymap.c +++ /dev/null @@ -1,650 +0,0 @@ -/*------------------------------------------------------------------------- - * - * tdeheap_visibilitymap.c - * bitmap for tracking visibility of heap tuples - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/access/heap/pg_tde_visibilitymap.c - * - * INTERFACE ROUTINES - * tdeheap_visibilitymap_clear - clear bits for one page in the visibility map - * tdeheap_visibilitymap_pin - pin a map page for setting a bit - * tdeheap_visibilitymap_pin_ok - check whether correct map page is already pinned - * tdeheap_visibilitymap_set - set a bit in a previously pinned page - * tdeheap_visibilitymap_get_status - get status of bits - * tdeheap_visibilitymap_count - count number of bits set in visibility map - * tdeheap_visibilitymap_prepare_truncate - - * prepare for truncation of the visibility map - * - * NOTES - * - * The visibility map is a bitmap with two bits (all-visible and all-frozen) - * per heap page. A set all-visible bit means that all tuples on the page are - * known visible to all transactions, and therefore the page doesn't need to - * be vacuumed. A set all-frozen bit means that all tuples on the page are - * completely frozen, and therefore the page doesn't need to be vacuumed even - * if whole table scanning vacuum is required (e.g. anti-wraparound vacuum). - * The all-frozen bit must be set only when the page is already all-visible. - * - * The map is conservative in the sense that we make sure that whenever a bit - * is set, we know the condition is true, but if a bit is not set, it might or - * might not be true. - * - * Clearing visibility map bits is not separately WAL-logged. The callers - * must make sure that whenever a bit is cleared, the bit is cleared on WAL - * replay of the updating operation as well. - * - * When we *set* a visibility map during VACUUM, we must write WAL. This may - * seem counterintuitive, since the bit is basically a hint: if it is clear, - * it may still be the case that every tuple on the page is visible to all - * transactions; we just don't know that for certain. The difficulty is that - * there are two bits which are typically set together: the PD_ALL_VISIBLE bit - * on the page itself, and the visibility map bit. If a crash occurs after the - * visibility map page makes it to disk and before the updated heap page makes - * it to disk, redo must set the bit on the heap page. Otherwise, the next - * insert, update, or delete on the heap page will fail to realize that the - * visibility map bit must be cleared, possibly causing index-only scans to - * return wrong answers. - * - * VACUUM will normally skip pages for which the visibility map bit is set; - * such pages can't contain any dead tuples and therefore don't need vacuuming. - * - * LOCKING - * - * In heapam.c, whenever a page is modified so that not all tuples on the - * page are visible to everyone anymore, the corresponding bit in the - * visibility map is cleared. 
In order to be crash-safe, we need to do this - * while still holding a lock on the heap page and in the same critical - * section that logs the page modification. However, we don't want to hold - * the buffer lock over any I/O that may be required to read in the visibility - * map page. To avoid this, we examine the heap page before locking it; - * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map - * bit. Then, we lock the buffer. But this creates a race condition: there - * is a possibility that in the time it takes to lock the buffer, the - * PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the - * buffer, pin the visibility map page, and relock the buffer. This shouldn't - * happen often, because only VACUUM currently sets visibility map bits, - * and the race will only occur if VACUUM processes a given page at almost - * exactly the same time that someone tries to further modify it. - * - * To set a bit, you need to hold a lock on the heap page. That prevents - * the race condition where VACUUM sees that all tuples on the page are - * visible to everyone, but another backend modifies the page before VACUUM - * sets the bit in the visibility map. - * - * When a bit is set, the LSN of the visibility map page is updated to make - * sure that the visibility map update doesn't get written to disk before the - * WAL record of the changes that made it possible to set the bit is flushed. - * But when a bit is cleared, we don't have to do that because it's always - * safe to clear a bit in the map from correctness point of view. - * - *------------------------------------------------------------------------- - */ -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include "access/pg_tdeam_xlog.h" -#include "access/pg_tde_visibilitymap.h" - -#include "access/xloginsert.h" -#include "access/xlogutils.h" -#include "miscadmin.h" -#include "port/pg_bitutils.h" -#include "storage/bufmgr.h" -#include "storage/lmgr.h" -#include "storage/smgr.h" -#include "utils/inval.h" - - -/*#define TRACE_VISIBILITYMAP */ - -/* - * Size of the bitmap on each visibility map page, in bytes. There's no - * extra headers, so the whole page minus the standard page header is - * used for the bitmap. - */ -#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) - -/* Number of heap blocks we can represent in one byte */ -#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) - -/* Number of heap blocks we can represent in one visibility map page. */ -#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) - -/* Mapping from heap block number to the right bit in the visibility map */ -#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) -#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) -#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) - -/* Masks for counting subsets of bits in the visibility map. */ -#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each - * bit pair */ -#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each - * bit pair */ - -/* prototypes for internal routines */ -static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); -static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks); - - -/* - * tdeheap_visibilitymap_clear - clear specified bits for one page in visibility map - * - * You must pass a buffer containing the correct map page to this function. 
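With two bits per heap block, the map address of a heap page is pure arithmetic: which VM page, which byte of that page's bitmap, and which bit pair within the byte. A standalone sketch of those three macros plus set/get/clear against a single in-memory bitmap page, assuming an 8 kB block size (8168 usable bitmap bytes per VM page):

#include <stdint.h>
#include <stdio.h>

/* Assumed geometry for an 8 kB block size build. */
#define MAPSIZE               8168
#define BITS_PER_HEAPBLOCK    2
#define HEAPBLOCKS_PER_BYTE   (8 / BITS_PER_HEAPBLOCK)           /* 4     */
#define HEAPBLOCKS_PER_PAGE   (MAPSIZE * HEAPBLOCKS_PER_BYTE)    /* 32672 */

#define HEAPBLK_TO_MAPBLOCK(x)  ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x)   (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_OFFSET(x)    (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)

#define ALL_VISIBLE  0x01
#define ALL_FROZEN   0x02
#define VALID_BITS   0x03

int
main(void)
{
    uint8_t  map[MAPSIZE] = {0};        /* one VM page's bitmap */
    uint32_t heapblk = 100000;

    printf("map block %u, byte %u, bit offset %u\n",
           (unsigned) HEAPBLK_TO_MAPBLOCK(heapblk),
           (unsigned) HEAPBLK_TO_MAPBYTE(heapblk),
           (unsigned) HEAPBLK_TO_OFFSET(heapblk));

    /* set: OR the flag pair into place (the real code also WAL-logs this) */
    map[HEAPBLK_TO_MAPBYTE(heapblk)] |= ALL_VISIBLE << HEAPBLK_TO_OFFSET(heapblk);

    /* get: shift the byte down and mask out the two bits */
    uint8_t status = (map[HEAPBLK_TO_MAPBYTE(heapblk)] >> HEAPBLK_TO_OFFSET(heapblk)) & VALID_BITS;
    printf("status = %u\n", (unsigned) status);   /* 1: all-visible, not all-frozen */

    /* clear: AND with the complement of the selected bits */
    map[HEAPBLK_TO_MAPBYTE(heapblk)] &= ~(VALID_BITS << HEAPBLK_TO_OFFSET(heapblk));
    return 0;
}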
- * Call tdeheap_visibilitymap_pin first to pin the right one. This function doesn't do - * any I/O. Returns true if any bits have been cleared and false otherwise. - */ -bool -tdeheap_visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags) -{ - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - int mapOffset = HEAPBLK_TO_OFFSET(heapBlk); - uint8 mask = flags << mapOffset; - char *map; - bool cleared = false; - - /* Must never clear all_visible bit while leaving all_frozen bit set */ - Assert(flags & VISIBILITYMAP_VALID_BITS); - Assert(flags != VISIBILITYMAP_ALL_VISIBLE); - -#ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk); -#endif - - if (!BufferIsValid(vmbuf) || BufferGetBlockNumber(vmbuf) != mapBlock) - elog(ERROR, "wrong buffer passed to tdeheap_visibilitymap_clear"); - - LockBuffer(vmbuf, BUFFER_LOCK_EXCLUSIVE); - map = PageGetContents(BufferGetPage(vmbuf)); - - if (map[mapByte] & mask) - { - map[mapByte] &= ~mask; - - MarkBufferDirty(vmbuf); - cleared = true; - } - - LockBuffer(vmbuf, BUFFER_LOCK_UNLOCK); - - return cleared; -} - -/* - * tdeheap_visibilitymap_pin - pin a map page for setting a bit - * - * Setting a bit in the visibility map is a two-phase operation. First, call - * tdeheap_visibilitymap_pin, to pin the visibility map page containing the bit for - * the heap page. Because that can require I/O to read the map page, you - * shouldn't hold a lock on the heap page while doing that. Then, call - * tdeheap_visibilitymap_set to actually set the bit. - * - * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by - * an earlier call to tdeheap_visibilitymap_pin or tdeheap_visibilitymap_get_status on the same - * relation. On return, *vmbuf is a valid buffer with the map page containing - * the bit for heapBlk. - * - * If the page doesn't exist in the map file yet, it is extended. - */ -void -tdeheap_visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) -{ - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - - /* Reuse the old pinned buffer if possible */ - if (BufferIsValid(*vmbuf)) - { - if (BufferGetBlockNumber(*vmbuf) == mapBlock) - return; - - ReleaseBuffer(*vmbuf); - } - *vmbuf = vm_readbuf(rel, mapBlock, true); -} - -/* - * tdeheap_visibilitymap_pin_ok - do we already have the correct page pinned? - * - * On entry, vmbuf should be InvalidBuffer or a valid buffer returned by - * an earlier call to tdeheap_visibilitymap_pin or tdeheap_visibilitymap_get_status on the same - * relation. The return value indicates whether the buffer covers the - * given heapBlk. - */ -bool -tdeheap_visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf) -{ - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - - return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock; -} - -/* - * tdeheap_visibilitymap_set - set bit(s) on a previously pinned page - * - * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, - * or InvalidXLogRecPtr in normal running. The VM page LSN is advanced to the - * one provided; in normal running, we generate a new XLOG record and set the - * page LSN to that value (though the heap page's LSN may *not* be updated; - * see below). cutoff_xid is the largest xmin on the page being marked - * all-visible; it is needed for Hot Standby, and can be InvalidTransactionId - * if the page contains no tuples. 
It can also be set to InvalidTransactionId - * when a page that is already all-visible is being marked all-frozen. - * - * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling - * this function. Except in recovery, caller should also pass the heap - * buffer. When checksums are enabled and we're not in recovery, we must add - * the heap buffer to the WAL chain to protect it from being torn. - * - * You must pass a buffer containing the correct map page to this function. - * Call tdeheap_visibilitymap_pin first to pin the right one. This function doesn't do - * any I/O. - */ -void -tdeheap_visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, - XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, - uint8 flags) -{ - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); - Page page; - uint8 *map; - -#ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); -#endif - - Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); - Assert(InRecovery || PageIsAllVisible((Page) BufferGetPage(heapBuf))); - Assert((flags & VISIBILITYMAP_VALID_BITS) == flags); - - /* Must never set all_frozen bit without also setting all_visible bit */ - Assert(flags != VISIBILITYMAP_ALL_FROZEN); - - /* Check that we have the right heap page pinned, if present */ - if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) - elog(ERROR, "wrong heap buffer passed to tdeheap_visibilitymap_set"); - - /* Check that we have the right VM page pinned */ - if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) - elog(ERROR, "wrong VM buffer passed to tdeheap_visibilitymap_set"); - - page = BufferGetPage(vmBuf); - map = (uint8 *) PageGetContents(page); - LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); - - if (flags != (map[mapByte] >> mapOffset & VISIBILITYMAP_VALID_BITS)) - { - START_CRIT_SECTION(); - - map[mapByte] |= (flags << mapOffset); - MarkBufferDirty(vmBuf); - - if (RelationNeedsWAL(rel)) - { - if (XLogRecPtrIsInvalid(recptr)) - { - Assert(!InRecovery); - recptr = log_tdeheap_visible(rel, heapBuf, vmBuf, cutoff_xid, flags); - - /* - * If data checksums are enabled (or wal_log_hints=on), we - * need to protect the heap page from being torn. - * - * If not, then we must *not* update the heap page's LSN. In - * this case, the FPI for the heap page was omitted from the - * WAL record inserted above, so it would be incorrect to - * update the heap page's LSN. - */ - if (XLogHintBitIsNeeded()) - { - Page heapPage = BufferGetPage(heapBuf); - - PageSetLSN(heapPage, recptr); - } - } - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - } - - LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); -} - -/* - * tdeheap_visibilitymap_get_status - get status of bits - * - * Are all tuples on heapBlk visible to all or are marked frozen, according - * to the visibility map? - * - * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by an - * earlier call to tdeheap_visibilitymap_pin or tdeheap_visibilitymap_get_status on the same - * relation. On return, *vmbuf is a valid buffer with the map page containing - * the bit for heapBlk, or InvalidBuffer. The caller is responsible for - * releasing *vmbuf after it's done testing and setting bits. - * - * NOTE: This function is typically called without a lock on the heap page, - * so somebody else could change the bit just after we look at it. 
In fact, - * since we don't lock the visibility map page either, it's even possible that - * someone else could have changed the bit just before we look at it, but yet - * we might see the old value. It is the caller's responsibility to deal with - * all concurrency issues! - */ -uint8 -tdeheap_visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) -{ - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); - char *map; - uint8 result; - -#ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk); -#endif - - /* Reuse the old pinned buffer if possible */ - if (BufferIsValid(*vmbuf)) - { - if (BufferGetBlockNumber(*vmbuf) != mapBlock) - { - ReleaseBuffer(*vmbuf); - *vmbuf = InvalidBuffer; - } - } - - if (!BufferIsValid(*vmbuf)) - { - *vmbuf = vm_readbuf(rel, mapBlock, false); - if (!BufferIsValid(*vmbuf)) - return false; - } - - map = PageGetContents(BufferGetPage(*vmbuf)); - - /* - * A single byte read is atomic. There could be memory-ordering effects - * here, but for performance reasons we make it the caller's job to worry - * about that. - */ - result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS); - return result; -} - -/* - * tdeheap_visibilitymap_count - count number of bits set in visibility map - * - * Note: we ignore the possibility of race conditions when the table is being - * extended concurrently with the call. New pages added to the table aren't - * going to be marked all-visible or all-frozen, so they won't affect the result. - */ -void -tdeheap_visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen) -{ - BlockNumber mapBlock; - BlockNumber nvisible = 0; - BlockNumber nfrozen = 0; - - /* all_visible must be specified */ - Assert(all_visible); - - for (mapBlock = 0;; mapBlock++) - { - Buffer mapBuffer; - uint64 *map; - int i; - - /* - * Read till we fall off the end of the map. We assume that any extra - * bytes in the last page are zeroed, so we don't bother excluding - * them from the count. - */ - mapBuffer = vm_readbuf(rel, mapBlock, false); - if (!BufferIsValid(mapBuffer)) - break; - - /* - * We choose not to lock the page, since the result is going to be - * immediately stale anyway if anyone is concurrently setting or - * clearing bits, and we only really need an approximate value. - */ - map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer)); - - StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0, - "unsupported MAPSIZE"); - if (all_frozen == NULL) - { - for (i = 0; i < MAPSIZE / sizeof(uint64); i++) - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); - } - else - { - for (i = 0; i < MAPSIZE / sizeof(uint64); i++) - { - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); - nfrozen += pg_popcount64(map[i] & FROZEN_MASK64); - } - } - - ReleaseBuffer(mapBuffer); - } - - *all_visible = nvisible; - if (all_frozen) - *all_frozen = nfrozen; -} - -/* - * tdeheap_visibilitymap_prepare_truncate - - * prepare for truncation of the visibility map - * - * nheapblocks is the new size of the heap. - * - * Return the number of blocks of new visibility map. - * If it's InvalidBlockNumber, there is nothing to truncate; - * otherwise the caller is responsible for calling smgrtruncate() - * to truncate the visibility map pages. 
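tdeheap_visibilitymap_count() exploits the bit layout directly: with the all-visible bit in the even positions and the all-frozen bit in the odd positions, masking a 64-bit word and taking a population count totals 32 heap blocks at a time. A sketch using the GCC/Clang builtin __builtin_popcountll in place of pg_popcount64:

#include <stdint.h>
#include <stdio.h>

#define VISIBLE_MASK64 UINT64_C(0x5555555555555555)  /* lower bit of each pair */
#define FROZEN_MASK64  UINT64_C(0xaaaaaaaaaaaaaaaa)  /* upper bit of each pair */

void
count_vm_words(const uint64_t *map, int nwords,
               uint64_t *all_visible, uint64_t *all_frozen)
{
    *all_visible = 0;
    *all_frozen = 0;

    for (int i = 0; i < nwords; i++)
    {
        /* each 64-bit word covers 32 heap blocks (2 bits per block) */
        *all_visible += __builtin_popcountll(map[i] & VISIBLE_MASK64);
        *all_frozen  += __builtin_popcountll(map[i] & FROZEN_MASK64);
    }
}

int
main(void)
{
    /* block 0: all-visible only (01); block 1: all-visible and all-frozen (11) */
    uint64_t word = UINT64_C(0x0d);
    uint64_t nvis, nfro;

    count_vm_words(&word, 1, &nvis, &nfro);
    printf("visible=%llu frozen=%llu\n",
           (unsigned long long) nvis, (unsigned long long) nfro);   /* 2 and 1 */
    return 0;
}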
- */ -BlockNumber -tdeheap_visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) -{ - BlockNumber newnblocks; - - /* last remaining block, byte, and bit */ - BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); - uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); - uint8 truncOffset = HEAPBLK_TO_OFFSET(nheapblocks); - -#ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); -#endif - - /* - * If no visibility map has been created yet for this relation, there's - * nothing to truncate. - */ - if (!smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM)) - return InvalidBlockNumber; - - /* - * Unless the new size is exactly at a visibility map page boundary, the - * tail bits in the last remaining map page, representing truncated heap - * blocks, need to be cleared. This is not only tidy, but also necessary - * because we don't get a chance to clear the bits if the heap is extended - * again. - */ - if (truncByte != 0 || truncOffset != 0) - { - Buffer mapBuffer; - Page page; - char *map; - - newnblocks = truncBlock + 1; - - mapBuffer = vm_readbuf(rel, truncBlock, false); - if (!BufferIsValid(mapBuffer)) - { - /* nothing to do, the file was already smaller */ - return InvalidBlockNumber; - } - - page = BufferGetPage(mapBuffer); - map = PageGetContents(page); - - LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); - - /* NO EREPORT(ERROR) from here till changes are logged */ - START_CRIT_SECTION(); - - /* Clear out the unwanted bytes. */ - MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); - - /*---- - * Mask out the unwanted bits of the last remaining byte. - * - * ((1 << 0) - 1) = 00000000 - * ((1 << 1) - 1) = 00000001 - * ... - * ((1 << 6) - 1) = 00111111 - * ((1 << 7) - 1) = 01111111 - *---- - */ - map[truncByte] &= (1 << truncOffset) - 1; - - /* - * Truncation of a relation is WAL-logged at a higher-level, and we - * will be called at WAL replay. But if checksums are enabled, we need - * to still write a WAL record to protect against a torn page, if the - * page is flushed to disk before the truncation WAL record. We cannot - * use MarkBufferDirtyHint here, because that will not dirty the page - * during recovery. - */ - MarkBufferDirty(mapBuffer); - if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) - log_newpage_buffer(mapBuffer, false); - - END_CRIT_SECTION(); - - UnlockReleaseBuffer(mapBuffer); - } - else - newnblocks = truncBlock; - - if (smgrnblocks(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM) <= newnblocks) - { - /* nothing to do, the file was already smaller than requested size */ - return InvalidBlockNumber; - } - - return newnblocks; -} - -/* - * Read a visibility map page. - * - * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is - * true, the visibility map file is extended. - */ -static Buffer -vm_readbuf(Relation rel, BlockNumber blkno, bool extend) -{ - Buffer buf; - SMgrRelation reln; - - /* - * Caution: re-using this smgr pointer could fail if the relcache entry - * gets closed. It's safe as long as we only do smgr-level operations - * between here and the last use of the pointer. - */ - reln = RelationGetSmgr(rel); - - /* - * If we haven't cached the size of the visibility map fork yet, check it - * first. 
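The byte-level surgery in tdeheap_visibilitymap_prepare_truncate() has two parts: zero every bitmap byte wholly past the last surviving one, then keep only the low bits of that boundary byte, since ((1 << truncOffset) - 1) preserves exactly the bit pairs for heap blocks that remain. A small sketch of the masking step against one in-memory bitmap page:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define BITS_PER_HEAPBLOCK   2
#define HEAPBLOCKS_PER_BYTE  (8 / BITS_PER_HEAPBLOCK)

/*
 * Clear the VM bits for every heap block >= new_nblocks inside one bitmap
 * page of 'mapsize' bytes (block numbers here are relative to that page).
 */
void
truncate_vm_tail(uint8_t *map, size_t mapsize, uint32_t new_nblocks_in_page)
{
    uint32_t trunc_byte   = new_nblocks_in_page / HEAPBLOCKS_PER_BYTE;
    uint32_t trunc_offset = (new_nblocks_in_page % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK;

    /* zero all bytes wholly past the boundary ... */
    memset(&map[trunc_byte + 1], 0, mapsize - (trunc_byte + 1));

    /* ... and keep only the surviving low bits of the boundary byte */
    map[trunc_byte] &= (uint8_t) ((1 << trunc_offset) - 1);
}

int
main(void)
{
    uint8_t map[4] = {0xff, 0xff, 0xff, 0xff};

    truncate_vm_tail(map, sizeof(map), 5);   /* keep bits for heap blocks 0..4 */
    printf("%02x %02x %02x %02x\n", map[0], map[1], map[2], map[3]);  /* ff 03 00 00 */
    return 0;
}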
- */ - if (reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber) - { - if (smgrexists(reln, VISIBILITYMAP_FORKNUM)) - smgrnblocks(reln, VISIBILITYMAP_FORKNUM); - else - reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0; - } - - /* - * For reading we use ZERO_ON_ERROR mode, and initialize the page if - * necessary. It's always safe to clear bits, so it's better to clear - * corrupt pages than error out. - * - * We use the same path below to initialize pages when extending the - * relation, as a concurrent extension can end up with vm_extend() - * returning an already-initialized page. - */ - if (blkno >= reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM]) - { - if (extend) - buf = vm_extend(rel, blkno + 1); - else - return InvalidBuffer; - } - else - buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, - RBM_ZERO_ON_ERROR, NULL); - - /* - * Initializing the page when needed is trickier than it looks, because of - * the possibility of multiple backends doing this concurrently, and our - * desire to not uselessly take the buffer lock in the normal path where - * the page is OK. We must take the lock to initialize the page, so - * recheck page newness after we have the lock, in case someone else - * already did it. Also, because we initially check PageIsNew with no - * lock, it's possible to fall through and return the buffer while someone - * else is still initializing the page (i.e., we might see pd_upper as set - * but other page header fields are still zeroes). This is harmless for - * callers that will take a buffer lock themselves, but some callers - * inspect the page without any lock at all. The latter is OK only so - * long as it doesn't depend on the page header having correct contents. - * Current usage is safe because PageGetContents() does not require that. - */ - if (PageIsNew(BufferGetPage(buf))) - { - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - if (PageIsNew(BufferGetPage(buf))) - PageInit(BufferGetPage(buf), BLCKSZ, 0); - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - } - return buf; -} - -/* - * Ensure that the visibility map fork is at least vm_nblocks long, extending - * it if necessary with zeroed pages. - */ -static Buffer -vm_extend(Relation rel, BlockNumber vm_nblocks) -{ - Buffer buf; - - buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL, - EB_CREATE_FORK_IF_NEEDED | - EB_CLEAR_SIZE_CACHE, - vm_nblocks, - RBM_ZERO_ON_ERROR); - - /* - * Send a shared-inval message to force other backends to close any smgr - * references they may have for this rel, which we are about to change. - * This is a useful optimization because it means that backends don't have - * to keep checking for creation or extension of the file, which happens - * infrequently. 
- */ - CacheInvalidateSmgr(RelationGetSmgr(rel)->smgr_rlocator); - - return buf; -} diff --git a/src/access/pg_tdeam.c b/src/access/pg_tdeam.c deleted file mode 100644 index e4d1267a..00000000 --- a/src/access/pg_tdeam.c +++ /dev/null @@ -1,10311 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_tdeam.c - * pg_tde access method code - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * contrib/pg_tde/pg_tdeam.c - * - * - * INTERFACE ROUTINES - * tdeheap_beginscan - begin relation scan - * tdeheap_rescan - restart a relation scan - * tdeheap_endscan - end relation scan - * tdeheap_getnext - retrieve next tuple in scan - * tdeheap_fetch - retrieve tuple with given tid - * tdeheap_insert - insert tuple into a relation - * tdeheap_multi_insert - insert multiple tuples into a relation - * tdeheap_delete - delete a tuple from a relation - * tdeheap_update - replace a tuple in a relation with another tuple - * - * NOTES - * This file contains the tdeheap_ routines which implement - * the POSTGRES pg_tde access method used for all POSTGRES - * relations. - * - *------------------------------------------------------------------------- - */ - -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include "access/pg_tdeam.h" -#include "access/pg_tdeam_xlog.h" -#include "access/pg_tdetoast.h" -#include "access/pg_tde_io.h" -#include "access/pg_tde_visibilitymap.h" -#include "access/pg_tde_slot.h" -#include "encryption/enc_tde.h" - -#include "access/bufmask.h" -#include "access/genam.h" -#include "access/multixact.h" -#include "access/parallel.h" -#include "access/relscan.h" -#include "access/subtrans.h" -#include "access/syncscan.h" -#include "access/sysattr.h" -#include "access/tableam.h" -#include "access/transam.h" -#include "access/valid.h" -#include "access/xact.h" -#include "access/xlog.h" -#include "access/xloginsert.h" -#include "access/xlogutils.h" -#include "catalog/catalog.h" -#include "commands/vacuum.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "port/atomics.h" -#include "port/pg_bitutils.h" -#include "storage/bufmgr.h" -#include "storage/freespace.h" -#include "storage/lmgr.h" -#include "storage/predicate.h" -#include "storage/procarray.h" -#include "storage/smgr.h" -#include "storage/spin.h" -#include "storage/standby.h" -#include "utils/datum.h" -#include "utils/inval.h" -#include "utils/lsyscache.h" -#include "utils/relcache.h" -#include "utils/snapmgr.h" -#include "utils/spccache.h" -#include "utils/memutils.h" - - -static HeapTuple tdeheap_prepare_insert(Relation relation, HeapTuple tup, - TransactionId xid, CommandId cid, int options); -static XLogRecPtr log_tdeheap_update(Relation reln, Buffer oldbuf, - Buffer newbuf, HeapTuple oldtup, - HeapTuple newtup, HeapTuple old_key_tuple, - bool all_visible_cleared, bool new_all_visible_cleared); -static Bitmapset *HeapDetermineColumnsInfo(Relation relation, - Bitmapset *interesting_cols, - Bitmapset *external_cols, - HeapTuple oldtup, HeapTuple newtup, - bool *has_external); -static bool tdeheap_acquire_tuplock(Relation relation, ItemPointer tid, - LockTupleMode mode, LockWaitPolicy wait_policy, - bool *have_tuple_lock); -static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, - uint16 old_infomask2, TransactionId add_to_xmax, - LockTupleMode mode, bool is_update, - TransactionId *result_xmax, uint16 *result_infomask, - uint16 
*result_infomask2); -static TM_Result tdeheap_lock_updated_tuple(Relation rel, HeapTuple tuple, - ItemPointer ctid, TransactionId xid, - LockTupleMode mode); -static int tdeheap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, - xl_tdeheap_freeze_plan *plans_out, - OffsetNumber *offsets_out); -static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, - uint16 *new_infomask2); -static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, - uint16 t_infomask); -static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, - LockTupleMode lockmode, bool *current_is_member); -static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, - Relation rel, ItemPointer ctid, XLTW_Oper oper, - int *remaining); -static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, - uint16 infomask, Relation rel, int *remaining); -static void index_delete_sort(TM_IndexDeleteOp *delstate); -static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); -static XLogRecPtr log_tdeheap_new_cid(Relation relation, HeapTuple tup); -static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, - bool *copy); - - -/* - * Each tuple lock mode has a corresponding heavyweight lock, and one or two - * corresponding MultiXactStatuses (one to merely lock tuples, another one to - * update them). This table (and the macros below) helps us determine the - * heavyweight lock mode and MultiXactStatus values to use for any particular - * tuple lock strength. - * - * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock - * instead. - */ -static const struct -{ - LOCKMODE hwlock; - int lockstatus; - int updstatus; -} - - tupleLockExtraInfo[MaxLockTupleMode + 1] = -{ - { /* LockTupleKeyShare */ - AccessShareLock, - MultiXactStatusForKeyShare, - -1 /* KeyShare does not allow updating tuples */ - }, - { /* LockTupleShare */ - RowShareLock, - MultiXactStatusForShare, - -1 /* Share does not allow updating tuples */ - }, - { /* LockTupleNoKeyExclusive */ - ExclusiveLock, - MultiXactStatusForNoKeyUpdate, - MultiXactStatusNoKeyUpdate - }, - { /* LockTupleExclusive */ - AccessExclusiveLock, - MultiXactStatusForUpdate, - MultiXactStatusUpdate - } -}; - -/* Get the LOCKMODE for a given MultiXactStatus */ -#define LOCKMODE_from_mxstatus(status) \ - (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock) - -/* - * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. - * This is more readable than having every caller translate it to lock.h's - * LOCKMODE. 
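As a rough illustration of the two lookups the comment describes (a MultiXactStatus collapses to a tuple lock strength, which in turn selects a heavyweight lock), here is a toy standalone model; the enum names and values are simplified stand-ins, not the PostgreSQL definitions.

/* Toy model of the MultiXactStatus -> LockTupleMode -> LOCKMODE lookups. */
#include <stdio.h>

typedef enum { KeyShare, Share, NoKeyUpdate, Update } ToyTupleMode;             /* stand-in for LockTupleMode */
typedef enum { AccessShare, RowShare, Exclusive, AccessExclusive } ToyLockMode; /* stand-in for LOCKMODE */

/* One heavyweight lock per tuple lock strength, as in tupleLockExtraInfo. */
static const ToyLockMode hwlock_for_mode[] = {
    [KeyShare]    = AccessShare,
    [Share]       = RowShare,
    [NoKeyUpdate] = Exclusive,
    [Update]      = AccessExclusive,
};

/* Each status maps to a tuple lock strength, as in MultiXactStatusLock. */
typedef enum { ForKeyShare, ForShare, ForNoKeyUpdate, ForUpdate,
               StatusNoKeyUpdate, StatusUpdate } ToyMxStatus;

static const ToyTupleMode mode_for_status[] = {
    [ForKeyShare]       = KeyShare,
    [ForShare]          = Share,
    [ForNoKeyUpdate]    = NoKeyUpdate,
    [ForUpdate]         = Update,
    [StatusNoKeyUpdate] = NoKeyUpdate,
    [StatusUpdate]      = Update,
};

int
main(void)
{
    ToyMxStatus  st   = ForNoKeyUpdate;
    ToyTupleMode mode = mode_for_status[st];    /* TUPLOCK_from_mxstatus */
    ToyLockMode  hw   = hwlock_for_mode[mode];  /* LOCKMODE_from_mxstatus */

    printf("status %d -> tuple mode %d -> heavyweight lock %d\n", st, mode, hw);
    return 0;
}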
- */ -#define LockTupleTuplock(rel, tup, mode) \ - LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) -#define UnlockTupleTuplock(rel, tup, mode) \ - UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) -#define ConditionalLockTupleTuplock(rel, tup, mode) \ - ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) - -#ifdef USE_PREFETCH -/* - * tdeheap_index_delete_tuples and index_delete_prefetch_buffer use this - * structure to coordinate prefetching activity - */ -typedef struct -{ - BlockNumber cur_hblkno; - int next_item; - int ndeltids; - TM_IndexDelete *deltids; -} IndexDeletePrefetchState; -#endif - -/* tdeheap_index_delete_tuples bottom-up index deletion costing constants */ -#define BOTTOMUP_MAX_NBLOCKS 6 -#define BOTTOMUP_TOLERANCE_NBLOCKS 3 - -/* - * tdeheap_index_delete_tuples uses this when determining which heap blocks it - * must visit to help its bottom-up index deletion caller - */ -typedef struct IndexDeleteCounts -{ - int16 npromisingtids; /* Number of "promising" TIDs in group */ - int16 ntids; /* Number of TIDs in group */ - int16 ifirsttid; /* Offset to group's first deltid */ -} IndexDeleteCounts; - -/* - * This table maps tuple lock strength values for each particular - * MultiXactStatus value. - */ -static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = -{ - LockTupleKeyShare, /* ForKeyShare */ - LockTupleShare, /* ForShare */ - LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ - LockTupleExclusive, /* ForUpdate */ - LockTupleNoKeyExclusive, /* NoKeyUpdate */ - LockTupleExclusive /* Update */ -}; - -/* Get the LockTupleMode for a given MultiXactStatus */ -#define TUPLOCK_from_mxstatus(status) \ - (MultiXactStatusLock[(status)]) - -/* ---------------------------------------------------------------- - * heap support routines - * ---------------------------------------------------------------- - */ - -/* ---------------- - * initscan - scan code common to tdeheap_beginscan and tdeheap_rescan - * ---------------- - */ -static void -initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) -{ - ParallelBlockTableScanDesc bpscan = NULL; - bool allow_strat; - bool allow_sync; - - /* - * Determine the number of blocks we have to scan. - * - * It is sufficient to do this once at scan start, since any tuples added - * while the scan is in progress will be invisible to my snapshot anyway. - * (That is not true when using a non-MVCC snapshot. However, we couldn't - * guarantee to return tuples added after scan start anyway, since they - * might go into pages we already scanned. To guarantee consistent - * results for a non-MVCC snapshot, the caller must hold some higher-level - * lock that ensures the interesting tuple(s) won't change.) - */ - if (scan->rs_base.rs_parallel != NULL) - { - bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; - scan->rs_nblocks = bpscan->phs_nblocks; - } - else - scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd); - - /* - * If the table is large relative to NBuffers, use a bulk-read access - * strategy and enable synchronized scanning (see syncscan.c). Although - * the thresholds for these features could be different, we make them the - * same so that there are only two behaviors to tune rather than four. - * (However, some callers need to be able to disable one or both of these - * behaviors, independently of the size of the table; also there is a GUC - * variable that can disable synchronized scanning.) 
- * - * Note that table_block_parallelscan_initialize has a very similar test; - * if you change this, consider changing that one, too. - */ - if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) && - scan->rs_nblocks > NBuffers / 4) - { - allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0; - allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0; - } - else - allow_strat = allow_sync = false; - - if (allow_strat) - { - /* During a rescan, keep the previous strategy object. */ - if (scan->rs_strategy == NULL) - scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD); - } - else - { - if (scan->rs_strategy != NULL) - FreeAccessStrategy(scan->rs_strategy); - scan->rs_strategy = NULL; - } - - if (scan->rs_base.rs_parallel != NULL) - { - /* For parallel scan, believe whatever ParallelTableScanDesc says. */ - if (scan->rs_base.rs_parallel->phs_syncscan) - scan->rs_base.rs_flags |= SO_ALLOW_SYNC; - else - scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - } - else if (keep_startblock) - { - /* - * When rescanning, we want to keep the previous startblock setting, - * so that rewinding a cursor doesn't generate surprising results. - * Reset the active syncscan setting, though. - */ - if (allow_sync && synchronize_seqscans) - scan->rs_base.rs_flags |= SO_ALLOW_SYNC; - else - scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - } - else if (allow_sync && synchronize_seqscans) - { - scan->rs_base.rs_flags |= SO_ALLOW_SYNC; - scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks); - } - else - { - scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - scan->rs_startblock = 0; - } - - scan->rs_numblocks = InvalidBlockNumber; - scan->rs_inited = false; - scan->rs_ctup.t_data = NULL; - ItemPointerSetInvalid(&scan->rs_ctup.t_self); - scan->rs_cbuf = InvalidBuffer; - scan->rs_cblock = InvalidBlockNumber; - - /* page-at-a-time fields are always invalid when not rs_inited */ - - /* - * copy the scan key, if appropriate - */ - if (key != NULL && scan->rs_base.rs_nkeys > 0) - memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData)); - - /* - * Currently, we only have a stats counter for sequential heap scans (but - * e.g for bitmap scans the underlying bitmap index scans will be counted, - * and for sample scans we update stats for tuple fetches). - */ - if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN) - pgstat_count_tdeheap_scan(scan->rs_base.rs_rd); -} - -/* - * tdeheap_setscanlimits - restrict range of a heapscan - * - * startBlk is the page to start at - * numBlks is number of pages to scan (InvalidBlockNumber means "all") - */ -void -tdeheap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - - Assert(!scan->rs_inited); /* else too late to change */ - /* else rs_startblock is significant */ - Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC)); - - /* Check startBlk is valid (but allow case of zero blocks...) */ - Assert(startBlk == 0 || startBlk < scan->rs_nblocks); - - scan->rs_startblock = startBlk; - scan->rs_numblocks = numBlks; -} - -/* - * tdeheapgetpage - subroutine for tdeheapgettup() - * - * This routine reads and pins the specified page of the relation. - * In page-at-a-time mode it performs additional work, namely determining - * which tuples on the page are visible. 
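The "additional work" mentioned here is what makes page-at-a-time mode cheap later on: visibility is tested once per page while the buffer lock is held, and subsequent fetches just walk the saved offsets. A simplified standalone sketch of that idea (the names and the visibility test below are placeholders, not the real routines):

/* Sketch of the page-at-a-time idea: test visibility once per page,
 * remember the offsets that passed, then hand tuples out without
 * re-taking the buffer lock. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_TUPLES_PER_PAGE 32

static bool
toy_tuple_visible(int offset)
{
    return (offset % 3) != 0;       /* placeholder visibility test */
}

int
main(void)
{
    int vistuples[MAX_TUPLES_PER_PAGE];
    int ntuples = 0;

    /* "tdeheapgetpage": scan the page once, under the (imaginary) share lock. */
    for (int off = 1; off <= 10; off++)
        if (toy_tuple_visible(off))
            vistuples[ntuples++] = off;

    /* "tdeheapgettup_pagemode": later fetches just walk the saved offsets. */
    for (int i = 0; i < ntuples; i++)
        printf("return tuple at offset %d\n", vistuples[i]);
    return 0;
}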
- */ -void -tdeheapgetpage(TableScanDesc sscan, BlockNumber block) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - Buffer buffer; - Snapshot snapshot; - Page page; - int lines; - int ntup; - OffsetNumber lineoff; - bool all_visible; - - Assert(block < scan->rs_nblocks); - - /* release previous scan buffer, if any */ - if (BufferIsValid(scan->rs_cbuf)) - { - ReleaseBuffer(scan->rs_cbuf); - scan->rs_cbuf = InvalidBuffer; - } - - /* - * Be sure to check for interrupts at least once per page. Checks at - * higher code levels won't be able to stop a seqscan that encounters many - * pages' worth of consecutive dead tuples. - */ - CHECK_FOR_INTERRUPTS(); - - /* read page using selected strategy */ - scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, block, - RBM_NORMAL, scan->rs_strategy); - scan->rs_cblock = block; - - if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)) - return; - - buffer = scan->rs_cbuf; - snapshot = scan->rs_base.rs_snapshot; - - /* - * Prune and repair fragmentation for the whole page, if possible. - */ - tdeheap_page_prune_opt(scan->rs_base.rs_rd, buffer); - - /* - * We must hold share lock on the buffer content while examining tuple - * visibility. Afterwards, however, the tuples we have found to be - * visible are guaranteed good as long as we hold the buffer pin. - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); - - page = BufferGetPage(buffer); - TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, page); - lines = PageGetMaxOffsetNumber(page); - ntup = 0; - - /* - * If the all-visible flag indicates that all tuples on the page are - * visible to everyone, we can skip the per-tuple visibility tests. - * - * Note: In hot standby, a tuple that's already visible to all - * transactions on the primary might still be invisible to a read-only - * transaction in the standby. We partly handle this problem by tracking - * the minimum xmin of visible tuples as the cut-off XID while marking a - * page all-visible on the primary and WAL log that along with the - * visibility map SET operation. In hot standby, we wait for (or abort) - * all transactions that can potentially may not see one or more tuples on - * the page. That's how index-only scans work fine in hot standby. A - * crucial difference between index-only scans and heap scans is that the - * index-only scan completely relies on the visibility map where as heap - * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if - * the page-level flag can be trusted in the same way, because it might - * get propagated somehow without being explicitly WAL-logged, e.g. via a - * full page write. Until we can prove that beyond doubt, let's check each - * tuple for visibility the hard way. 
- */ - all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; - - for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++) - { - ItemId lpp = PageGetItemId(page, lineoff); - HeapTupleData loctup; - bool valid; - - if (!ItemIdIsNormal(lpp)) - continue; - - loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); - loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp); - loctup.t_len = ItemIdGetLength(lpp); - ItemPointerSet(&(loctup.t_self), block, lineoff); - - if (all_visible) - valid = true; - else - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - - HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, - &loctup, buffer, snapshot); - - if (valid) - scan->rs_vistuples[ntup++] = lineoff; - } - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - Assert(ntup <= MaxHeapTuplesPerPage); - scan->rs_ntuples = ntup; -} - -/* - * tdeheapgettup_initial_block - return the first BlockNumber to scan - * - * Returns InvalidBlockNumber when there are no blocks to scan. This can - * occur with empty tables and in parallel scans when parallel workers get all - * of the pages before we can get a chance to get our first page. - */ -static BlockNumber -tdeheapgettup_initial_block(HeapScanDesc scan, ScanDirection dir) -{ - Assert(!scan->rs_inited); - - /* When there are no pages to scan, return InvalidBlockNumber */ - if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0) - return InvalidBlockNumber; - - if (ScanDirectionIsForward(dir)) - { - /* serial scan */ - if (scan->rs_base.rs_parallel == NULL) - return scan->rs_startblock; - else - { - /* parallel scan */ - table_block_parallelscan_startblock_init(scan->rs_base.rs_rd, - scan->rs_parallelworkerdata, - (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel); - - /* may return InvalidBlockNumber if there are no more blocks */ - return table_block_parallelscan_nextpage(scan->rs_base.rs_rd, - scan->rs_parallelworkerdata, - (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel); - } - } - else - { - /* backward parallel scan not supported */ - Assert(scan->rs_base.rs_parallel == NULL); - - /* - * Disable reporting to syncscan logic in a backwards scan; it's not - * very likely anyone else is doing the same thing at the same time, - * and much more likely that we'll just bollix things for forward - * scanners. - */ - scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - - /* - * Start from last page of the scan. Ensure we take into account - * rs_numblocks if it's been adjusted by tdeheap_setscanlimits(). - */ - if (scan->rs_numblocks != InvalidBlockNumber) - return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks; - - if (scan->rs_startblock > 0) - return scan->rs_startblock - 1; - - return scan->rs_nblocks - 1; - } -} - - -/* - * tdeheapgettup_start_page - helper function for tdeheapgettup() - * - * Return the next page to scan based on the scan->rs_cbuf and set *linesleft - * to the number of tuples on this page. Also set *lineoff to the first - * offset to scan with forward scans getting the first offset and backward - * getting the final offset on the page. 
- */ -static Page -tdeheapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, - OffsetNumber *lineoff) -{ - Page page; - - Assert(scan->rs_inited); - Assert(BufferIsValid(scan->rs_cbuf)); - - /* Caller is responsible for ensuring buffer is locked if needed */ - page = BufferGetPage(scan->rs_cbuf); - - TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); - - *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1; - - if (ScanDirectionIsForward(dir)) - *lineoff = FirstOffsetNumber; - else - *lineoff = (OffsetNumber) (*linesleft); - - /* lineoff now references the physically previous or next tid */ - return page; -} - - -/* - * tdeheapgettup_continue_page - helper function for tdeheapgettup() - * - * Return the next page to scan based on the scan->rs_cbuf and set *linesleft - * to the number of tuples left to scan on this page. Also set *lineoff to - * the next offset to scan according to the ScanDirection in 'dir'. - */ -static inline Page -tdeheapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, - OffsetNumber *lineoff) -{ - Page page; - - Assert(scan->rs_inited); - Assert(BufferIsValid(scan->rs_cbuf)); - - /* Caller is responsible for ensuring buffer is locked if needed */ - page = BufferGetPage(scan->rs_cbuf); - - TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); - - if (ScanDirectionIsForward(dir)) - { - *lineoff = OffsetNumberNext(scan->rs_coffset); - *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1; - } - else - { - /* - * The previous returned tuple may have been vacuumed since the - * previous scan when we use a non-MVCC snapshot, so we must - * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant - */ - *lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset)); - *linesleft = *lineoff; - } - - /* lineoff now references the physically previous or next tid */ - return page; -} - -/* - * tdeheapgettup_advance_block - helper for tdeheapgettup() and tdeheapgettup_pagemode() - * - * Given the current block number, the scan direction, and various information - * contained in the scan descriptor, calculate the BlockNumber to scan next - * and return it. If there are no further blocks to scan, return - * InvalidBlockNumber to indicate this fact to the caller. - * - * This should not be called to determine the initial block number -- only for - * subsequent blocks. - * - * This also adjusts rs_numblocks when a limit has been imposed by - * tdeheap_setscanlimits(). - */ -static inline BlockNumber -tdeheapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir) -{ - if (ScanDirectionIsForward(dir)) - { - if (scan->rs_base.rs_parallel == NULL) - { - block++; - - /* wrap back to the start of the heap */ - if (block >= scan->rs_nblocks) - block = 0; - - /* - * Report our new scan position for synchronization purposes. We - * don't do that when moving backwards, however. That would just - * mess up any other forward-moving scanners. - * - * Note: we do this before checking for end of scan so that the - * final state of the position hint is back at the start of the - * rel. That's not strictly necessary, but otherwise when you run - * the same query multiple times the starting position would shift - * a little bit backwards on every invocation, which is confusing. - * We don't guarantee any specific ordering in general, though. 
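The forward-scan advancement described here can be modelled in isolation: increment the block number, wrap past the end of the relation, and stop once the scan arrives back at its starting block. A simplified sketch that ignores parallel scans and tdeheap_setscanlimits():

/* Simplified model of the forward-scan block advance. */
#include <stdint.h>
#include <stdio.h>

#define INVALID_BLOCK UINT32_MAX

static uint32_t
toy_advance_block(uint32_t block, uint32_t nblocks, uint32_t startblock)
{
    block++;
    if (block >= nblocks)
        block = 0;                  /* wrap back to the start of the heap */
    if (block == startblock)
        return INVALID_BLOCK;       /* back where we started: scan is done */
    return block;
}

int
main(void)
{
    uint32_t nblocks = 5, start = 3, block = start;

    /* A synchronized scan may start anywhere; this visits 3, 4, 0, 1, 2. */
    do
    {
        printf("scan block %u\n", (unsigned) block);
        block = toy_advance_block(block, nblocks, start);
    } while (block != INVALID_BLOCK);
    return 0;
}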
- */ - if (scan->rs_base.rs_flags & SO_ALLOW_SYNC) - ss_report_location(scan->rs_base.rs_rd, block); - - /* we're done if we're back at where we started */ - if (block == scan->rs_startblock) - return InvalidBlockNumber; - - /* check if the limit imposed by tdeheap_setscanlimits() is met */ - if (scan->rs_numblocks != InvalidBlockNumber) - { - if (--scan->rs_numblocks == 0) - return InvalidBlockNumber; - } - - return block; - } - else - { - return table_block_parallelscan_nextpage(scan->rs_base.rs_rd, - scan->rs_parallelworkerdata, (ParallelBlockTableScanDesc) - scan->rs_base.rs_parallel); - } - } - else - { - /* we're done if the last block is the start position */ - if (block == scan->rs_startblock) - return InvalidBlockNumber; - - /* check if the limit imposed by tdeheap_setscanlimits() is met */ - if (scan->rs_numblocks != InvalidBlockNumber) - { - if (--scan->rs_numblocks == 0) - return InvalidBlockNumber; - } - - /* wrap to the end of the heap when the last page was page 0 */ - if (block == 0) - block = scan->rs_nblocks; - - block--; - - return block; - } -} - -/* ---------------- - * tdeheapgettup - fetch next heap tuple - * - * Initialize the scan if not already done; then advance to the next - * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup, - * or set scan->rs_ctup.t_data = NULL if no more tuples. - * - * Note: the reason nkeys/key are passed separately, even though they are - * kept in the scan descriptor, is that the caller may not want us to check - * the scankeys. - * - * Note: when we fall off the end of the scan in either direction, we - * reset rs_inited. This means that a further request with the same - * scan direction will restart the scan, which is a bit odd, but a - * request with the opposite scan direction will start a fresh scan - * in the proper direction. The latter is required behavior for cursors, - * while the former case is generally undefined behavior in Postgres - * so we don't care too much. - * ---------------- - */ -static void -tdeheapgettup(HeapScanDesc scan, - ScanDirection dir, - int nkeys, - ScanKey key) -{ - HeapTuple tuple = &(scan->rs_ctup); - BlockNumber block; - Page page; - OffsetNumber lineoff; - int linesleft; - - if (unlikely(!scan->rs_inited)) - { - block = tdeheapgettup_initial_block(scan, dir); - /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */ - Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf)); - scan->rs_inited = true; - } - else - { - /* continue from previously returned page/tuple */ - block = scan->rs_cblock; - - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - page = tdeheapgettup_continue_page(scan, dir, &linesleft, &lineoff); - goto continue_page; - } - - /* - * advance the scan until we find a qualifying tuple or run out of stuff - * to scan - */ - while (block != InvalidBlockNumber) - { - tdeheapgetpage((TableScanDesc) scan, block); - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - page = tdeheapgettup_start_page(scan, dir, &linesleft, &lineoff); -continue_page: - - /* - * Only continue scanning the page while we have lines left. - * - * Note that this protects us from accessing line pointers past - * PageGetMaxOffsetNumber(); both for forward scans when we resume the - * table scan, and for when we start scanning a new page. 
- */ - for (; linesleft > 0; linesleft--, lineoff += dir) - { - bool visible; - ItemId lpp = PageGetItemId(page, lineoff); - - if (!ItemIdIsNormal(lpp)) - continue; - - tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp); - tuple->t_len = ItemIdGetLength(lpp); - ItemPointerSet(&(tuple->t_self), block, lineoff); - - visible = HeapTupleSatisfiesVisibility(tuple, - scan->rs_base.rs_snapshot, - scan->rs_cbuf); - - HeapCheckForSerializableConflictOut(visible, scan->rs_base.rs_rd, - tuple, scan->rs_cbuf, - scan->rs_base.rs_snapshot); - - /* skip tuples not visible to this snapshot */ - if (!visible) - continue; - - /* skip any tuples that don't match the scan key */ - if (key != NULL && - !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), - nkeys, key)) - continue; - - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - scan->rs_coffset = lineoff; - return; - } - - /* - * if we get here, it means we've exhausted the items on this page and - * it's time to move to the next. - */ - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - /* get the BlockNumber to scan next */ - block = tdeheapgettup_advance_block(scan, block, dir); - } - - /* end of scan */ - if (BufferIsValid(scan->rs_cbuf)) - ReleaseBuffer(scan->rs_cbuf); - - scan->rs_cbuf = InvalidBuffer; - scan->rs_cblock = InvalidBlockNumber; - tuple->t_data = NULL; - scan->rs_inited = false; -} - -/* ---------------- - * tdeheapgettup_pagemode - fetch next heap tuple in page-at-a-time mode - * - * Same API as tdeheapgettup, but used in page-at-a-time mode - * - * The internal logic is much the same as tdeheapgettup's too, but there are some - * differences: we do not take the buffer content lock (that only needs to - * happen inside tdeheapgetpage), and we iterate through just the tuples listed - * in rs_vistuples[] rather than all tuples on the page. Notice that - * lineindex is 0-based, where the corresponding loop variable lineoff in - * tdeheapgettup is 1-based. - * ---------------- - */ -static void -tdeheapgettup_pagemode(HeapScanDesc scan, - ScanDirection dir, - int nkeys, - ScanKey key) -{ - HeapTuple tuple = &(scan->rs_ctup); - BlockNumber block; - Page page; - int lineindex; - int linesleft; - - if (unlikely(!scan->rs_inited)) - { - block = tdeheapgettup_initial_block(scan, dir); - /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */ - Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf)); - scan->rs_inited = true; - } - else - { - /* continue from previously returned page/tuple */ - block = scan->rs_cblock; /* current page */ - page = BufferGetPage(scan->rs_cbuf); - TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); - - lineindex = scan->rs_cindex + dir; - if (ScanDirectionIsForward(dir)) - linesleft = scan->rs_ntuples - lineindex; - else - linesleft = scan->rs_cindex; - /* lineindex now references the next or previous visible tid */ - - goto continue_page; - } - - /* - * advance the scan until we find a qualifying tuple or run out of stuff - * to scan - */ - while (block != InvalidBlockNumber) - { - tdeheapgetpage((TableScanDesc) scan, block); - page = BufferGetPage(scan->rs_cbuf); - TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page); - linesleft = scan->rs_ntuples; - lineindex = ScanDirectionIsForward(dir) ? 
0 : linesleft - 1; - - /* lineindex now references the next or previous visible tid */ -continue_page: - - for (; linesleft > 0; linesleft--, lineindex += dir) - { - ItemId lpp; - OffsetNumber lineoff; - - lineoff = scan->rs_vistuples[lineindex]; - lpp = PageGetItemId(page, lineoff); - Assert(ItemIdIsNormal(lpp)); - - tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp); - tuple->t_len = ItemIdGetLength(lpp); - ItemPointerSet(&(tuple->t_self), block, lineoff); - - /* skip any tuples that don't match the scan key */ - if (key != NULL && - !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), - nkeys, key)) - continue; - - scan->rs_cindex = lineindex; - return; - } - - /* get the BlockNumber to scan next */ - block = tdeheapgettup_advance_block(scan, block, dir); - } - - /* end of scan */ - if (BufferIsValid(scan->rs_cbuf)) - ReleaseBuffer(scan->rs_cbuf); - scan->rs_cbuf = InvalidBuffer; - scan->rs_cblock = InvalidBlockNumber; - tuple->t_data = NULL; - scan->rs_inited = false; -} - - -/* ---------------------------------------------------------------- - * heap access method interface - * ---------------------------------------------------------------- - */ - - -TableScanDesc -tdeheap_beginscan(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key, - ParallelTableScanDesc parallel_scan, - uint32 flags) -{ - HeapScanDesc scan; - - /* - * increment relation ref count while scanning relation - * - * This is just to make really sure the relcache entry won't go away while - * the scan has a pointer to it. Caller should be holding the rel open - * anyway, so this is redundant in all normal scenarios... - */ - RelationIncrementReferenceCount(relation); - - /* - * allocate and initialize scan descriptor - */ - scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData)); - - scan->rs_base.rs_rd = relation; - scan->rs_base.rs_snapshot = snapshot; - scan->rs_base.rs_nkeys = nkeys; - scan->rs_base.rs_flags = flags; - scan->rs_base.rs_parallel = parallel_scan; - scan->rs_strategy = NULL; /* set in initscan */ - - /* - * Disable page-at-a-time mode if it's not a MVCC-safe snapshot. - */ - if (!(snapshot && IsMVCCSnapshot(snapshot))) - scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; - - /* - * For seqscan and sample scans in a serializable transaction, acquire a - * predicate lock on the entire relation. This is required not only to - * lock all the matching tuples, but also to conflict with new insertions - * into the table. In an indexscan, we take page locks on the index pages - * covering the range specified in the scan qual, but in a heap scan there - * is nothing more fine-grained to lock. A bitmap scan is a different - * story, there we have already scanned the index and locked the index - * pages covering the predicate. But in that case we still have to lock - * any matching heap tuples. For sample scan we could optimize the locking - * to be at least page-level granularity, but we'd need to add per-tuple - * locking for that. - */ - if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN)) - { - /* - * Ensure a missing snapshot is noticed reliably, even if the - * isolation mode means predicate locking isn't performed (and - * therefore the snapshot isn't used here). - */ - Assert(snapshot); - PredicateLockRelation(relation, snapshot); - } - - /* we only need to set this up once */ - scan->rs_ctup.t_tableOid = RelationGetRelid(relation); - - /* - * Allocate memory to keep track of page allocation for parallel workers - * when doing a parallel scan. 
- */ - if (parallel_scan != NULL) - scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData)); - else - scan->rs_parallelworkerdata = NULL; - - /* - * we do this here instead of in initscan() because tdeheap_rescan also calls - * initscan() and we don't want to allocate memory again - */ - if (nkeys > 0) - scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); - else - scan->rs_base.rs_key = NULL; - - initscan(scan, key, false); - - return (TableScanDesc) scan; -} - -void -tdeheap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, - bool allow_strat, bool allow_sync, bool allow_pagemode) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - - if (set_params) - { - if (allow_strat) - scan->rs_base.rs_flags |= SO_ALLOW_STRAT; - else - scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT; - - if (allow_sync) - scan->rs_base.rs_flags |= SO_ALLOW_SYNC; - else - scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; - - if (allow_pagemode && scan->rs_base.rs_snapshot && - IsMVCCSnapshot(scan->rs_base.rs_snapshot)) - scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE; - else - scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; - } - - /* - * unpin scan buffers - */ - if (BufferIsValid(scan->rs_cbuf)) - ReleaseBuffer(scan->rs_cbuf); - - /* - * reinitialize scan descriptor - */ - initscan(scan, key, true); -} - -void -tdeheap_endscan(TableScanDesc sscan) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - - /* Note: no locking manipulations needed */ - - /* - * unpin scan buffers - */ - if (BufferIsValid(scan->rs_cbuf)) - ReleaseBuffer(scan->rs_cbuf); - - /* - * decrement relation reference count and free scan descriptor storage - */ - RelationDecrementReferenceCount(scan->rs_base.rs_rd); - - if (scan->rs_base.rs_key) - pfree(scan->rs_base.rs_key); - - if (scan->rs_strategy != NULL) - FreeAccessStrategy(scan->rs_strategy); - - if (scan->rs_parallelworkerdata != NULL) - pfree(scan->rs_parallelworkerdata); - - if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) - UnregisterSnapshot(scan->rs_base.rs_snapshot); - - pfree(scan); -} - -HeapTuple -tdeheap_getnext(TableScanDesc sscan, ScanDirection direction) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - - /* - * This is still widely used directly, without going through table AM, so - * add a safety check. It's possible we should, at a later point, - * downgrade this to an assert. The reason for checking the AM routine, - * rather than the AM oid, is that this allows to write regression tests - * that create another AM reusing the heap handler. - */ - if (unlikely(sscan->rs_rd->rd_tableam != GetPGTdeamTableAmRoutine())) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg_internal("only pg_tde AM is supported"))); - - /* - * We don't expect direct calls to tdeheap_getnext with valid CheckXidAlive - * for catalog or regular tables. See detailed comments in xact.c where - * these variables are declared. Normally we have such a check at tableam - * level API but this is called from many places so we need to ensure it - * here. 
- */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected tdeheap_getnext call during logical decoding"); - - /* Note: no locking manipulations needed */ - - if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) - tdeheapgettup_pagemode(scan, direction, - scan->rs_base.rs_nkeys, scan->rs_base.rs_key); - else - tdeheapgettup(scan, direction, - scan->rs_base.rs_nkeys, scan->rs_base.rs_key); - - if (scan->rs_ctup.t_data == NULL) - return NULL; - - /* - * if we get here it means we have a new current scan tuple, so point to - * the proper return buffer and return the tuple. - */ - - pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); - - return &scan->rs_ctup; -} - -bool -tdeheap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - - /* Note: no locking manipulations needed */ - - if (sscan->rs_flags & SO_ALLOW_PAGEMODE) - tdeheapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); - else - tdeheapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); - - if (scan->rs_ctup.t_data == NULL) - { - TdeSlotForgetDecryptedTuple(slot); - ExecClearTuple(slot); - return false; - } - - /* - * if we get here it means we have a new current scan tuple, so point to - * the proper return buffer and return the tuple. - */ - - pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); - - PGTdeExecStoreBufferHeapTuple(sscan->rs_rd, &scan->rs_ctup, slot, - scan->rs_cbuf); - return true; -} - -void -tdeheap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, - ItemPointer maxtid) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - BlockNumber startBlk; - BlockNumber numBlks; - ItemPointerData highestItem; - ItemPointerData lowestItem; - - /* - * For relations without any pages, we can simply leave the TID range - * unset. There will be no tuples to scan, therefore no tuples outside - * the given TID range. - */ - if (scan->rs_nblocks == 0) - return; - - /* - * Set up some ItemPointers which point to the first and last possible - * tuples in the heap. - */ - ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber); - ItemPointerSet(&lowestItem, 0, FirstOffsetNumber); - - /* - * If the given maximum TID is below the highest possible TID in the - * relation, then restrict the range to that, otherwise we scan to the end - * of the relation. - */ - if (ItemPointerCompare(maxtid, &highestItem) < 0) - ItemPointerCopy(maxtid, &highestItem); - - /* - * If the given minimum TID is above the lowest possible TID in the - * relation, then restrict the range to only scan for TIDs above that. - */ - if (ItemPointerCompare(mintid, &lowestItem) > 0) - ItemPointerCopy(mintid, &lowestItem); - - /* - * Check for an empty range and protect from would be negative results - * from the numBlks calculation below. - */ - if (ItemPointerCompare(&highestItem, &lowestItem) < 0) - { - /* Set an empty range of blocks to scan */ - tdeheap_setscanlimits(sscan, 0, 0); - return; - } - - /* - * Calculate the first block and the number of blocks we must scan. We - * could be more aggressive here and perform some more validation to try - * and further narrow the scope of blocks to scan by checking if the - * lowestItem has an offset above MaxOffsetNumber. In this case, we could - * advance startBlk by one. Likewise, if highestItem has an offset of 0 - * we could scan one fewer blocks. However, such an optimization does not - * seem worth troubling over, currently. 
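The arithmetic that follows is easy to check by hand: the scan starts at the block of the clamped minimum TID and covers every block up to and including the block of the clamped maximum TID. A small worked example with made-up block numbers:

/* Worked example of turning a TID range into a block range. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t lowest_block  = 10;    /* block of the (clamped) minimum TID */
    uint32_t highest_block = 42;    /* block of the (clamped) maximum TID */

    uint32_t startBlk = lowest_block;
    uint32_t numBlks  = highest_block - lowest_block + 1;

    /* A TID range covering blocks 10..42 scans 33 blocks starting at 10. */
    printf("startBlk = %u, numBlks = %u\n", (unsigned) startBlk, (unsigned) numBlks);
    return 0;
}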
- */ - startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem); - - numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) - - ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1; - - /* Set the start block and number of blocks to scan */ - tdeheap_setscanlimits(sscan, startBlk, numBlks); - - /* Finally, set the TID range in sscan */ - ItemPointerCopy(&lowestItem, &sscan->rs_mintid); - ItemPointerCopy(&highestItem, &sscan->rs_maxtid); -} - -bool -tdeheap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, - TupleTableSlot *slot) -{ - HeapScanDesc scan = (HeapScanDesc) sscan; - ItemPointer mintid = &sscan->rs_mintid; - ItemPointer maxtid = &sscan->rs_maxtid; - - /* Note: no locking manipulations needed */ - for (;;) - { - if (sscan->rs_flags & SO_ALLOW_PAGEMODE) - tdeheapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); - else - tdeheapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); - - if (scan->rs_ctup.t_data == NULL) - { - TdeSlotForgetDecryptedTuple(slot); - ExecClearTuple(slot); - return false; - } - - /* - * tdeheap_set_tidrange will have used tdeheap_setscanlimits to limit the - * range of pages we scan to only ones that can contain the TID range - * we're scanning for. Here we must filter out any tuples from these - * pages that are outside of that range. - */ - if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0) - { - ExecClearTuple(slot); - - /* - * When scanning backwards, the TIDs will be in descending order. - * Future tuples in this direction will be lower still, so we can - * just return false to indicate there will be no more tuples. - */ - if (ScanDirectionIsBackward(direction)) - return false; - - continue; - } - - /* - * Likewise for the final page, we must filter out TIDs greater than - * maxtid. - */ - if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0) - { - ExecClearTuple(slot); - - /* - * When scanning forward, the TIDs will be in ascending order. - * Future tuples in this direction will be higher still, so we can - * just return false to indicate there will be no more tuples. - */ - if (ScanDirectionIsForward(direction)) - return false; - continue; - } - - break; - } - - /* - * if we get here it means we have a new current scan tuple, so point to - * the proper return buffer and return the tuple. - */ - pgstat_count_tdeheap_getnext(scan->rs_base.rs_rd); - - PGTdeExecStoreBufferHeapTuple(sscan->rs_rd, &scan->rs_ctup, slot, scan->rs_cbuf); - return true; -} - -/* - * tdeheap_fetch - retrieve tuple with given tid - * - * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding - * the tuple, fill in the remaining fields of *tuple, and check the tuple - * against the specified snapshot. - * - * If successful (tuple found and passes snapshot time qual), then *userbuf - * is set to the buffer holding the tuple and true is returned. The caller - * must unpin the buffer when done with the tuple. - * - * If the tuple is not found (ie, item number references a deleted slot), - * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer, - * and false is returned. - * - * If the tuple is found but fails the time qual check, then the behavior - * depends on the keep_buf parameter. If keep_buf is false, the results - * are the same as for the tuple-not-found case. If keep_buf is true, - * then tuple->t_data and *userbuf are returned as for the success case, - * and again the caller must unpin the buffer; but false is returned. 
- * - * tdeheap_fetch does not follow HOT chains: only the exact TID requested will - * be fetched. - * - * It is somewhat inconsistent that we ereport() on invalid block number but - * return false on invalid item number. There are a couple of reasons though. - * One is that the caller can relatively easily check the block number for - * validity, but cannot check the item number without reading the page - * himself. Another is that when we are following a t_ctid link, we can be - * reasonably confident that the page number is valid (since VACUUM shouldn't - * truncate off the destination page without having killed the referencing - * tuple first), but the item number might well not be good. - */ -bool -tdeheap_fetch(Relation relation, - Snapshot snapshot, - HeapTuple tuple, - Buffer *userbuf, - bool keep_buf) -{ - ItemPointer tid = &(tuple->t_self); - ItemId lp; - Buffer buffer; - Page page; - OffsetNumber offnum; - bool valid; - - /* - * Fetch and pin the appropriate page of the relation. - */ - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); - - /* - * Need share lock on buffer to examine tuple commit status. - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); - page = BufferGetPage(buffer); - TestForOldSnapshot(snapshot, relation, page); - - /* - * We'd better check for out-of-range offnum in case of VACUUM since the - * TID was obtained. - */ - offnum = ItemPointerGetOffsetNumber(tid); - if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); - *userbuf = InvalidBuffer; - tuple->t_data = NULL; - return false; - } - - /* - * get the item line pointer corresponding to the requested tid - */ - lp = PageGetItemId(page, offnum); - - /* - * Must check for deleted tuple. - */ - if (!ItemIdIsNormal(lp)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); - *userbuf = InvalidBuffer; - tuple->t_data = NULL; - return false; - } - - /* - * fill in *tuple fields - */ - tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); - tuple->t_len = ItemIdGetLength(lp); - tuple->t_tableOid = RelationGetRelid(relation); - - /* - * check tuple visibility, then release lock - */ - valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); - - if (valid) - PredicateLockTID(relation, &(tuple->t_self), snapshot, - HeapTupleHeaderGetXmin(tuple->t_data)); - - HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - if (valid) - { - /* - * All checks passed, so return the tuple as valid. Caller is now - * responsible for releasing the buffer. - */ - *userbuf = buffer; - - return true; - } - - /* Tuple failed time qual, but maybe caller wants to see it anyway. */ - if (keep_buf) - *userbuf = buffer; - else - { - ReleaseBuffer(buffer); - *userbuf = InvalidBuffer; - tuple->t_data = NULL; - } - - return false; -} - -/* - * tdeheap_hot_search_buffer - search HOT chain for tuple satisfying snapshot - * - * On entry, *tid is the TID of a tuple (either a simple tuple, or the root - * of a HOT chain), and buffer is the buffer holding this tuple. We search - * for the first chain member satisfying the given snapshot. If one is - * found, we update *tid to reference that tuple's offset number, and - * return true. If no match, return false without modifying *tid. - * - * heapTuple is a caller-supplied buffer. When a match is found, we return - * the tuple here, in addition to updating *tid. 
If no match is found, the - * contents of this buffer on return are undefined. - * - * If all_dead is not NULL, we check non-visible tuples to see if they are - * globally dead; *all_dead is set true if all members of the HOT chain - * are vacuumable, false if not. - * - * Unlike tdeheap_fetch, the caller must already have pin and (at least) share - * lock on the buffer; it is still pinned/locked at exit. - */ -bool -tdeheap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, - Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call) -{ - Page page = BufferGetPage(buffer); - TransactionId prev_xmax = InvalidTransactionId; - BlockNumber blkno; - OffsetNumber offnum; - bool at_chain_start; - bool valid; - bool skip; - GlobalVisState *vistest = NULL; - - /* If this is not the first call, previous call returned a (live!) tuple */ - if (all_dead) - *all_dead = first_call; - - blkno = ItemPointerGetBlockNumber(tid); - offnum = ItemPointerGetOffsetNumber(tid); - at_chain_start = first_call; - skip = !first_call; - - /* XXX: we should assert that a snapshot is pushed or registered */ - Assert(TransactionIdIsValid(RecentXmin)); - Assert(BufferGetBlockNumber(buffer) == blkno); - - /* Scan through possible multiple members of HOT-chain */ - for (;;) - { - ItemId lp; - - /* check for bogus TID */ - if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) - break; - - lp = PageGetItemId(page, offnum); - - /* check for unused, dead, or redirected items */ - if (!ItemIdIsNormal(lp)) - { - /* We should only see a redirect at start of chain */ - if (ItemIdIsRedirected(lp) && at_chain_start) - { - /* Follow the redirect */ - offnum = ItemIdGetRedirect(lp); - at_chain_start = false; - continue; - } - /* else must be end of chain */ - break; - } - - /* - * Update heapTuple to point to the element of the HOT chain we're - * currently investigating. Having t_self set correctly is important - * because the SSI checks and the *Satisfies routine for historical - * MVCC snapshots need the correct tid to decide about the visibility. - */ - heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); - heapTuple->t_len = ItemIdGetLength(lp); - heapTuple->t_tableOid = RelationGetRelid(relation); - ItemPointerSet(&heapTuple->t_self, blkno, offnum); - - /* - * Shouldn't see a HEAP_ONLY tuple at chain start. - */ - if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) - break; - - /* - * The xmin should match the previous xmax value, else chain is - * broken. - */ - if (TransactionIdIsValid(prev_xmax) && - !TransactionIdEquals(prev_xmax, - HeapTupleHeaderGetXmin(heapTuple->t_data))) - break; - - /* - * When first_call is true (and thus, skip is initially false) we'll - * return the first tuple we find. But on later passes, heapTuple - * will initially be pointing to the tuple we returned last time. - * Returning it again would be incorrect (and would loop forever), so - * we skip it and return the next match we find. - */ - if (!skip) - { - /* If it's visible per the snapshot, we must return it */ - valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); - HeapCheckForSerializableConflictOut(valid, relation, heapTuple, - buffer, snapshot); - - if (valid) - { - ItemPointerSetOffsetNumber(tid, offnum); - PredicateLockTID(relation, &heapTuple->t_self, snapshot, - HeapTupleHeaderGetXmin(heapTuple->t_data)); - if (all_dead) - *all_dead = false; - return true; - } - } - skip = false; - - /* - * If we can't see it, maybe no one else can either. 
At caller - * request, check whether all chain members are dead to all - * transactions. - * - * Note: if you change the criterion here for what is "dead", fix the - * planner's get_actual_variable_range() function to match. - */ - if (all_dead && *all_dead) - { - if (!vistest) - vistest = GlobalVisTestFor(relation); - - if (!HeapTupleIsSurelyDead(heapTuple, vistest)) - *all_dead = false; - } - - /* - * Check to see if HOT chain continues past this tuple; if so fetch - * the next offnum and loop around. - */ - if (HeapTupleIsHotUpdated(heapTuple)) - { - Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) == - blkno); - offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid); - at_chain_start = false; - prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); - } - else - break; /* end of chain */ - } - - return false; -} - -/* - * tdeheap_get_latest_tid - get the latest tid of a specified tuple - * - * Actually, this gets the latest version that is visible according to the - * scan's snapshot. Create a scan using SnapshotDirty to get the very latest, - * possibly uncommitted version. - * - * *tid is both an input and an output parameter: it is updated to - * show the latest version of the row. Note that it will not be changed - * if no version of the row passes the snapshot test. - */ -void -tdeheap_get_latest_tid(TableScanDesc sscan, - ItemPointer tid) -{ - Relation relation = sscan->rs_rd; - Snapshot snapshot = sscan->rs_snapshot; - ItemPointerData ctid; - TransactionId priorXmax; - - /* - * table_tuple_get_latest_tid() verified that the passed in tid is valid. - * Assume that t_ctid links are valid however - there shouldn't be invalid - * ones in the table. - */ - Assert(ItemPointerIsValid(tid)); - - /* - * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we - * need to examine, and *tid is the TID we will return if ctid turns out - * to be bogus. - * - * Note that we will loop until we reach the end of the t_ctid chain. - * Depending on the snapshot passed, there might be at most one visible - * version of the row, but we don't try to optimize for that. - */ - ctid = *tid; - priorXmax = InvalidTransactionId; /* cannot check first XMIN */ - for (;;) - { - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp; - HeapTupleData tp; - bool valid; - - /* - * Read, pin, and lock the page. - */ - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - page = BufferGetPage(buffer); - TestForOldSnapshot(snapshot, relation, page); - - /* - * Check for bogus item number. This is not treated as an error - * condition because it can happen while following a t_ctid link. We - * just assume that the prior tid is OK and return it unchanged. - */ - offnum = ItemPointerGetOffsetNumber(&ctid); - if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) - { - UnlockReleaseBuffer(buffer); - break; - } - lp = PageGetItemId(page, offnum); - if (!ItemIdIsNormal(lp)) - { - UnlockReleaseBuffer(buffer); - break; - } - - /* OK to access the tuple */ - tp.t_self = ctid; - tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); - tp.t_len = ItemIdGetLength(lp); - tp.t_tableOid = RelationGetRelid(relation); - - /* - * After following a t_ctid link, we might arrive at an unrelated - * tuple. Check for XMIN match. 
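The XMIN check mentioned here is the chain-continuity rule also used by tdeheap_hot_search_buffer(): a newer version belongs to the chain only if its xmin matches the xmax recorded on the version that pointed at it. A toy standalone illustration with made-up transaction ids:

/* Toy illustration of the chain-continuity rule. */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t ToyXid;

typedef struct
{
    ToyXid xmin;        /* inserting transaction */
    ToyXid xmax;        /* updating/deleting transaction, 0 if none */
    int    next;        /* index of the newer version, -1 at end of chain */
} ToyVersion;

int
main(void)
{
    /* v0 was updated by xid 500 into v1; the slot for v2 was reused later. */
    ToyVersion chain[] = {
        {100, 500, 1},
        {500, 700, 2},
        {900,   0, -1},     /* unrelated tuple: xmin != prior xmax (700) */
    };

    ToyXid prior_xmax = 0;
    for (int i = 0; i != -1; i = chain[i].next)
    {
        if (prior_xmax != 0 && chain[i].xmin != prior_xmax)
        {
            printf("chain broken at version %d\n", i);
            break;
        }
        printf("version %d is part of the chain\n", i);
        prior_xmax = chain[i].xmax;
        if (chain[i].next == -1)
            break;
    }
    return 0;
}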
- */ - if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) - { - UnlockReleaseBuffer(buffer); - break; - } - - /* - * Check tuple visibility; if visible, set it as the new result - * candidate. - */ - valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); - HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); - if (valid) - *tid = ctid; - - /* - * If there's a valid t_ctid link, follow it, else we're done. - */ - if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || - HeapTupleHeaderIsOnlyLocked(tp.t_data) || - HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) || - ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) - { - UnlockReleaseBuffer(buffer); - break; - } - - ctid = tp.t_data->t_ctid; - priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); - UnlockReleaseBuffer(buffer); - } /* end of loop */ -} - - -/* - * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends - * - * This is called after we have waited for the XMAX transaction to terminate. - * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will - * be set on exit. If the transaction committed, we set the XMAX_COMMITTED - * hint bit if possible --- but beware that that may not yet be possible, - * if the transaction committed asynchronously. - * - * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID - * even if it commits. - * - * Hence callers should look only at XMAX_INVALID. - * - * Note this is not allowed for tuples whose xmax is a multixact. - */ -static void -UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid) -{ - Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid)); - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); - - if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))) - { - if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) && - TransactionIdDidCommit(xid)) - HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - xid); - else - HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - } -} - - -/* - * GetBulkInsertState - prepare status object for a bulk insert - */ -BulkInsertState -GetBulkInsertState(void) -{ - BulkInsertState bistate; - - bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData)); - bistate->strategy = GetAccessStrategy(BAS_BULKWRITE); - bistate->current_buf = InvalidBuffer; - bistate->next_free = InvalidBlockNumber; - bistate->last_free = InvalidBlockNumber; - bistate->already_extended_by = 0; - return bistate; -} - -/* - * FreeBulkInsertState - clean up after finishing a bulk insert - */ -void -FreeBulkInsertState(BulkInsertState bistate) -{ - if (bistate->current_buf != InvalidBuffer) - ReleaseBuffer(bistate->current_buf); - FreeAccessStrategy(bistate->strategy); - pfree(bistate); -} - -/* - * ReleaseBulkInsertStatePin - release a buffer currently held in bistate - */ -void -ReleaseBulkInsertStatePin(BulkInsertState bistate) -{ - if (bistate->current_buf != InvalidBuffer) - ReleaseBuffer(bistate->current_buf); - bistate->current_buf = InvalidBuffer; - - /* - * Despite the name, we also reset bulk relation extension state. - * Otherwise we can end up erroring out due to looking for free space in - * ->next_free of one partition, even though ->next_free was set when - * extending another partition. It could obviously also be bad for - * efficiency to look at existing blocks at offsets from another - * partition, even if we don't error out. 
- */ - bistate->next_free = InvalidBlockNumber; - bistate->last_free = InvalidBlockNumber; -} - - -/* - * tdeheap_insert - insert tuple into a heap - * - * The new tuple is stamped with current transaction ID and the specified - * command ID. - * - * See table_tuple_insert for comments about most of the input flags, except - * that this routine directly takes a tuple rather than a slot. - * - * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_ - * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to - * implement table_tuple_insert_speculative(). - * - * On return the header fields of *tup are updated to match the stored tuple; - * in particular tup->t_self receives the actual TID where the tuple was - * stored. But note that any toasting of fields within the tuple data is NOT - * reflected into *tup. - */ -void -tdeheap_insert(Relation relation, HeapTuple tup, CommandId cid, - int options, BulkInsertState bistate) -{ - TransactionId xid = GetCurrentTransactionId(); - HeapTuple heaptup; - Buffer buffer; - Buffer vmbuffer = InvalidBuffer; - bool all_visible_cleared = false; - - /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ - Assert(HeapTupleHeaderGetNatts(tup->t_data) <= - RelationGetNumberOfAttributes(relation)); - - /* - * Fill in tuple header fields and toast the tuple if necessary. - * - * Note: below this point, heaptup is the data we actually intend to store - * into the relation; tup is the caller's original untoasted data. - */ - heaptup = tdeheap_prepare_insert(relation, tup, xid, cid, options); - - /* - * Find buffer to insert this tuple into. If the page is all visible, - * this will also pin the requisite visibility map page. - */ - buffer = tdeheap_RelationGetBufferForTuple(relation, heaptup->t_len, - InvalidBuffer, options, bistate, - &vmbuffer, NULL, - 0); - - /* - * We're about to do the actual insert -- but check for conflict first, to - * avoid possibly having to roll back work we've just done. - * - * This is safe without a recheck as long as there is no possibility of - * another process scanning the page between this check and the insert - * being visible to the scan (i.e., an exclusive buffer content lock is - * continuously held from this point until the tuple insert is visible). - * - * For a heap insert, we only need to check for table-level SSI locks. Our - * new tuple can't possibly conflict with existing tuple locks, and heap - * page locks are only consolidated versions of tuple locks; they do not - * lock "gaps" as index page locks do. So we don't need to specify a - * buffer when making the call, which makes for a faster check. - */ - CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); - - /* - * Make sure relation keys in the cahce to avoid pallocs in - * the critical section. - */ - GetRelationKey(relation->rd_locator); - - /* NO EREPORT(ERROR) from here till changes are logged */ - START_CRIT_SECTION(); - - tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, - (options & HEAP_INSERT_TDE_NO_ENCRYPT) == 0, - (options & HEAP_INSERT_SPECULATIVE) != 0); - - if (PageIsAllVisible(BufferGetPage(buffer))) - { - all_visible_cleared = true; - PageClearAllVisible(BufferGetPage(buffer)); - tdeheap_visibilitymap_clear(relation, - ItemPointerGetBlockNumber(&(heaptup->t_self)), - vmbuffer, VISIBILITYMAP_VALID_BITS); - } - - /* - * XXX Should we set PageSetPrunable on this page ? 
- * - * The inserting transaction may eventually abort thus making this tuple - * DEAD and hence available for pruning. Though we don't want to optimize - * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the - * aborted tuple will never be pruned until next vacuum is triggered. - * - * If you do add PageSetPrunable here, add it in tdeheap_xlog_insert too. - */ - - MarkBufferDirty(buffer); - - /* XLOG stuff */ - if (RelationNeedsWAL(relation)) - { - xl_tdeheap_insert xlrec; - xl_tdeheap_header xlhdr; - XLogRecPtr recptr; - Page page = BufferGetPage(buffer); - uint8 info = XLOG_HEAP_INSERT; - int bufflags = 0; - - /* - * If this is a catalog, we need to transmit combo CIDs to properly - * decode, so log that as well. - */ - if (RelationIsAccessibleInLogicalDecoding(relation)) - log_tdeheap_new_cid(relation, heaptup); - - /* - * If this is the single and first tuple on page, we can reinit the - * page instead of restoring the whole thing. Set flag, and hide - * buffer references from XLogInsert. - */ - if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber && - PageGetMaxOffsetNumber(page) == FirstOffsetNumber) - { - info |= XLOG_HEAP_INIT_PAGE; - bufflags |= REGBUF_WILL_INIT; - } - - xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); - xlrec.flags = 0; - if (all_visible_cleared) - xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED; - if (options & HEAP_INSERT_SPECULATIVE) - xlrec.flags |= XLH_INSERT_IS_SPECULATIVE; - Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer)); - - /* - * For logical decoding, we need the tuple even if we're doing a full - * page write, so make sure it's included even if we take a full-page - * image. (XXX We could alternatively store a pointer into the FPW). - */ - if (RelationIsLogicallyLogged(relation) && - !(options & HEAP_INSERT_NO_LOGICAL)) - { - xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; - bufflags |= REGBUF_KEEP_DATA; - - if (IsToastRelation(relation)) - xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; - } - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); - - xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; - xlhdr.t_infomask = heaptup->t_data->t_infomask; - xlhdr.t_hoff = heaptup->t_data->t_hoff; - - /* - * note we mark xlhdr as belonging to buffer; if XLogInsert decides to - * write the whole page to the xlog, we don't need to store - * xl_tdeheap_header in the xlog. - */ - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); - XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); - /* register encrypted tuple data from the buffer */ - PageHeader phdr = (PageHeader) BufferGetPage(buffer); - /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ - XLogRegisterBufData(0, - ((char *) phdr) + phdr->pd_upper + SizeofHeapTupleHeader, - heaptup->t_len - SizeofHeapTupleHeader); - - /* filtering by origin on a row level is much more efficient */ - XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - - recptr = XLogInsert(RM_HEAP_ID, info); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - UnlockReleaseBuffer(buffer); - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - - /* - * If tuple is cachable, mark it for invalidation from the caches in case - * we abort. Note it is OK to do this after releasing the buffer, because - * the heaptup data structure is all in local memory, not in the shared - * buffer. 
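/*
 * A small sketch of the pointer arithmetic used above when registering the
 * WAL payload: with the standard page layout, the tuple just placed by
 * tdeheap_RelationPutHeapTuple() starts at pd_upper, so skipping the
 * fixed-size tuple header yields the encrypted tuple body of length
 * t_len - SizeofHeapTupleHeader.  The helper name is illustrative only.
 */
static inline char *
encrypted_tuple_body_sketch(Page page)
{
	PageHeader	phdr = (PageHeader) page;

	return ((char *) phdr) + phdr->pd_upper + SizeofHeapTupleHeader;
}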
- */ - CacheInvalidateHeapTuple(relation, heaptup, NULL); - - /* Note: speculative insertions are counted too, even if aborted later */ - pgstat_count_tdeheap_insert(relation, 1); - - /* - * If heaptup is a private copy, release it. Don't forget to copy t_self - * back to the caller's image, too. - */ - if (heaptup != tup) - { - tup->t_self = heaptup->t_self; - tdeheap_freetuple(heaptup); - } -} - -/* - * Subroutine for tdeheap_insert(). Prepares a tuple for insertion. This sets the - * tuple header fields and toasts the tuple if necessary. Returns a toasted - * version of the tuple if it was toasted, or the original tuple if not. Note - * that in any case, the header fields are also set in the original tuple. - */ -static HeapTuple -tdeheap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, - CommandId cid, int options) -{ - /* - * To allow parallel inserts, we need to ensure that they are safe to be - * performed in workers. We have the infrastructure to allow parallel - * inserts in general except for the cases where inserts generate a new - * CommandId (eg. inserts into a table having a foreign key column). - */ - if (IsParallelWorker()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TRANSACTION_STATE), - errmsg("cannot insert tuples in a parallel worker"))); - - tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); - tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); - tup->t_data->t_infomask |= HEAP_XMAX_INVALID; - HeapTupleHeaderSetXmin(tup->t_data, xid); - if (options & HEAP_INSERT_FROZEN) - HeapTupleHeaderSetXminFrozen(tup->t_data); - - HeapTupleHeaderSetCmin(tup->t_data, cid); - HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */ - tup->t_tableOid = RelationGetRelid(relation); - - /* - * If the new tuple is too big for storage or contains already toasted - * out-of-line attributes from some other relation, invoke the toaster. - */ - if (relation->rd_rel->relkind != RELKIND_RELATION && - relation->rd_rel->relkind != RELKIND_MATVIEW) - { - /* toast table entries should never be recursively toasted */ - Assert(!HeapTupleHasExternal(tup)); - return tup; - } - else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) - return tdeheap_toast_insert_or_update(relation, tup, NULL, options); - else - return tup; -} - -/* - * Helper for tdeheap_multi_insert() that computes the number of entire pages - * that inserting the remaining heaptuples requires. Used to determine how - * much the relation needs to be extended by. - */ -static int -tdeheap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace) -{ - size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; - int npages = 1; - - for (int i = done; i < ntuples; i++) - { - size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len); - - if (page_avail < tup_sz) - { - npages++; - page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; - } - page_avail -= tup_sz; - } - - return npages; -} - -/* - * tdeheap_multi_insert - insert multiple tuples into a heap - * - * This is like tdeheap_insert(), but inserts multiple tuples in one operation. - * That's faster than calling tdeheap_insert() in a loop, because when multiple - * tuples can be inserted on a single page, we can write just a single WAL - * record covering all of them, and only need to lock/unlock the page once. - * - * Note: this leaks memory into the current memory context. You can create a - * temporary context before calling this, if that's a problem. 
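/*
 * A minimal sketch of driving tdeheap_multi_insert() directly, assuming the
 * caller already has heap tuples in hand; slot construction uses the regular
 * executor slot APIs, and every name other than tdeheap_multi_insert() is
 * illustrative.
 */
static void
tdeheap_multi_insert_sketch(Relation rel, HeapTuple *tuples, int ntuples)
{
	TupleTableSlot **slots = palloc(ntuples * sizeof(TupleTableSlot *));

	for (int i = 0; i < ntuples; i++)
	{
		slots[i] = MakeSingleTupleTableSlot(RelationGetDescr(rel),
											&TTSOpsHeapTuple);
		ExecStoreHeapTuple(tuples[i], slots[i], false);
	}

	tdeheap_multi_insert(rel, slots, ntuples, GetCurrentCommandId(true),
						 0 /* options */ , NULL /* bistate */ );

	for (int i = 0; i < ntuples; i++)
		ExecDropSingleTupleTableSlot(slots[i]);
	pfree(slots);
}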
- */ -void -tdeheap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, - CommandId cid, int options, BulkInsertState bistate) -{ - TransactionId xid = GetCurrentTransactionId(); - HeapTuple *heaptuples; - int i; - int ndone; - PGAlignedBlock scratch; - Page page; - Buffer vmbuffer = InvalidBuffer; - bool needwal; - Size saveFreeSpace; - bool need_tuple_data = RelationIsLogicallyLogged(relation); - bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); - bool starting_with_empty_page = false; - int npages = 0; - int npages_used = 0; - - /* currently not needed (thus unsupported) for tdeheap_multi_insert() */ - Assert(!(options & HEAP_INSERT_NO_LOGICAL)); - - needwal = RelationNeedsWAL(relation); - saveFreeSpace = RelationGetTargetPageFreeSpace(relation, - HEAP_DEFAULT_FILLFACTOR); - - /* Toast and set header data in all the slots */ - heaptuples = palloc(ntuples * sizeof(HeapTuple)); - for (i = 0; i < ntuples; i++) - { - HeapTuple tuple; - - tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL); - slots[i]->tts_tableOid = RelationGetRelid(relation); - tuple->t_tableOid = slots[i]->tts_tableOid; - heaptuples[i] = tdeheap_prepare_insert(relation, tuple, xid, cid, - options); - } - - /* - * We're about to do the actual inserts -- but check for conflict first, - * to minimize the possibility of having to roll back work we've just - * done. - * - * A check here does not definitively prevent a serialization anomaly; - * that check MUST be done at least past the point of acquiring an - * exclusive buffer content lock on every buffer that will be affected, - * and MAY be done after all inserts are reflected in the buffers and - * those locks are released; otherwise there is a race condition. Since - * multiple buffers can be locked and unlocked in the loop below, and it - * would not be feasible to identify and lock all of those buffers before - * the loop, we must do a final check at the end. - * - * The check here could be omitted with no loss of correctness; it is - * present strictly as an optimization. - * - * For heap inserts, we only need to check for table-level SSI locks. Our - * new tuples can't possibly conflict with existing tuple locks, and heap - * page locks are only consolidated versions of tuple locks; they do not - * lock "gaps" as index page locks do. So we don't need to specify a - * buffer when making the call, which makes for a faster check. - */ - CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); - - ndone = 0; - while (ndone < ntuples) - { - Buffer buffer; - bool all_visible_cleared = false; - bool all_frozen_set = false; - int nthispage; - - CHECK_FOR_INTERRUPTS(); - - /* - * Compute number of pages needed to fit the to-be-inserted tuples in - * the worst case. This will be used to determine how much to extend - * the relation by in tdeheap_RelationGetBufferForTuple(), if needed. If we - * filled a prior page from scratch, we can just update our last - * computation, but if we started with a partially filled page, - * recompute from scratch, the number of potentially required pages - * can vary due to tuples needing to fit onto the page, page headers - * etc. - */ - if (ndone == 0 || !starting_with_empty_page) - { - npages = tdeheap_multi_insert_pages(heaptuples, ndone, ntuples, - saveFreeSpace); - npages_used = 0; - } - else - npages_used++; - - /* - * Find buffer where at least the next tuple will fit. If the page is - * all-visible, this will also pin the requisite visibility map page. 
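/*
 * A worked example for the page estimate above, assuming the default 8 kB
 * block size (BLCKSZ = 8192, SizeOfPageHeaderData = 24), 8-byte MAXALIGN and
 * saveFreeSpace = 0: each 3000-byte tuple costs sizeof(ItemIdData) +
 * MAXALIGN(3000) = 3004 bytes against 8168 usable bytes per page, so two
 * tuples fit per page and three remaining tuples are estimated at two pages.
 * The helper and its argument are illustrative only.
 */
static void
tdeheap_multi_insert_pages_sketch(HeapTuple *three_3000_byte_tuples)
{
	Assert(tdeheap_multi_insert_pages(three_3000_byte_tuples, 0, 3, 0) == 2);
}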
- * - * Also pin visibility map page if COPY FREEZE inserts tuples into an - * empty page. See all_frozen_set below. - */ - buffer = tdeheap_RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len, - InvalidBuffer, options, bistate, - &vmbuffer, NULL, - npages - npages_used); - page = BufferGetPage(buffer); - - starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0; - - if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN)) - all_frozen_set = true; - - /* - * Make sure relation keys in the cahce to avoid pallocs in - * the critical section. - */ - GetRelationKey(relation->rd_locator); - - /* NO EREPORT(ERROR) from here till changes are logged */ - START_CRIT_SECTION(); - - /* - * tdeheap_RelationGetBufferForTuple has ensured that the first tuple fits. - * Put that on the page, and then as many other tuples as fit. - */ - tdeheap_RelationPutHeapTuple(relation, buffer, heaptuples[ndone], true, false); - - /* - * For logical decoding we need combo CIDs to properly decode the - * catalog. - */ - if (needwal && need_cids) - log_tdeheap_new_cid(relation, heaptuples[ndone]); - - for (nthispage = 1; ndone + nthispage < ntuples; nthispage++) - { - HeapTuple heaptup = heaptuples[ndone + nthispage]; - - if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) - break; - - tdeheap_RelationPutHeapTuple(relation, buffer, heaptup, true, false); - - /* - * For logical decoding we need combo CIDs to properly decode the - * catalog. - */ - if (needwal && need_cids) - log_tdeheap_new_cid(relation, heaptup); - } - - /* - * If the page is all visible, need to clear that, unless we're only - * going to add further frozen rows to it. - * - * If we're only adding already frozen rows to a previously empty - * page, mark it as all-visible. - */ - if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN)) - { - all_visible_cleared = true; - PageClearAllVisible(page); - tdeheap_visibilitymap_clear(relation, - BufferGetBlockNumber(buffer), - vmbuffer, VISIBILITYMAP_VALID_BITS); - } - else if (all_frozen_set) - PageSetAllVisible(page); - - /* - * XXX Should we set PageSetPrunable on this page ? See tdeheap_insert() - */ - - MarkBufferDirty(buffer); - - /* XLOG stuff */ - if (needwal) - { - XLogRecPtr recptr; - xl_tdeheap_multi_insert *xlrec; - uint8 info = XLOG_HEAP2_MULTI_INSERT; - char *tupledata; - int totaldatalen; - char *scratchptr = scratch.data; - bool init; - int bufflags = 0; - - /* - * If the page was previously empty, we can reinit the page - * instead of restoring the whole thing. - */ - init = starting_with_empty_page; - - /* allocate xl_tdeheap_multi_insert struct from the scratch area */ - xlrec = (xl_tdeheap_multi_insert *) scratchptr; - scratchptr += SizeOfHeapMultiInsert; - - /* - * Allocate offsets array. Unless we're reinitializing the page, - * in that case the tuples are stored in order starting at - * FirstOffsetNumber and we don't need to store the offsets - * explicitly. - */ - if (!init) - scratchptr += nthispage * sizeof(OffsetNumber); - - /* the rest of the scratch space is used for tuple data */ - tupledata = scratchptr; - - /* check that the mutually exclusive flags are not both set */ - Assert(!(all_visible_cleared && all_frozen_set)); - - xlrec->flags = 0; - if (all_visible_cleared) - xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED; - if (all_frozen_set) - xlrec->flags = XLH_INSERT_ALL_FROZEN_SET; - - xlrec->ntuples = nthispage; - - /* - * Write out an xl_multi_insert_tuple and the tuple data itself - * for each tuple. 
- */ - for (i = 0; i < nthispage; i++) - { - HeapTuple heaptup = heaptuples[ndone + i]; - xl_multi_insert_tuple *tuphdr; - int datalen; - - if (!init) - xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); - /* xl_multi_insert_tuple needs two-byte alignment. */ - tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); - scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; - - tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; - tuphdr->t_infomask = heaptup->t_data->t_infomask; - tuphdr->t_hoff = heaptup->t_data->t_hoff; - - /* Point to an encrypted tuple data in the Buffer */ - char *tup_data_on_page = (char *) page + ItemIdGetOffset(PageGetItemId(page, heaptup->t_self.ip_posid)); - /* write bitmap [+ padding] [+ oid] + data */ - datalen = heaptup->t_len - SizeofHeapTupleHeader; - memcpy(scratchptr, - tup_data_on_page + SizeofHeapTupleHeader, - datalen); - tuphdr->datalen = datalen; - scratchptr += datalen; - } - totaldatalen = scratchptr - tupledata; - Assert((scratchptr - scratch.data) < BLCKSZ); - - if (need_tuple_data) - xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; - - /* - * Signal that this is the last xl_tdeheap_multi_insert record - * emitted by this call to tdeheap_multi_insert(). Needed for logical - * decoding so it knows when to cleanup temporary data. - */ - if (ndone + nthispage == ntuples) - xlrec->flags |= XLH_INSERT_LAST_IN_MULTI; - - if (init) - { - info |= XLOG_HEAP_INIT_PAGE; - bufflags |= REGBUF_WILL_INIT; - } - - /* - * If we're doing logical decoding, include the new tuple data - * even if we take a full-page image of the page. - */ - if (need_tuple_data) - bufflags |= REGBUF_KEEP_DATA; - - XLogBeginInsert(); - XLogRegisterData((char *) xlrec, tupledata - scratch.data); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); - - XLogRegisterBufData(0, tupledata, totaldatalen); - - /* filtering by origin on a row level is much more efficient */ - XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - - recptr = XLogInsert(RM_HEAP2_ID, info); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - /* - * If we've frozen everything on the page, update the visibilitymap. - * We're already holding pin on the vmbuffer. - */ - if (all_frozen_set) - { - Assert(PageIsAllVisible(page)); - Assert(tdeheap_visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); - - /* - * It's fine to use InvalidTransactionId here - this is only used - * when HEAP_INSERT_FROZEN is specified, which intentionally - * violates visibility rules. - */ - tdeheap_visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer, - InvalidXLogRecPtr, vmbuffer, - InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); - } - - UnlockReleaseBuffer(buffer); - ndone += nthispage; - - /* - * NB: Only release vmbuffer after inserting all tuples - it's fairly - * likely that we'll insert into subsequent heap pages that are likely - * to use the same vm page. - */ - } - - /* We're done with inserting all tuples, so release the last vmbuffer. */ - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - - /* - * We're done with the actual inserts. Check for conflicts again, to - * ensure that all rw-conflicts in to these inserts are detected. Without - * this final check, a sequential scan of the heap may have locked the - * table after the "before" check, missing one opportunity to detect the - * conflict, and then scanned the table before the new tuples were there, - * missing the other chance to detect the conflict. 
- * - * For heap inserts, we only need to check for table-level SSI locks. Our - * new tuples can't possibly conflict with existing tuple locks, and heap - * page locks are only consolidated versions of tuple locks; they do not - * lock "gaps" as index page locks do. So we don't need to specify a - * buffer when making the call. - */ - CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); - - /* - * If tuples are cachable, mark them for invalidation from the caches in - * case we abort. Note it is OK to do this after releasing the buffer, - * because the heaptuples data structure is all in local memory, not in - * the shared buffer. - */ - if (IsCatalogRelation(relation)) - { - for (i = 0; i < ntuples; i++) - CacheInvalidateHeapTuple(relation, heaptuples[i], NULL); - } - - /* copy t_self fields back to the caller's slots */ - for (i = 0; i < ntuples; i++) - slots[i]->tts_tid = heaptuples[i]->t_self; - - pgstat_count_tdeheap_insert(relation, ntuples); -} - -/* - * simple_tdeheap_insert - insert a tuple - * - * Currently, this routine differs from tdeheap_insert only in supplying - * a default command ID and not allowing access to the speedup options. - * - * This should be used rather than using tdeheap_insert directly in most places - * where we are modifying system catalogs. - */ -void -simple_tdeheap_insert(Relation relation, HeapTuple tup) -{ - tdeheap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); -} - -/* - * Given infomask/infomask2, compute the bits that must be saved in the - * "infobits" field of xl_tdeheap_delete, xl_tdeheap_update, xl_tdeheap_lock, - * xl_tdeheap_lock_updated WAL records. - * - * See fix_infomask_from_infobits. - */ -static uint8 -compute_infobits(uint16 infomask, uint16 infomask2) -{ - return - ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | - ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | - ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | - /* note we ignore HEAP_XMAX_SHR_LOCK here */ - ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | - ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? - XLHL_KEYS_UPDATED : 0); -} - -/* - * Given two versions of the same t_infomask for a tuple, compare them and - * return whether the relevant status for a tuple Xmax has changed. This is - * used after a buffer lock has been released and reacquired: we want to ensure - * that the tuple state continues to be the same it was when we previously - * examined it. - * - * Note the Xmax field itself must be compared separately. - */ -static inline bool -xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) -{ - const uint16 interesting = - HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK; - - if ((new_infomask & interesting) != (old_infomask & interesting)) - return true; - - return false; -} - -/* - * tdeheap_delete - delete a tuple - * - * See table_tuple_delete() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. - * - * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, - * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last - * only for TM_SelfModified, since we cannot obtain cmax from a combo CID - * generated by another transaction). 
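/*
 * A minimal sketch of the catalog-style path above: form a tuple from
 * values/nulls arrays and hand it to simple_tdeheap_insert(), which picks
 * the command ID itself.  "rel", "values" and "isnull" are illustrative.
 */
static void
simple_tdeheap_insert_sketch(Relation rel, Datum *values, bool *isnull)
{
	HeapTuple	tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);

	simple_tdeheap_insert(rel, tup);
	tdeheap_freetuple(tup);
}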
- */ -TM_Result -tdeheap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) -{ - TM_Result result; - TransactionId xid = GetCurrentTransactionId(); - ItemId lp; - HeapTupleData tp; - Page page; - BlockNumber block; - Buffer buffer; - Buffer vmbuffer = InvalidBuffer; - TransactionId new_xmax; - uint16 new_infomask, - new_infomask2; - bool have_tuple_lock = false; - bool iscombo; - bool all_visible_cleared = false; - HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ - bool old_key_copied = false; - HeapTuple decrypted_tuple; - - Assert(ItemPointerIsValid(tid)); - - /* - * Forbid this during a parallel operation, lest it allocate a combo CID. - * Other workers might need that combo CID for visibility checks, and we - * have no provision for broadcasting it to them. - */ - if (IsInParallelMode()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TRANSACTION_STATE), - errmsg("cannot delete tuples during a parallel operation"))); - - block = ItemPointerGetBlockNumber(tid); - buffer = ReadBuffer(relation, block); - page = BufferGetPage(buffer); - - /* - * Before locking the buffer, pin the visibility map page if it appears to - * be necessary. Since we haven't got the lock yet, someone else might be - * in the middle of changing this, so we'll need to recheck after we have - * the lock. - */ - if (PageIsAllVisible(page)) - tdeheap_visibilitymap_pin(relation, block, &vmbuffer); - - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); - Assert(ItemIdIsNormal(lp)); - - tp.t_tableOid = RelationGetRelid(relation); - tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); - tp.t_len = ItemIdGetLength(lp); - tp.t_self = *tid; - -l1: - - /* - * If we didn't pin the visibility map page and the page has become all - * visible while we were busy locking the buffer, we'll have to unlock and - * re-lock, to avoid holding the buffer lock across an I/O. That's a bit - * unfortunate, but hopefully shouldn't happen often. - */ - if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - tdeheap_visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - } - - result = HeapTupleSatisfiesUpdate(&tp, cid, buffer); - - if (result == TM_Invisible) - { - UnlockReleaseBuffer(buffer); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("attempted to delete invisible tuple"))); - } - else if (result == TM_BeingModified && wait) - { - TransactionId xwait; - uint16 infomask; - - /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetRawXmax(tp.t_data); - infomask = tp.t_data->t_infomask; - - /* - * Sleep until concurrent transaction ends -- except when there's a - * single locker and it's our own transaction. Note we don't care - * which lock mode the locker has, because we need the strongest one. - * - * Before sleeping, we need to acquire tuple lock to establish our - * priority for the tuple (see tdeheap_lock_tuple). LockTuple will - * release us when we are next-in-line for the tuple. - * - * If we are forced to "start over" below, we keep the tuple lock; - * this arranges that we stay at the head of the line while rechecking - * tuple state. 
- */ - if (infomask & HEAP_XMAX_IS_MULTI) - { - bool current_is_member = false; - - if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask, - LockTupleExclusive, ¤t_is_member)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - /* - * Acquire the lock, if necessary (but skip it when we're - * requesting a lock and already have one; avoids deadlock). - */ - if (!current_is_member) - tdeheap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive, - LockWaitBlock, &have_tuple_lock); - - /* wait for multixact */ - MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask, - relation, &(tp.t_self), XLTW_Delete, - NULL); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * If xwait had just locked the tuple then some other xact - * could update this tuple before we get to this point. Check - * for xmax change, and start over if so. - * - * We also must start over if we didn't pin the VM page, and - * the page has become all visible. - */ - if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) || - xmax_infomask_changed(tp.t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), - xwait)) - goto l1; - } - - /* - * You might think the multixact is necessarily done here, but not - * so: it could have surviving members, namely our own xact or - * other subxacts of this backend. It is legal for us to delete - * the tuple in either case, however (the latter case is - * essentially a situation of upgrading our former shared lock to - * exclusive). We don't bother changing the on-disk hint bits - * since we are about to overwrite the xmax altogether. - */ - } - else if (!TransactionIdIsCurrentTransactionId(xwait)) - { - /* - * Wait for regular transaction to end; but first, acquire tuple - * lock. - */ - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - tdeheap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive, - LockWaitBlock, &have_tuple_lock); - XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. - * - * We also must start over if we didn't pin the VM page, and the - * page has become all visible. - */ - if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) || - xmax_infomask_changed(tp.t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), - xwait)) - goto l1; - - /* Otherwise check if it committed or aborted */ - UpdateXmaxHintBits(tp.t_data, buffer, xwait); - } - - /* - * We may overwrite if previous xmax aborted, or if it committed but - * only locked the tuple without updating it. 
- */ - if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || - HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tp.t_data)) - result = TM_Ok; - else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) - result = TM_Updated; - else - result = TM_Deleted; - } - - /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */ - if (result != TM_Ok) - { - Assert(result == TM_SelfModified || - result == TM_Updated || - result == TM_Deleted || - result == TM_BeingModified); - Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); - Assert(result != TM_Updated || - !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)); - } - - if (crosscheck != InvalidSnapshot && result == TM_Ok) - { - /* Perform additional check for transaction-snapshot mode RI updates */ - if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer)) - result = TM_Updated; - } - - if (result != TM_Ok) - { - tmfd->ctid = tp.t_data->t_ctid; - tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data); - if (result == TM_SelfModified) - tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); - else - tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); - if (have_tuple_lock) - UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - return result; - } - - /* - * We're about to do the actual delete -- check for conflict first, to - * avoid possibly having to roll back work we've just done. - * - * This is safe without a recheck as long as there is no possibility of - * another process scanning the page between this check and the delete - * being visible to the scan (i.e., an exclusive buffer content lock is - * continuously held from this point until the tuple delete is visible). - */ - CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer)); - - /* replace cid with a combo CID if necessary */ - HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); - - /* - * Compute replica identity tuple before entering the critical section so - * we don't PANIC upon a memory allocation failure. - * - * ExtractReplicaIdentity has to get a decrypted tuple, otherwise it - * won't be able to extract varlen attributes. - */ - decrypted_tuple = tdeheap_copytuple(&tp); - PG_TDE_DECRYPT_TUPLE(&tp, decrypted_tuple, GetRelationKey(relation->rd_locator)); - - old_key_tuple = ExtractReplicaIdentity(relation, decrypted_tuple, true, &old_key_copied); - - tdeheap_freetuple(decrypted_tuple); - - /* - * If this is the first possibly-multixact-able operation in the current - * transaction, set my per-backend OldestMemberMXactId setting. We can be - * certain that the transaction will never become a member of any older - * MultiXactIds than that. (We have to do this even if we end up just - * using our own TransactionId below, since some other backend could - * incorporate our XID into a MultiXact immediately afterwards.) - */ - MultiXactIdSetOldestMember(); - - compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data), - tp.t_data->t_infomask, tp.t_data->t_infomask2, - xid, LockTupleExclusive, true, - &new_xmax, &new_infomask, &new_infomask2); - - START_CRIT_SECTION(); - - /* - * If this transaction commits, the tuple will become DEAD sooner or - * later. Set flag that this page is a candidate for pruning once our xid - * falls below the OldestXmin horizon. If the transaction finally aborts, - * the subsequent page pruning will be a no-op and the hint will be - * cleared. 
- */ - PageSetPrunable(page, xid); - - if (PageIsAllVisible(page)) - { - all_visible_cleared = true; - PageClearAllVisible(page); - tdeheap_visibilitymap_clear(relation, BufferGetBlockNumber(buffer), - vmbuffer, VISIBILITYMAP_VALID_BITS); - } - - /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; - tp.t_data->t_infomask |= new_infomask; - tp.t_data->t_infomask2 |= new_infomask2; - HeapTupleHeaderClearHotUpdated(tp.t_data); - HeapTupleHeaderSetXmax(tp.t_data, new_xmax); - HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); - /* Make sure there is no forward chain link in t_ctid */ - tp.t_data->t_ctid = tp.t_self; - - /* Signal that this is actually a move into another partition */ - if (changingPart) - HeapTupleHeaderSetMovedPartitions(tp.t_data); - - MarkBufferDirty(buffer); - - /* - * XLOG stuff - * - * NB: tdeheap_abort_speculative() uses the same xlog record and replay - * routines. - */ - if (RelationNeedsWAL(relation)) - { - xl_tdeheap_delete xlrec; - xl_tdeheap_header xlhdr; - XLogRecPtr recptr; - - /* - * For logical decode we need combo CIDs to properly decode the - * catalog - */ - if (RelationIsAccessibleInLogicalDecoding(relation)) - log_tdeheap_new_cid(relation, &tp); - - xlrec.flags = 0; - if (all_visible_cleared) - xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED; - if (changingPart) - xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE; - xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, - tp.t_data->t_infomask2); - xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); - xlrec.xmax = new_xmax; - - if (old_key_tuple != NULL) - { - if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) - xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE; - else - xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY; - } - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); - - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - - /* - * Log replica identity of the deleted tuple if there is one - */ - if (old_key_tuple != NULL) - { - xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; - xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; - xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; - - XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); - XLogRegisterData((char *) old_key_tuple->t_data - + SizeofHeapTupleHeader, - old_key_tuple->t_len - - SizeofHeapTupleHeader); - } - - /* filtering by origin on a row level is much more efficient */ - XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - - /* - * If the tuple has toasted out-of-line attributes, we need to delete - * those items too. We have to do this before releasing the buffer - * because we need to look at the contents of the tuple, but it's OK to - * release the content lock on the buffer first. - */ - if (relation->rd_rel->relkind != RELKIND_RELATION && - relation->rd_rel->relkind != RELKIND_MATVIEW) - { - /* toast table entries should never be recursively toasted */ - Assert(!HeapTupleHasExternal(&tp)); - } - else if (HeapTupleHasExternal(&tp)) - tdeheap_toast_delete(relation, &tp, false); - - /* - * Mark tuple for invalidation from system caches at next command - * boundary. We have to do this before releasing the buffer because we - * need to look at the contents of the tuple. 
- */ - CacheInvalidateHeapTuple(relation, &tp, NULL); - - /* Now we can release the buffer */ - ReleaseBuffer(buffer); - - /* - * Release the lmgr tuple lock, if we had it. - */ - if (have_tuple_lock) - UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); - - pgstat_count_tdeheap_delete(relation); - - if (old_key_tuple != NULL && old_key_copied) - tdeheap_freetuple(old_key_tuple); - - return TM_Ok; -} - -/* - * simple_tdeheap_delete - delete a tuple - * - * This routine may be used to delete a tuple when concurrent updates of - * the target tuple are not expected (for example, because we have a lock - * on the relation associated with the tuple). Any failure is reported - * via ereport(). - */ -void -simple_tdeheap_delete(Relation relation, ItemPointer tid) -{ - TM_Result result; - TM_FailureData tmfd; - - result = tdeheap_delete(relation, tid, - GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); - switch (result) - { - case TM_SelfModified: - /* Tuple was already updated in current command? */ - elog(ERROR, "tuple already updated by self"); - break; - - case TM_Ok: - /* done successfully */ - break; - - case TM_Updated: - elog(ERROR, "tuple concurrently updated"); - break; - - case TM_Deleted: - elog(ERROR, "tuple concurrently deleted"); - break; - - default: - elog(ERROR, "unrecognized tdeheap_delete status: %u", result); - break; - } -} - -/* - * tdeheap_update - replace a tuple - * - * See table_tuple_update() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. - * - * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, - * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last - * only for TM_SelfModified, since we cannot obtain cmax from a combo CID - * generated by another transaction). - */ -TM_Result -tdeheap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) -{ - TM_Result result; - TransactionId xid = GetCurrentTransactionId(); - Bitmapset *hot_attrs; - Bitmapset *sum_attrs; - Bitmapset *key_attrs; - Bitmapset *id_attrs; - Bitmapset *interesting_attrs; - Bitmapset *modified_attrs; - ItemId lp; - HeapTupleData oldtup; - HeapTupleData oldtup_decrypted; - void* oldtup_data; - HeapTuple heaptup; - HeapTuple old_key_tuple = NULL; - bool old_key_copied = false; - Page page; - BlockNumber block; - MultiXactStatus mxact_status; - Buffer buffer, - newbuf, - vmbuffer = InvalidBuffer, - vmbuffer_new = InvalidBuffer; - bool need_toast; - Size newtupsize, - pagefree; - bool have_tuple_lock = false; - bool iscombo; - bool use_hot_update = false; - bool summarized_update = false; - bool key_intact; - bool all_visible_cleared = false; - bool all_visible_cleared_new = false; - bool checked_lockers; - bool locker_remains; - bool id_has_external = false; - TransactionId xmax_new_tuple, - xmax_old_tuple; - uint16 infomask_old_tuple, - infomask2_old_tuple, - infomask_new_tuple, - infomask2_new_tuple; - - Assert(ItemPointerIsValid(otid)); - - /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ - Assert(HeapTupleHeaderGetNatts(newtup->t_data) <= - RelationGetNumberOfAttributes(relation)); - - /* - * Forbid this during a parallel operation, lest it allocate a combo CID. 
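/*
 * A minimal sketch of calling tdeheap_update() directly, mirroring how
 * simple_tdeheap_delete() above drives tdeheap_delete(); error handling is
 * collapsed to elog() and all names except tdeheap_update() are
 * illustrative.  Real callers must also maintain indexes as indicated by
 * update_indexes.
 */
static void
tdeheap_update_sketch(Relation rel, ItemPointer otid, HeapTuple newtup)
{
	TM_Result	result;
	TM_FailureData tmfd;
	LockTupleMode lockmode;
	TU_UpdateIndexes update_indexes;

	result = tdeheap_update(rel, otid, newtup,
							GetCurrentCommandId(true), InvalidSnapshot,
							true /* wait */ , &tmfd, &lockmode,
							&update_indexes);
	if (result != TM_Ok)
		elog(ERROR, "tuple concurrently modified");
}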
- * Other workers might need that combo CID for visibility checks, and we - * have no provision for broadcasting it to them. - */ - if (IsInParallelMode()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TRANSACTION_STATE), - errmsg("cannot update tuples during a parallel operation"))); - - /* - * Fetch the list of attributes to be checked for various operations. - * - * For HOT considerations, this is wasted effort if we fail to update or - * have to put the new tuple on a different page. But we must compute the - * list before obtaining buffer lock --- in the worst case, if we are - * doing an update on one of the relevant system catalogs, we could - * deadlock if we try to fetch the list later. In any case, the relcache - * caches the data so this is usually pretty cheap. - * - * We also need columns used by the replica identity and columns that are - * considered the "key" of rows in the table. - * - * Note that we get copies of each bitmap, so we need not worry about - * relcache flush happening midway through. - */ - hot_attrs = RelationGetIndexAttrBitmap(relation, - INDEX_ATTR_BITMAP_HOT_BLOCKING); - sum_attrs = RelationGetIndexAttrBitmap(relation, - INDEX_ATTR_BITMAP_SUMMARIZED); - key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); - id_attrs = RelationGetIndexAttrBitmap(relation, - INDEX_ATTR_BITMAP_IDENTITY_KEY); - interesting_attrs = NULL; - interesting_attrs = bms_add_members(interesting_attrs, hot_attrs); - interesting_attrs = bms_add_members(interesting_attrs, sum_attrs); - interesting_attrs = bms_add_members(interesting_attrs, key_attrs); - interesting_attrs = bms_add_members(interesting_attrs, id_attrs); - - block = ItemPointerGetBlockNumber(otid); - buffer = ReadBuffer(relation, block); - page = BufferGetPage(buffer); - - /* - * Before locking the buffer, pin the visibility map page if it appears to - * be necessary. Since we haven't got the lock yet, someone else might be - * in the middle of changing this, so we'll need to recheck after we have - * the lock. - */ - if (PageIsAllVisible(page)) - tdeheap_visibilitymap_pin(relation, block, &vmbuffer); - - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid)); - Assert(ItemIdIsNormal(lp)); - - /* - * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work - * properly. - */ - oldtup.t_tableOid = RelationGetRelid(relation); - oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp); - oldtup_data = oldtup.t_data; - oldtup.t_len = ItemIdGetLength(lp); - oldtup.t_self = *otid; - /* decrypt the old tuple */ - { - char* new_ptr = NULL; - new_ptr = MemoryContextAlloc(CurTransactionContext, oldtup.t_len); - memcpy(new_ptr, oldtup.t_data, oldtup.t_data->t_hoff); - // only neccessary field - oldtup_decrypted.t_data = (HeapTupleHeader)new_ptr; - } - PG_TDE_DECRYPT_TUPLE(&oldtup, &oldtup_decrypted, - GetRelationKey(relation->rd_locator)); - - // change field in oldtup now. - // We can't do it before, as PG_TDE_DECRYPT_TUPLE uses t_data address in - // calculations - oldtup.t_data = oldtup_decrypted.t_data; - - /* the new tuple is ready, except for this: */ - newtup->t_tableOid = RelationGetRelid(relation); - - /* - * Determine columns modified by the update. Additionally, identify - * whether any of the unmodified replica identity key attributes in the - * old tuple is externally stored or not. 
This is required because for - * such attributes the flattened value won't be WAL logged as part of the - * new tuple so we must include it as part of the old_key_tuple. See - * ExtractReplicaIdentity. - */ - modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs, - id_attrs, &oldtup, - newtup, &id_has_external); - - /* - * If we're not updating any "key" column, we can grab a weaker lock type. - * This allows for more concurrency when we are running simultaneously - * with foreign key checks. - * - * Note that if a column gets detoasted while executing the update, but - * the value ends up being the same, this test will fail and we will use - * the stronger lock. This is acceptable; the important case to optimize - * is updates that don't manipulate key columns, not those that - * serendipitously arrive at the same key values. - */ - if (!bms_overlap(modified_attrs, key_attrs)) - { - *lockmode = LockTupleNoKeyExclusive; - mxact_status = MultiXactStatusNoKeyUpdate; - key_intact = true; - - /* - * If this is the first possibly-multixact-able operation in the - * current transaction, set my per-backend OldestMemberMXactId - * setting. We can be certain that the transaction will never become a - * member of any older MultiXactIds than that. (We have to do this - * even if we end up just using our own TransactionId below, since - * some other backend could incorporate our XID into a MultiXact - * immediately afterwards.) - */ - MultiXactIdSetOldestMember(); - } - else - { - *lockmode = LockTupleExclusive; - mxact_status = MultiXactStatusUpdate; - key_intact = false; - } - - /* - * Note: beyond this point, use oldtup not otid to refer to old tuple. - * otid may very well point at newtup->t_self, which we will overwrite - * with the new tuple's location, so there's great risk of confusion if we - * use otid anymore. - */ - - oldtup.t_data = oldtup_data; - -l2: - checked_lockers = false; - locker_remains = false; - result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); - - /* see below about the "no wait" case */ - Assert(result != TM_BeingModified || wait); - - if (result == TM_Invisible) - { - UnlockReleaseBuffer(buffer); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("attempted to update invisible tuple"))); - } - else if (result == TM_BeingModified && wait) - { - TransactionId xwait; - uint16 infomask; - bool can_continue = false; - - /* - * XXX note that we don't consider the "no wait" case here. This - * isn't a problem currently because no caller uses that case, but it - * should be fixed if such a caller is introduced. It wasn't a - * problem previously because this code would always wait, but now - * that some tuple locks do not conflict with one of the lock modes we - * use, it is possible that this case is interesting to handle - * specially. - * - * This may cause failures with third-party code that calls - * tdeheap_update directly. - */ - - /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data); - infomask = oldtup.t_data->t_infomask; - - /* - * Now we have to do something about the existing locker. If it's a - * multi, sleep on it; we might be awakened before it is completely - * gone (or even not sleep at all in some cases); we need to preserve - * it as locker, unless it is gone completely. - * - * If it's not a multi, we need to check for sleeping conditions - * before actually going to sleep. 
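/*
 * A compact restatement of the lock-strength choice made above, as a sketch:
 * updates that leave every "key" column untouched can take
 * LockTupleNoKeyExclusive and so coexist with FOR KEY SHARE lockers (e.g.
 * foreign-key checks); anything modifying a key column needs
 * LockTupleExclusive.  The helper name is illustrative only.
 */
static inline LockTupleMode
update_lockmode_sketch(Bitmapset *modified_attrs, Bitmapset *key_attrs)
{
	return bms_overlap(modified_attrs, key_attrs) ?
		LockTupleExclusive : LockTupleNoKeyExclusive;
}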
If the update doesn't conflict - * with the locks, we just continue without sleeping (but making sure - * it is preserved). - * - * Before sleeping, we need to acquire tuple lock to establish our - * priority for the tuple (see tdeheap_lock_tuple). LockTuple will - * release us when we are next-in-line for the tuple. Note we must - * not acquire the tuple lock until we're sure we're going to sleep; - * otherwise we're open for race conditions with other transactions - * holding the tuple lock which sleep on us. - * - * If we are forced to "start over" below, we keep the tuple lock; - * this arranges that we stay at the head of the line while rechecking - * tuple state. - */ - if (infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId update_xact; - int remain; - bool current_is_member = false; - - if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask, - *lockmode, ¤t_is_member)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - /* - * Acquire the lock, if necessary (but skip it when we're - * requesting a lock and already have one; avoids deadlock). - */ - if (!current_is_member) - tdeheap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode, - LockWaitBlock, &have_tuple_lock); - - /* wait for multixact */ - MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask, - relation, &oldtup.t_self, XLTW_Update, - &remain); - checked_lockers = true; - locker_remains = remain != 0; - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * If xwait had just locked the tuple then some other xact - * could update this tuple before we get to this point. Check - * for xmax change, and start over if so. - */ - if (xmax_infomask_changed(oldtup.t_data->t_infomask, - infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data), - xwait)) - goto l2; - } - - /* - * Note that the multixact may not be done by now. It could have - * surviving members; our own xact or other subxacts of this - * backend, and also any other concurrent transaction that locked - * the tuple with LockTupleKeyShare if we only got - * LockTupleNoKeyExclusive. If this is the case, we have to be - * careful to mark the updated tuple with the surviving members in - * Xmax. - * - * Note that there could have been another update in the - * MultiXact. In that case, we need to check whether it committed - * or aborted. If it aborted we are safe to update it again; - * otherwise there is an update conflict, and we have to return - * TableTuple{Deleted, Updated} below. - * - * In the LockTupleExclusive case, we still need to preserve the - * surviving members: those would include the tuple locks we had - * before this one, which are important to keep in case this - * subxact aborts. - */ - if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask)) - update_xact = HeapTupleGetUpdateXid(oldtup.t_data); - else - update_xact = InvalidTransactionId; - - /* - * There was no UPDATE in the MultiXact; or it aborted. No - * TransactionIdIsInProgress() call needed here, since we called - * MultiXactIdWait() above. - */ - if (!TransactionIdIsValid(update_xact) || - TransactionIdDidAbort(update_xact)) - can_continue = true; - } - else if (TransactionIdIsCurrentTransactionId(xwait)) - { - /* - * The only locker is ourselves; we can avoid grabbing the tuple - * lock here, but must preserve our locking information. 
- */ - checked_lockers = true; - locker_remains = true; - can_continue = true; - } - else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact) - { - /* - * If it's just a key-share locker, and we're not changing the key - * columns, we don't need to wait for it to end; but we need to - * preserve it as locker. - */ - checked_lockers = true; - locker_remains = true; - can_continue = true; - } - else - { - /* - * Wait for regular transaction to end; but first, acquire tuple - * lock. - */ - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - tdeheap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode, - LockWaitBlock, &have_tuple_lock); - XactLockTableWait(xwait, relation, &oldtup.t_self, - XLTW_Update); - checked_lockers = true; - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. - */ - if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) || - !TransactionIdEquals(xwait, - HeapTupleHeaderGetRawXmax(oldtup.t_data))) - goto l2; - - /* Otherwise check if it committed or aborted */ - UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); - if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) - can_continue = true; - } - - if (can_continue) - result = TM_Ok; - else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)) - result = TM_Updated; - else - result = TM_Deleted; - } - - /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */ - if (result != TM_Ok) - { - Assert(result == TM_SelfModified || - result == TM_Updated || - result == TM_Deleted || - result == TM_BeingModified); - Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); - Assert(result != TM_Updated || - !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)); - } - - if (crosscheck != InvalidSnapshot && result == TM_Ok) - { - /* Perform additional check for transaction-snapshot mode RI updates */ - if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer)) - result = TM_Updated; - } - - if (result != TM_Ok) - { - tmfd->ctid = oldtup.t_data->t_ctid; - tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); - if (result == TM_SelfModified) - tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); - else - tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); - if (have_tuple_lock) - UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - *update_indexes = TU_None; - - bms_free(hot_attrs); - bms_free(sum_attrs); - bms_free(key_attrs); - bms_free(id_attrs); - bms_free(modified_attrs); - bms_free(interesting_attrs); - return result; - } - - /* - * If we didn't pin the visibility map page and the page has become all - * visible while we were busy locking the buffer, or during some - * subsequent window during which we had it unlocked, we'll have to unlock - * and re-lock, to avoid holding the buffer lock across an I/O. That's a - * bit unfortunate, especially since we'll now have to recheck whether the - * tuple has been locked or updated under us, but hopefully it won't - * happen very often. 
- */ - if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - tdeheap_visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - goto l2; - } - - /* Fill in transaction status data */ - - /* - * If the tuple we're updating is locked, we need to preserve the locking - * info in the old tuple's Xmax. Prepare a new Xmax value for this. - */ - compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), - oldtup.t_data->t_infomask, - oldtup.t_data->t_infomask2, - xid, *lockmode, true, - &xmax_old_tuple, &infomask_old_tuple, - &infomask2_old_tuple); - - /* - * And also prepare an Xmax value for the new copy of the tuple. If there - * was no xmax previously, or there was one but all lockers are now gone, - * then use InvalidTransactionId; otherwise, get the xmax from the old - * tuple. (In rare cases that might also be InvalidTransactionId and yet - * not have the HEAP_XMAX_INVALID bit set; that's fine.) - */ - if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) || - HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) || - (checked_lockers && !locker_remains)) - xmax_new_tuple = InvalidTransactionId; - else - xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data); - - if (!TransactionIdIsValid(xmax_new_tuple)) - { - infomask_new_tuple = HEAP_XMAX_INVALID; - infomask2_new_tuple = 0; - } - else - { - /* - * If we found a valid Xmax for the new tuple, then the infomask bits - * to use on the new tuple depend on what was there on the old one. - * Note that since we're doing an update, the only possibility is that - * the lockers had FOR KEY SHARE lock. - */ - if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) - { - GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple, - &infomask2_new_tuple); - } - else - { - infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY; - infomask2_new_tuple = 0; - } - } - - /* - * Prepare the new tuple with the appropriate initial values of Xmin and - * Xmax, as well as initial infomask bits as computed above. - */ - newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); - newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); - HeapTupleHeaderSetXmin(newtup->t_data, xid); - HeapTupleHeaderSetCmin(newtup->t_data, cid); - newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple; - newtup->t_data->t_infomask2 |= infomask2_new_tuple; - HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple); - - /* - * Replace cid with a combo CID if necessary. Note that we already put - * the plain cid into the new tuple. - */ - HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo); - - /* - * If the toaster needs to be activated, OR if the new tuple will not fit - * on the same page as the old, then we need to release the content lock - * (but not the pin!) on the old tuple's buffer while we are off doing - * TOAST and/or table-file-extension work. We must mark the old tuple to - * show that it's locked, else other processes may try to update it - * themselves. - * - * We need to invoke the toaster if there are already any out-of-line - * toasted values present, or if the new tuple is over-threshold. 
- */ - if (relation->rd_rel->relkind != RELKIND_RELATION && - relation->rd_rel->relkind != RELKIND_MATVIEW) - { - /* toast table entries should never be recursively toasted */ - Assert(!HeapTupleHasExternal(&oldtup)); - Assert(!HeapTupleHasExternal(newtup)); - need_toast = false; - } - else - need_toast = (HeapTupleHasExternal(&oldtup) || - HeapTupleHasExternal(newtup) || - newtup->t_len > TOAST_TUPLE_THRESHOLD); - - pagefree = PageGetHeapFreeSpace(page); - - newtupsize = MAXALIGN(newtup->t_len); - - if (need_toast || newtupsize > pagefree) - { - TransactionId xmax_lock_old_tuple; - uint16 infomask_lock_old_tuple, - infomask2_lock_old_tuple; - bool cleared_all_frozen = false; - - /* - * To prevent concurrent sessions from updating the tuple, we have to - * temporarily mark it locked, while we release the page-level lock. - * - * To satisfy the rule that any xid potentially appearing in a buffer - * written out to disk, we unfortunately have to WAL log this - * temporary modification. We can reuse xl_tdeheap_lock for this - * purpose. If we crash/error before following through with the - * actual update, xmax will be of an aborted transaction, allowing - * other sessions to proceed. - */ - - /* - * Compute xmax / infomask appropriate for locking the tuple. This has - * to be done separately from the combo that's going to be used for - * updating, because the potentially created multixact would otherwise - * be wrong. - */ - compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), - oldtup.t_data->t_infomask, - oldtup.t_data->t_infomask2, - xid, *lockmode, false, - &xmax_lock_old_tuple, &infomask_lock_old_tuple, - &infomask2_lock_old_tuple); - - Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple)); - - START_CRIT_SECTION(); - - /* Clear obsolete visibility flags ... */ - oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; - HeapTupleClearHotUpdated(&oldtup); - /* ... and store info about transaction updating this tuple */ - Assert(TransactionIdIsValid(xmax_lock_old_tuple)); - HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple); - oldtup.t_data->t_infomask |= infomask_lock_old_tuple; - oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple; - HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); - - /* temporarily make it look not-updated, but locked */ - oldtup.t_data->t_ctid = oldtup.t_self; - - /* - * Clear all-frozen bit on visibility map if needed. We could - * immediately reset ALL_VISIBLE, but given that the WAL logging - * overhead would be unchanged, that doesn't seem necessarily - * worthwhile. - */ - if (PageIsAllVisible(page) && - tdeheap_visibilitymap_clear(relation, block, vmbuffer, - VISIBILITYMAP_ALL_FROZEN)) - cleared_all_frozen = true; - - MarkBufferDirty(buffer); - - if (RelationNeedsWAL(relation)) - { - xl_tdeheap_lock xlrec; - XLogRecPtr recptr; - - XLogBeginInsert(); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - - xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self); - xlrec.xmax = xmax_lock_old_tuple; - xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask, - oldtup.t_data->t_infomask2); - xlrec.flags = - cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData((char *) &xlrec, SizeOfHeapLock); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - /* - * Let the toaster do its thing, if needed. 
- * - * Note: below this point, heaptup is the data we actually intend to - * store into the relation; newtup is the caller's original untoasted - * data. - */ - if (need_toast) - { - /* Note we always use WAL and FSM during updates */ - heaptup = tdeheap_toast_insert_or_update(relation, newtup, &oldtup_decrypted, 0); - newtupsize = MAXALIGN(heaptup->t_len); - } - else - heaptup = newtup; - - /* - * Now, do we need a new page for the tuple, or not? This is a bit - * tricky since someone else could have added tuples to the page while - * we weren't looking. We have to recheck the available space after - * reacquiring the buffer lock. But don't bother to do that if the - * former amount of free space is still not enough; it's unlikely - * there's more free now than before. - * - * What's more, if we need to get a new page, we will need to acquire - * buffer locks on both old and new pages. To avoid deadlock against - * some other backend trying to get the same two locks in the other - * order, we must be consistent about the order we get the locks in. - * We use the rule "lock the lower-numbered page of the relation - * first". To implement this, we must do tdeheap_RelationGetBufferForTuple - * while not holding the lock on the old page, and we must rely on it - * to get the locks on both pages in the correct order. - * - * Another consideration is that we need visibility map page pin(s) if - * we will have to clear the all-visible flag on either page. If we - * call tdeheap_RelationGetBufferForTuple, we rely on it to acquire any such - * pins; but if we don't, we have to handle that here. Hence we need - * a loop. - */ - for (;;) - { - if (newtupsize > pagefree) - { - /* It doesn't fit, must use tdeheap_RelationGetBufferForTuple. */ - newbuf = tdeheap_RelationGetBufferForTuple(relation, heaptup->t_len, - buffer, 0, NULL, - &vmbuffer_new, &vmbuffer, - 0); - /* We're all done. */ - break; - } - /* Acquire VM page pin if needed and we don't have it. */ - if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) - tdeheap_visibilitymap_pin(relation, block, &vmbuffer); - /* Re-acquire the lock on the old tuple's page. */ - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* Re-check using the up-to-date free space */ - pagefree = PageGetHeapFreeSpace(page); - if (newtupsize > pagefree || - (vmbuffer == InvalidBuffer && PageIsAllVisible(page))) - { - /* - * Rats, it doesn't fit anymore, or somebody just now set the - * all-visible flag. We must now unlock and loop to avoid - * deadlock. Fortunately, this path should seldom be taken. - */ - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - } - else - { - /* We're all done. */ - newbuf = buffer; - break; - } - } - } - else - { - /* No TOAST work needed, and it'll fit on same page */ - newbuf = buffer; - heaptup = newtup; - } - - /* - * We're about to do the actual update -- check for conflict first, to - * avoid possibly having to roll back work we've just done. - * - * This is safe without a recheck as long as there is no possibility of - * another process scanning the pages between this check and the update - * being visible to the scan (i.e., exclusive buffer content lock(s) are - * continuously held from this point until the tuple update is visible). - * - * For the new tuple the only check needed is at the relation level, but - * since both tuples are in the same relation and the check for oldtup - * will include checking the relation level, there is no benefit to a - * separate check for the new tuple. 
- */
- CheckForSerializableConflictIn(relation, &oldtup.t_self,
- BufferGetBlockNumber(buffer));
-
- /*
- * At this point newbuf and buffer are both pinned and locked, and newbuf
- * has enough space for the new tuple. If they are the same buffer, only
- * one pin is held.
- */
-
- if (newbuf == buffer)
- {
- /*
- * Since the new tuple is going into the same page, we might be able
- * to do a HOT update. Check if any of the index columns have been
- * changed.
- */
- if (!bms_overlap(modified_attrs, hot_attrs))
- {
- use_hot_update = true;
-
- /*
- * If none of the columns that are used in hot-blocking indexes
- * were updated, we can apply HOT, but we do still need to check
- * if we need to update the summarizing indexes, and update those
- * indexes if the columns were updated, or we may fail to detect
- * e.g. value bound changes in BRIN minmax indexes.
- */
- if (bms_overlap(modified_attrs, sum_attrs))
- summarized_update = true;
- }
- }
- else
- {
- /* Set a hint that the old page could use prune/defrag */
- PageSetFull(page);
- }
-
- /*
- * Compute replica identity tuple before entering the critical section so
- * we don't PANIC upon a memory allocation failure.
- * ExtractReplicaIdentity() will return NULL if nothing needs to be
- * logged. Pass old key required as true only if the replica identity key
- * columns are modified or it has external data.
- */
- old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
- bms_overlap(modified_attrs, id_attrs) ||
- id_has_external,
- &old_key_copied);
-
- /*
- * Make sure the relation keys are in the cache to avoid pallocs in
- * the critical section.
- */
- GetRelationKey(relation->rd_locator);
-
- /* NO EREPORT(ERROR) from here till changes are logged */
- START_CRIT_SECTION();
-
- /*
- * If this transaction commits, the old tuple will become DEAD sooner or
- * later. Set flag that this page is a candidate for pruning once our xid
- * falls below the OldestXmin horizon. If the transaction finally aborts,
- * the subsequent page pruning will be a no-op and the hint will be
- * cleared.
- *
- * XXX Should we set hint on newbuf as well? If the transaction aborts,
- * there would be a prunable tuple in the newbuf; but for now we choose
- * not to optimize for aborts. Note that tdeheap_xlog_update must be kept in
- * sync if this decision changes.
- */
- PageSetPrunable(page, xid);
-
- if (use_hot_update)
- {
- /* Mark the old tuple as HOT-updated */
- HeapTupleSetHotUpdated(&oldtup);
- /* And mark the new tuple as heap-only */
- HeapTupleSetHeapOnly(heaptup);
- /* Mark the caller's copy too, in case different from heaptup */
- HeapTupleSetHeapOnly(newtup);
- }
- else
- {
- /* Make sure tuples are correctly marked as not-HOT */
- HeapTupleClearHotUpdated(&oldtup);
- HeapTupleClearHeapOnly(heaptup);
- HeapTupleClearHeapOnly(newtup);
- }
-
- tdeheap_RelationPutHeapTuple(relation, newbuf, heaptup, true, false); /* insert new tuple */
-
-
- /* Clear obsolete visibility flags, possibly set by ourselves above... */
- oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
- oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
- /* ... 
and store info about transaction updating this tuple */ - Assert(TransactionIdIsValid(xmax_old_tuple)); - HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple); - oldtup.t_data->t_infomask |= infomask_old_tuple; - oldtup.t_data->t_infomask2 |= infomask2_old_tuple; - HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); - - /* record address of new tuple in t_ctid of old one */ - oldtup.t_data->t_ctid = heaptup->t_self; - - /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */ - if (PageIsAllVisible(BufferGetPage(buffer))) - { - all_visible_cleared = true; - PageClearAllVisible(BufferGetPage(buffer)); - tdeheap_visibilitymap_clear(relation, BufferGetBlockNumber(buffer), - vmbuffer, VISIBILITYMAP_VALID_BITS); - } - if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf))) - { - all_visible_cleared_new = true; - PageClearAllVisible(BufferGetPage(newbuf)); - tdeheap_visibilitymap_clear(relation, BufferGetBlockNumber(newbuf), - vmbuffer_new, VISIBILITYMAP_VALID_BITS); - } - - if (newbuf != buffer) - MarkBufferDirty(newbuf); - MarkBufferDirty(buffer); - - /* XLOG stuff */ - if (RelationNeedsWAL(relation)) - { - XLogRecPtr recptr; - - /* - * For logical decoding we need combo CIDs to properly decode the - * catalog. - */ - if (RelationIsAccessibleInLogicalDecoding(relation)) - { - log_tdeheap_new_cid(relation, &oldtup); - log_tdeheap_new_cid(relation, heaptup); - } - - recptr = log_tdeheap_update(relation, buffer, - newbuf, &oldtup, heaptup, - old_key_tuple, - all_visible_cleared, - all_visible_cleared_new); - if (newbuf != buffer) - { - PageSetLSN(BufferGetPage(newbuf), recptr); - } - PageSetLSN(BufferGetPage(buffer), recptr); - } - - END_CRIT_SECTION(); - - if (newbuf != buffer) - LockBuffer(newbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - /* - * Mark old tuple for invalidation from system caches at next command - * boundary, and mark the new tuple for invalidation in case we abort. We - * have to do this before releasing the buffer because oldtup is in the - * buffer. (heaptup is all in local memory, but it's necessary to process - * both tuple versions in one call to inval.c so we can avoid redundant - * sinval messages.) - */ - CacheInvalidateHeapTuple(relation, &oldtup, heaptup); - - /* Now we can release the buffer(s) */ - if (newbuf != buffer) - ReleaseBuffer(newbuf); - ReleaseBuffer(buffer); - if (BufferIsValid(vmbuffer_new)) - ReleaseBuffer(vmbuffer_new); - if (BufferIsValid(vmbuffer)) - ReleaseBuffer(vmbuffer); - - /* - * Release the lmgr tuple lock, if we had it. - */ - if (have_tuple_lock) - UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); - - pgstat_count_tdeheap_update(relation, use_hot_update, newbuf != buffer); - - /* - * If heaptup is a private copy, release it. Don't forget to copy t_self - * back to the caller's image, too. - */ - if (heaptup != newtup) - { - newtup->t_self = heaptup->t_self; - tdeheap_freetuple(heaptup); - } - - /* - * If it is a HOT update, the update may still need to update summarized - * indexes, lest we fail to update those summaries and get incorrect - * results (for example, minmax bounds of the block may change with this - * update). 
- */ - if (use_hot_update) - { - if (summarized_update) - *update_indexes = TU_Summarizing; - else - *update_indexes = TU_None; - } - else - *update_indexes = TU_All; - - if (old_key_tuple != NULL && old_key_copied) - tdeheap_freetuple(old_key_tuple); - - bms_free(hot_attrs); - bms_free(sum_attrs); - bms_free(key_attrs); - bms_free(id_attrs); - bms_free(modified_attrs); - bms_free(interesting_attrs); - - return TM_Ok; -} - -/* - * Check if the specified attribute's values are the same. Subroutine for - * HeapDetermineColumnsInfo. - */ -static bool -tdeheap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, - bool isnull1, bool isnull2) -{ - Form_pg_attribute att; - - /* - * If one value is NULL and other is not, then they are certainly not - * equal - */ - if (isnull1 != isnull2) - return false; - - /* - * If both are NULL, they can be considered equal. - */ - if (isnull1) - return true; - - /* - * We do simple binary comparison of the two datums. This may be overly - * strict because there can be multiple binary representations for the - * same logical value. But we should be OK as long as there are no false - * positives. Using a type-specific equality operator is messy because - * there could be multiple notions of equality in different operator - * classes; furthermore, we cannot safely invoke user-defined functions - * while holding exclusive buffer lock. - */ - if (attrnum <= 0) - { - /* The only allowed system columns are OIDs, so do this */ - return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); - } - else - { - Assert(attrnum <= tupdesc->natts); - att = TupleDescAttr(tupdesc, attrnum - 1); - return datumIsEqual(value1, value2, att->attbyval, att->attlen); - } -} - -/* - * Check which columns are being updated. - * - * Given an updated tuple, determine (and return into the output bitmapset), - * from those listed as interesting, the set of columns that changed. - * - * has_external indicates if any of the unmodified attributes (from those - * listed as interesting) of the old tuple is a member of external_cols and is - * stored externally. - */ -static Bitmapset * -HeapDetermineColumnsInfo(Relation relation, - Bitmapset *interesting_cols, - Bitmapset *external_cols, - HeapTuple oldtup, HeapTuple newtup, - bool *has_external) -{ - int attidx; - Bitmapset *modified = NULL; - TupleDesc tupdesc = RelationGetDescr(relation); - - attidx = -1; - while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0) - { - /* attidx is zero-based, attrnum is the normal attribute number */ - AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber; - Datum value1, - value2; - bool isnull1, - isnull2; - - /* - * If it's a whole-tuple reference, say "not equal". It's not really - * worth supporting this case, since it could only succeed after a - * no-op update, which is hardly a case worth optimizing for. - */ - if (attrnum == 0) - { - modified = bms_add_member(modified, attidx); - continue; - } - - /* - * Likewise, automatically say "not equal" for any system attribute - * other than tableOID; we cannot expect these to be consistent in a - * HOT chain, or even to be set correctly yet in the new tuple. - */ - if (attrnum < 0) - { - if (attrnum != TableOidAttributeNumber) - { - modified = bms_add_member(modified, attidx); - continue; - } - } - - /* - * Extract the corresponding values. XXX this is pretty inefficient - * if there are many indexed columns. Should we do a single - * tdeheap_deform_tuple call on each tuple, instead? 
But that doesn't - * work for system columns ... - */ - value1 = tdeheap_getattr(oldtup, attrnum, tupdesc, &isnull1); - value2 = tdeheap_getattr(newtup, attrnum, tupdesc, &isnull2); - if (!tdeheap_attr_equals(tupdesc, attrnum, value1, - value2, isnull1, isnull2)) - { - modified = bms_add_member(modified, attidx); - continue; - } - - /* - * No need to check attributes that can't be stored externally. Note - * that system attributes can't be stored externally. - */ - if (attrnum < 0 || isnull1 || - TupleDescAttr(tupdesc, attrnum - 1)->attlen != -1) - continue; - - /* - * Check if the old tuple's attribute is stored externally and is a - * member of external_cols. - */ - if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) && - bms_is_member(attidx, external_cols)) - *has_external = true; - } - - return modified; -} - -/* - * simple_tdeheap_update - replace a tuple - * - * This routine may be used to update a tuple when concurrent updates of - * the target tuple are not expected (for example, because we have a lock - * on the relation associated with the tuple). Any failure is reported - * via ereport(). - */ -void -simple_tdeheap_update(Relation relation, ItemPointer otid, HeapTuple tup, - TU_UpdateIndexes *update_indexes) -{ - TM_Result result; - TM_FailureData tmfd; - LockTupleMode lockmode; - - result = tdeheap_update(relation, otid, tup, - GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); - switch (result) - { - case TM_SelfModified: - /* Tuple was already updated in current command? */ - elog(ERROR, "tuple already updated by self"); - break; - - case TM_Ok: - /* done successfully */ - break; - - case TM_Updated: - elog(ERROR, "tuple concurrently updated"); - break; - - case TM_Deleted: - elog(ERROR, "tuple concurrently deleted"); - break; - - default: - elog(ERROR, "unrecognized tdeheap_update status: %u", result); - break; - } -} - - -/* - * Return the MultiXactStatus corresponding to the given tuple lock mode. - */ -static MultiXactStatus -get_mxact_status_for_lock(LockTupleMode mode, bool is_update) -{ - int retval; - - if (is_update) - retval = tupleLockExtraInfo[mode].updstatus; - else - retval = tupleLockExtraInfo[mode].lockstatus; - - if (retval == -1) - elog(ERROR, "invalid lock tuple mode %d/%s", mode, - is_update ? "true" : "false"); - - return (MultiXactStatus) retval; -} - -/* - * tdeheap_lock_tuple - lock a tuple in shared or exclusive mode - * - * Note that this acquires a buffer pin, which the caller must release. - * - * Input parameters: - * relation: relation containing tuple (caller must hold suitable lock) - * tid: TID of tuple to lock - * cid: current command ID (used for visibility test, and stored into - * tuple's cmax if lock is successful) - * mode: indicates if shared or exclusive tuple lock is desired - * wait_policy: what to do if tuple lock is not available - * follow_updates: if true, follow the update chain to also lock descendant - * tuples. - * - * Output parameters: - * *tuple: all fields filled in - * *buffer: set to buffer holding tuple (pinned but not locked at exit) - * *tmfd: filled in failure cases (see below) - * - * Function results are the same as the ones for table_tuple_lock(). 
- * - * In the failure cases other than TM_Invisible, the routine fills - * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, - * if necessary), and t_cmax (the last only for TM_SelfModified, - * since we cannot obtain cmax from a combo CID generated by another - * transaction). - * See comments for struct TM_FailureData for additional info. - * - * See README.tuplock for a thorough explanation of this mechanism. - */ -TM_Result -tdeheap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, TM_FailureData *tmfd) -{ - TM_Result result; - ItemPointer tid = &(tuple->t_self); - ItemId lp; - Page page; - Buffer vmbuffer = InvalidBuffer; - BlockNumber block; - TransactionId xid, - xmax; - uint16 old_infomask, - new_infomask, - new_infomask2; - bool first_time = true; - bool skip_tuple_lock = false; - bool have_tuple_lock = false; - bool cleared_all_frozen = false; - - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); - block = ItemPointerGetBlockNumber(tid); - - /* - * Before locking the buffer, pin the visibility map page if it appears to - * be necessary. Since we haven't got the lock yet, someone else might be - * in the middle of changing this, so we'll need to recheck after we have - * the lock. - */ - if (PageIsAllVisible(BufferGetPage(*buffer))) - tdeheap_visibilitymap_pin(relation, block, &vmbuffer); - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - page = BufferGetPage(*buffer); - lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); - Assert(ItemIdIsNormal(lp)); - - tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); - tuple->t_len = ItemIdGetLength(lp); - tuple->t_tableOid = RelationGetRelid(relation); - -l3: - result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); - - if (result == TM_Invisible) - { - /* - * This is possible, but only when locking a tuple for ON CONFLICT - * UPDATE. We return this value here rather than throwing an error in - * order to give that case the opportunity to throw a more specific - * error. - */ - result = TM_Invisible; - goto out_locked; - } - else if (result == TM_BeingModified || - result == TM_Updated || - result == TM_Deleted) - { - TransactionId xwait; - uint16 infomask; - uint16 infomask2; - bool require_sleep; - ItemPointerData t_ctid; - - /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetRawXmax(tuple->t_data); - infomask = tuple->t_data->t_infomask; - infomask2 = tuple->t_data->t_infomask2; - ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); - - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - - /* - * If any subtransaction of the current top transaction already holds - * a lock as strong as or stronger than what we're requesting, we - * effectively hold the desired lock already. We *must* succeed - * without trying to take the tuple lock, else we will deadlock - * against anyone wanting to acquire a stronger lock. - * - * Note we only do this the first time we loop on the HTSU result; - * there is no point in testing in subsequent passes, because - * evidently our own transaction cannot have acquired a new lock after - * the first time we checked. - */ - if (first_time) - { - first_time = false; - - if (infomask & HEAP_XMAX_IS_MULTI) - { - int i; - int nmembers; - MultiXactMember *members; - - /* - * We don't need to allow old multixacts here; if that had - * been the case, HeapTupleSatisfiesUpdate would have returned - * MayBeUpdated and we wouldn't be here. 
- */ - nmembers = - GetMultiXactIdMembers(xwait, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(infomask)); - - for (i = 0; i < nmembers; i++) - { - /* only consider members of our own transaction */ - if (!TransactionIdIsCurrentTransactionId(members[i].xid)) - continue; - - if (TUPLOCK_from_mxstatus(members[i].status) >= mode) - { - pfree(members); - result = TM_Ok; - goto out_unlocked; - } - else - { - /* - * Disable acquisition of the heavyweight tuple lock. - * Otherwise, when promoting a weaker lock, we might - * deadlock with another locker that has acquired the - * heavyweight tuple lock and is waiting for our - * transaction to finish. - * - * Note that in this case we still need to wait for - * the multixact if required, to avoid acquiring - * conflicting locks. - */ - skip_tuple_lock = true; - } - } - - if (members) - pfree(members); - } - else if (TransactionIdIsCurrentTransactionId(xwait)) - { - switch (mode) - { - case LockTupleKeyShare: - Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) || - HEAP_XMAX_IS_SHR_LOCKED(infomask) || - HEAP_XMAX_IS_EXCL_LOCKED(infomask)); - result = TM_Ok; - goto out_unlocked; - case LockTupleShare: - if (HEAP_XMAX_IS_SHR_LOCKED(infomask) || - HEAP_XMAX_IS_EXCL_LOCKED(infomask)) - { - result = TM_Ok; - goto out_unlocked; - } - break; - case LockTupleNoKeyExclusive: - if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) - { - result = TM_Ok; - goto out_unlocked; - } - break; - case LockTupleExclusive: - if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) && - infomask2 & HEAP_KEYS_UPDATED) - { - result = TM_Ok; - goto out_unlocked; - } - break; - } - } - } - - /* - * Initially assume that we will have to wait for the locking - * transaction(s) to finish. We check various cases below in which - * this can be turned off. - */ - require_sleep = true; - if (mode == LockTupleKeyShare) - { - /* - * If we're requesting KeyShare, and there's no update present, we - * don't need to wait. Even if there is an update, we can still - * continue if the key hasn't been modified. - * - * However, if there are updates, we need to walk the update chain - * to mark future versions of the row as locked, too. That way, - * if somebody deletes that future version, we're protected - * against the key going away. This locking of future versions - * could block momentarily, if a concurrent transaction is - * deleting a key; or it could return a value to the effect that - * the transaction deleting the key has already committed. So we - * do this before re-locking the buffer; otherwise this would be - * prone to deadlocks. - * - * Note that the TID we're locking was grabbed before we unlocked - * the buffer. For it to change while we're not looking, the - * other properties we're testing for below after re-locking the - * buffer would also change, in which case we would restart this - * loop above. - */ - if (!(infomask2 & HEAP_KEYS_UPDATED)) - { - bool updated; - - updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask); - - /* - * If there are updates, follow the update chain; bail out if - * that cannot be done. - */ - if (follow_updates && updated) - { - TM_Result res; - - res = tdeheap_lock_updated_tuple(relation, tuple, &t_ctid, - GetCurrentTransactionId(), - mode); - if (res != TM_Ok) - { - result = res; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - } - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * Make sure it's still an appropriate lock, else start over. 
- * Also, if it wasn't updated before we released the lock, but - * is updated now, we start over too; the reason is that we - * now need to follow the update chain to lock the new - * versions. - */ - if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) && - ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) || - !updated)) - goto l3; - - /* Things look okay, so we can skip sleeping */ - require_sleep = false; - - /* - * Note we allow Xmax to change here; other updaters/lockers - * could have modified it before we grabbed the buffer lock. - * However, this is not a problem, because with the recheck we - * just did we ensure that they still don't conflict with the - * lock we want. - */ - } - } - else if (mode == LockTupleShare) - { - /* - * If we're requesting Share, we can similarly avoid sleeping if - * there's no update and no exclusive lock present. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && - !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) - { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * Make sure it's still an appropriate lock, else start over. - * See above about allowing xmax to change. - */ - if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || - HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask)) - goto l3; - require_sleep = false; - } - } - else if (mode == LockTupleNoKeyExclusive) - { - /* - * If we're requesting NoKeyExclusive, we might also be able to - * avoid sleeping; just ensure that there no conflicting lock - * already acquired. - */ - if (infomask & HEAP_XMAX_IS_MULTI) - { - if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask, - mode, NULL)) - { - /* - * No conflict, but if the xmax changed under us in the - * meantime, start over. - */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - - /* otherwise, we're good */ - require_sleep = false; - } - } - else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) - { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* if the xmax changed in the meantime, start over */ - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - /* otherwise, we're good */ - require_sleep = false; - } - } - - /* - * As a check independent from those above, we can also avoid sleeping - * if the current transaction is the sole locker of the tuple. Note - * that the strength of the lock already held is irrelevant; this is - * not about recording the lock in Xmax (which will be done regardless - * of this optimization, below). Also, note that the cases where we - * hold a lock stronger than we are requesting are already handled - * above by not doing anything. - * - * Note we only deal with the non-multixact case here; MultiXactIdWait - * is well equipped to deal with this situation on its own. - */ - if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) && - TransactionIdIsCurrentTransactionId(xwait)) - { - /* ... but if the xmax changed in the meantime, start over */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask)); - require_sleep = false; - } - - /* - * Time to sleep on the other transaction/multixact, if necessary. 
- * - * If the other transaction is an update/delete that's already - * committed, then sleeping cannot possibly do any good: if we're - * required to sleep, get out to raise an error instead. - * - * By here, we either have already acquired the buffer exclusive lock, - * or we must wait for the locking transaction or multixact; so below - * we ensure that we grab buffer lock after the sleep. - */ - if (require_sleep && (result == TM_Updated || result == TM_Deleted)) - { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - else if (require_sleep) - { - /* - * Acquire tuple lock to establish our priority for the tuple, or - * die trying. LockTuple will release us when we are next-in-line - * for the tuple. We must do this even if we are share-locking, - * but not if we already have a weaker lock on the tuple. - * - * If we are forced to "start over" below, we keep the tuple lock; - * this arranges that we stay at the head of the line while - * rechecking tuple state. - */ - if (!skip_tuple_lock && - !tdeheap_acquire_tuplock(relation, tid, mode, wait_policy, - &have_tuple_lock)) - { - /* - * This can only happen if wait_policy is Skip and the lock - * couldn't be obtained. - */ - result = TM_WouldBlock; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - - if (infomask & HEAP_XMAX_IS_MULTI) - { - MultiXactStatus status = get_mxact_status_for_lock(mode, false); - - /* We only ever lock tuples, never update them */ - if (status >= MultiXactStatusNoKeyUpdate) - elog(ERROR, "invalid lock mode in tdeheap_lock_tuple"); - - /* wait for multixact to end, or die trying */ - switch (wait_policy) - { - case LockWaitBlock: - MultiXactIdWait((MultiXactId) xwait, status, infomask, - relation, &tuple->t_self, XLTW_Lock, NULL); - break; - case LockWaitSkip: - if (!ConditionalMultiXactIdWait((MultiXactId) xwait, - status, infomask, relation, - NULL)) - { - result = TM_WouldBlock; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - break; - case LockWaitError: - if (!ConditionalMultiXactIdWait((MultiXactId) xwait, - status, infomask, relation, - NULL)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - - break; - } - - /* - * Of course, the multixact might not be done here: if we're - * requesting a light lock mode, other transactions with light - * locks could still be alive, as well as locks owned by our - * own xact or other subxacts of this backend. We need to - * preserve the surviving MultiXact members. Note that it - * isn't absolutely necessary in the latter case, but doing so - * is simpler. 
- */ - } - else - { - /* wait for regular transaction to end, or die trying */ - switch (wait_policy) - { - case LockWaitBlock: - XactLockTableWait(xwait, relation, &tuple->t_self, - XLTW_Lock); - break; - case LockWaitSkip: - if (!ConditionalXactLockTableWait(xwait)) - { - result = TM_WouldBlock; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - break; - case LockWaitError: - if (!ConditionalXactLockTableWait(xwait)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - break; - } - } - - /* if there are updates, follow the update chain */ - if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) - { - TM_Result res; - - res = tdeheap_lock_updated_tuple(relation, tuple, &t_ctid, - GetCurrentTransactionId(), - mode); - if (res != TM_Ok) - { - result = res; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - } - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. - */ - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - - if (!(infomask & HEAP_XMAX_IS_MULTI)) - { - /* - * Otherwise check if it committed or aborted. Note we cannot - * be here if the tuple was only locked by somebody who didn't - * conflict with us; that would have been handled above. So - * that transaction must necessarily be gone by now. But - * don't check for this in the multixact case, because some - * locker transactions might still be running. - */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); - } - } - - /* By here, we're certain that we hold buffer exclusive lock again */ - - /* - * We may lock if previous xmax aborted, or if it committed but only - * locked the tuple without updating it; or if we didn't have to wait - * at all for whatever reason. - */ - if (!require_sleep || - (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || - HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tuple->t_data)) - result = TM_Ok; - else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) - result = TM_Updated; - else - result = TM_Deleted; - } - -failed: - if (result != TM_Ok) - { - Assert(result == TM_SelfModified || result == TM_Updated || - result == TM_Deleted || result == TM_WouldBlock); - - /* - * When locking a tuple under LockWaitSkip semantics and we fail with - * TM_WouldBlock above, it's possible for concurrent transactions to - * release the lock and set HEAP_XMAX_INVALID in the meantime. So - * this assert is slightly different from the equivalent one in - * tdeheap_delete and tdeheap_update. 
- */ - Assert((result == TM_WouldBlock) || - !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); - Assert(result != TM_Updated || - !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)); - tmfd->ctid = tuple->t_data->t_ctid; - tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); - if (result == TM_SelfModified) - tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); - else - tmfd->cmax = InvalidCommandId; - goto out_locked; - } - - /* - * If we didn't pin the visibility map page and the page has become all - * visible while we were busy locking the buffer, or during some - * subsequent window during which we had it unlocked, we'll have to unlock - * and re-lock, to avoid holding the buffer lock across I/O. That's a bit - * unfortunate, especially since we'll now have to recheck whether the - * tuple has been locked or updated under us, but hopefully it won't - * happen very often. - */ - if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) - { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - tdeheap_visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto l3; - } - - xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); - old_infomask = tuple->t_data->t_infomask; - - /* - * If this is the first possibly-multixact-able operation in the current - * transaction, set my per-backend OldestMemberMXactId setting. We can be - * certain that the transaction will never become a member of any older - * MultiXactIds than that. (We have to do this even if we end up just - * using our own TransactionId below, since some other backend could - * incorporate our XID into a MultiXact immediately afterwards.) - */ - MultiXactIdSetOldestMember(); - - /* - * Compute the new xmax and infomask to store into the tuple. Note we do - * not modify the tuple just yet, because that would leave it in the wrong - * state if multixact.c elogs. - */ - compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2, - GetCurrentTransactionId(), mode, false, - &xid, &new_infomask, &new_infomask2); - - START_CRIT_SECTION(); - - /* - * Store transaction information of xact locking the tuple. - * - * Note: Cmax is meaningless in this context, so don't set it; this avoids - * possibly generating a useless combo CID. Moreover, if we're locking a - * previously updated tuple, it's important to preserve the Cmax. - * - * Also reset the HOT UPDATE bit, but only if there's no update; otherwise - * we would break the HOT chain. - */ - tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS; - tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; - tuple->t_data->t_infomask |= new_infomask; - tuple->t_data->t_infomask2 |= new_infomask2; - if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) - HeapTupleHeaderClearHotUpdated(tuple->t_data); - HeapTupleHeaderSetXmax(tuple->t_data, xid); - - /* - * Make sure there is no forward chain link in t_ctid. Note that in the - * cases where the tuple has been updated, we must not overwrite t_ctid, - * because it was set by the updater. Moreover, if the tuple has been - * updated, we need to follow the update chain to lock the new versions of - * the tuple as well. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) - tuple->t_data->t_ctid = *tid; - - /* Clear only the all-frozen bit on visibility map if needed */ - if (PageIsAllVisible(page) && - tdeheap_visibilitymap_clear(relation, block, vmbuffer, - VISIBILITYMAP_ALL_FROZEN)) - cleared_all_frozen = true; - - - MarkBufferDirty(*buffer); - - /* - * XLOG stuff. 
You might think that we don't need an XLOG record because - * there is no state change worth restoring after a crash. You would be - * wrong however: we have just written either a TransactionId or a - * MultiXactId that may never have been seen on disk before, and we need - * to make sure that there are XLOG entries covering those ID numbers. - * Else the same IDs might be re-used after a crash, which would be - * disastrous if this page made it to disk before the crash. Essentially - * we have to enforce the WAL log-before-data rule even in this case. - * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG - * entries for everything anyway.) - */ - if (RelationNeedsWAL(relation)) - { - xl_tdeheap_lock xlrec; - XLogRecPtr recptr; - - XLogBeginInsert(); - XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); - - xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); - xlrec.xmax = xid; - xlrec.infobits_set = compute_infobits(new_infomask, - tuple->t_data->t_infomask2); - xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData((char *) &xlrec, SizeOfHeapLock); - - /* we don't decode row locks atm, so no need to log the origin */ - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - result = TM_Ok; - -out_locked: - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - -out_unlocked: - if (BufferIsValid(vmbuffer)) - ReleaseBuffer(vmbuffer); - - /* - * Don't update the visibility map here. Locking a tuple doesn't change - * visibility info. - */ - - /* - * Now that we have successfully marked the tuple as locked, we can - * release the lmgr tuple lock, if we had it. - */ - if (have_tuple_lock) - UnlockTupleTuplock(relation, tid, mode); - - return result; -} - -/* - * Acquire heavyweight lock on the given tuple, in preparation for acquiring - * its normal, Xmax-based tuple lock. - * - * have_tuple_lock is an input and output parameter: on input, it indicates - * whether the lock has previously been acquired (and this function does - * nothing in that case). If this function returns success, have_tuple_lock - * has been flipped to true. - * - * Returns false if it was unable to obtain the lock; this can only happen if - * wait_policy is Skip. - */ -static bool -tdeheap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, - LockWaitPolicy wait_policy, bool *have_tuple_lock) -{ - if (*have_tuple_lock) - return true; - - switch (wait_policy) - { - case LockWaitBlock: - LockTupleTuplock(relation, tid, mode); - break; - - case LockWaitSkip: - if (!ConditionalLockTupleTuplock(relation, tid, mode)) - return false; - break; - - case LockWaitError: - if (!ConditionalLockTupleTuplock(relation, tid, mode)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - break; - } - *have_tuple_lock = true; - - return true; -} - -/* - * Given an original set of Xmax and infomask, and a transaction (identified by - * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and - * corresponding infomasks to use on the tuple. - * - * Note that this might have side effects such as creating a new MultiXactId. - * - * Most callers will have called HeapTupleSatisfiesUpdate before this function; - * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId - * but it was not running anymore. 
There is a race condition, which is that the - * MultiXactId may have finished since then, but that uncommon case is handled - * either here, or within MultiXactIdExpand. - * - * There is a similar race condition possible when the old xmax was a regular - * TransactionId. We test TransactionIdIsInProgress again just to narrow the - * window, but it's still possible to end up creating an unnecessary - * MultiXactId. Fortunately this is harmless. - */ -static void -compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, - uint16 old_infomask2, TransactionId add_to_xmax, - LockTupleMode mode, bool is_update, - TransactionId *result_xmax, uint16 *result_infomask, - uint16 *result_infomask2) -{ - TransactionId new_xmax; - uint16 new_infomask, - new_infomask2; - - Assert(TransactionIdIsCurrentTransactionId(add_to_xmax)); - -l5: - new_infomask = 0; - new_infomask2 = 0; - if (old_infomask & HEAP_XMAX_INVALID) - { - /* - * No previous locker; we just insert our own TransactionId. - * - * Note that it's critical that this case be the first one checked, - * because there are several blocks below that come back to this one - * to implement certain optimizations; old_infomask might contain - * other dirty bits in those cases, but we don't really care. - */ - if (is_update) - { - new_xmax = add_to_xmax; - if (mode == LockTupleExclusive) - new_infomask2 |= HEAP_KEYS_UPDATED; - } - else - { - new_infomask |= HEAP_XMAX_LOCK_ONLY; - switch (mode) - { - case LockTupleKeyShare: - new_xmax = add_to_xmax; - new_infomask |= HEAP_XMAX_KEYSHR_LOCK; - break; - case LockTupleShare: - new_xmax = add_to_xmax; - new_infomask |= HEAP_XMAX_SHR_LOCK; - break; - case LockTupleNoKeyExclusive: - new_xmax = add_to_xmax; - new_infomask |= HEAP_XMAX_EXCL_LOCK; - break; - case LockTupleExclusive: - new_xmax = add_to_xmax; - new_infomask |= HEAP_XMAX_EXCL_LOCK; - new_infomask2 |= HEAP_KEYS_UPDATED; - break; - default: - new_xmax = InvalidTransactionId; /* silence compiler */ - elog(ERROR, "invalid lock mode"); - } - } - } - else if (old_infomask & HEAP_XMAX_IS_MULTI) - { - MultiXactStatus new_status; - - /* - * Currently we don't allow XMAX_COMMITTED to be set for multis, so - * cross-check. - */ - Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); - - /* - * A multixact together with LOCK_ONLY set but neither lock bit set - * (i.e. a pg_upgraded share locked tuple) cannot possibly be running - * anymore. This check is critical for databases upgraded by - * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume - * that such multis are never passed. - */ - if (HEAP_LOCKED_UPGRADED(old_infomask)) - { - old_infomask &= ~HEAP_XMAX_IS_MULTI; - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - - /* - * If the XMAX is already a MultiXactId, then we need to expand it to - * include add_to_xmax; but if all the members were lockers and are - * all gone, we can do away with the IS_MULTI bit and just set - * add_to_xmax as the only locker/updater. If all lockers are gone - * and we have an updater that aborted, we can also do without a - * multi. - * - * The cost of doing GetMultiXactIdMembers would be paid by - * MultiXactIdExpand if we weren't to do this, so this check is not - * incurring extra work anyhow. - */ - if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) || - !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax, - old_infomask))) - { - /* - * Reset these bits and restart; otherwise fall through to - * create a new multi below. 
- */ - old_infomask &= ~HEAP_XMAX_IS_MULTI; - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - } - - new_status = get_mxact_status_for_lock(mode, is_update); - - new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax, - new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else if (old_infomask & HEAP_XMAX_COMMITTED) - { - /* - * It's a committed update, so we need to preserve him as updater of - * the tuple. - */ - MultiXactStatus status; - MultiXactStatus new_status; - - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusUpdate; - else - status = MultiXactStatusNoKeyUpdate; - - new_status = get_mxact_status_for_lock(mode, is_update); - - /* - * since it's not running, it's obviously impossible for the old - * updater to be identical to the current one, so we need not check - * for that case as we do in the block above. - */ - new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else if (TransactionIdIsInProgress(xmax)) - { - /* - * If the XMAX is a valid, in-progress TransactionId, then we need to - * create a new MultiXactId that includes both the old locker or - * updater and our own TransactionId. - */ - MultiXactStatus new_status; - MultiXactStatus old_status; - LockTupleMode old_mode; - - if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) - { - if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) - old_status = MultiXactStatusForKeyShare; - else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) - old_status = MultiXactStatusForShare; - else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) - { - if (old_infomask2 & HEAP_KEYS_UPDATED) - old_status = MultiXactStatusForUpdate; - else - old_status = MultiXactStatusForNoKeyUpdate; - } - else - { - /* - * LOCK_ONLY can be present alone only when a page has been - * upgraded by pg_upgrade. But in that case, - * TransactionIdIsInProgress() should have returned false. We - * assume it's no longer locked in this case. - */ - elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); - old_infomask |= HEAP_XMAX_INVALID; - old_infomask &= ~HEAP_XMAX_LOCK_ONLY; - goto l5; - } - } - else - { - /* it's an update, but which kind? */ - if (old_infomask2 & HEAP_KEYS_UPDATED) - old_status = MultiXactStatusUpdate; - else - old_status = MultiXactStatusNoKeyUpdate; - } - - old_mode = TUPLOCK_from_mxstatus(old_status); - - /* - * If the lock to be acquired is for the same TransactionId as the - * existing lock, there's an optimization possible: consider only the - * strongest of both locks as the only one present, and restart. - */ - if (xmax == add_to_xmax) - { - /* - * Note that it's not possible for the original tuple to be - * updated: we wouldn't be here because the tuple would have been - * invisible and we wouldn't try to update it. As a subtlety, - * this code can also run when traversing an update chain to lock - * future versions of a tuple. But we wouldn't be here either, - * because the add_to_xmax would be different from the original - * updater. 
- */ - Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); - - /* acquire the strongest of both */ - if (mode < old_mode) - mode = old_mode; - /* mustn't touch is_update */ - - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - - /* otherwise, just fall back to creating a new multixact */ - new_status = get_mxact_status_for_lock(mode, is_update); - new_xmax = MultiXactIdCreate(xmax, old_status, - add_to_xmax, new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && - TransactionIdDidCommit(xmax)) - { - /* - * It's a committed update, so we gotta preserve him as updater of the - * tuple. - */ - MultiXactStatus status; - MultiXactStatus new_status; - - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusUpdate; - else - status = MultiXactStatusNoKeyUpdate; - - new_status = get_mxact_status_for_lock(mode, is_update); - - /* - * since it's not running, it's obviously impossible for the old - * updater to be identical to the current one, so we need not check - * for that case as we do in the block above. - */ - new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else - { - /* - * Can get here iff the locking/updating transaction was running when - * the infomask was extracted from the tuple, but finished before - * TransactionIdIsInProgress got to run. Deal with it as if there was - * no locker at all in the first place. - */ - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - - *result_infomask = new_infomask; - *result_infomask2 = new_infomask2; - *result_xmax = new_xmax; -} - -/* - * Subroutine for tdeheap_lock_updated_tuple_rec. - * - * Given a hypothetical multixact status held by the transaction identified - * with the given xid, does the current transaction need to wait, fail, or can - * it continue if it wanted to acquire a lock of the given mode? "needwait" - * is set to true if waiting is necessary; if it can continue, then TM_Ok is - * returned. If the lock is already held by the current transaction, return - * TM_SelfModified. In case of a conflict with another transaction, a - * different HeapTupleSatisfiesUpdate return code is returned. - * - * The held status is said to be hypothetical because it might correspond to a - * lock held by a single Xid, i.e. not a real MultiXactId; we express it this - * way for simplicity of API. - */ -static TM_Result -test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, - LockTupleMode mode, HeapTuple tup, - bool *needwait) -{ - MultiXactStatus wantedstatus; - - *needwait = false; - wantedstatus = get_mxact_status_for_lock(mode, false); - - /* - * Note: we *must* check TransactionIdIsInProgress before - * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c - * for an explanation. - */ - if (TransactionIdIsCurrentTransactionId(xid)) - { - /* - * The tuple has already been locked by our own transaction. This is - * very rare but can happen if multiple transactions are trying to - * lock an ancient version of the same tuple. - */ - return TM_SelfModified; - } - else if (TransactionIdIsInProgress(xid)) - { - /* - * If the locking transaction is running, what we do depends on - * whether the lock modes conflict: if they do, then we must wait for - * it to finish; otherwise we can fall through to lock this tuple - * version without waiting. 
- */ - if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), - LOCKMODE_from_mxstatus(wantedstatus))) - { - *needwait = true; - } - - /* - * If we set needwait above, then this value doesn't matter; - * otherwise, this value signals to caller that it's okay to proceed. - */ - return TM_Ok; - } - else if (TransactionIdDidAbort(xid)) - return TM_Ok; - else if (TransactionIdDidCommit(xid)) - { - /* - * The other transaction committed. If it was only a locker, then the - * lock is completely gone now and we can return success; but if it - * was an update, then what we do depends on whether the two lock - * modes conflict. If they conflict, then we must report error to - * caller. But if they don't, we can fall through to allow the current - * transaction to lock the tuple. - * - * Note: the reason we worry about ISUPDATE here is because as soon as - * a transaction ends, all its locks are gone and meaningless, and - * thus we can ignore them; whereas its updates persist. In the - * TransactionIdIsInProgress case, above, we don't need to check - * because we know the lock is still "alive" and thus a conflict needs - * always be checked. - */ - if (!ISUPDATE_from_mxstatus(status)) - return TM_Ok; - - if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), - LOCKMODE_from_mxstatus(wantedstatus))) - { - /* bummer */ - if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid)) - return TM_Updated; - else - return TM_Deleted; - } - - return TM_Ok; - } - - /* Not in progress, not aborted, not committed -- must have crashed */ - return TM_Ok; -} - - -/* - * Recursive part of tdeheap_lock_updated_tuple - * - * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given - * xid with the given mode; if this tuple is updated, recurse to lock the new - * version as well. - */ -static TM_Result -tdeheap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, - LockTupleMode mode) -{ - TM_Result result; - ItemPointerData tupid; - HeapTupleData mytup; - Buffer buf; - uint16 new_infomask, - new_infomask2, - old_infomask, - old_infomask2; - TransactionId xmax, - new_xmax; - TransactionId priorXmax = InvalidTransactionId; - bool cleared_all_frozen = false; - bool pinned_desired_page; - Buffer vmbuffer = InvalidBuffer; - BlockNumber block; - - ItemPointerCopy(tid, &tupid); - - for (;;) - { - new_infomask = 0; - new_xmax = InvalidTransactionId; - block = ItemPointerGetBlockNumber(&tupid); - ItemPointerCopy(&tupid, &(mytup.t_self)); - - if (!tdeheap_fetch(rel, SnapshotAny, &mytup, &buf, false)) - { - /* - * if we fail to find the updated version of the tuple, it's - * because it was vacuumed/pruned away after its creator - * transaction aborted. So behave as if we got to the end of the - * chain, and there's no further tuple to lock: return success to - * caller. - */ - result = TM_Ok; - goto out_unlocked; - } - -l4: - CHECK_FOR_INTERRUPTS(); - - /* - * Before locking the buffer, pin the visibility map page if it - * appears to be necessary. Since we haven't got the lock yet, - * someone else might be in the middle of changing this, so we'll need - * to recheck after we have the lock. 
- */ - if (PageIsAllVisible(BufferGetPage(buf))) - { - tdeheap_visibilitymap_pin(rel, block, &vmbuffer); - pinned_desired_page = true; - } - else - pinned_desired_page = false; - - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - - /* - * If we didn't pin the visibility map page and the page has become - * all visible while we were busy locking the buffer, we'll have to - * unlock and re-lock, to avoid holding the buffer lock across I/O. - * That's a bit unfortunate, but hopefully shouldn't happen often. - * - * Note: in some paths through this function, we will reach here - * holding a pin on a vm page that may or may not be the one matching - * this page. If this page isn't all-visible, we won't use the vm - * page, but we hold onto such a pin till the end of the function. - */ - if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf))) - { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - tdeheap_visibilitymap_pin(rel, block, &vmbuffer); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - } - - /* - * Check the tuple XMIN against prior XMAX, if any. If we reached the - * end of the chain, we're done, so return success. - */ - if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), - priorXmax)) - { - result = TM_Ok; - goto out_locked; - } - - /* - * Also check Xmin: if this tuple was created by an aborted - * (sub)transaction, then we already locked the last live one in the - * chain, thus we're done, so return success. - */ - if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) - { - result = TM_Ok; - goto out_locked; - } - - old_infomask = mytup.t_data->t_infomask; - old_infomask2 = mytup.t_data->t_infomask2; - xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); - - /* - * If this tuple version has been updated or locked by some concurrent - * transaction(s), what we do depends on whether our lock mode - * conflicts with what those other transactions hold, and also on the - * status of them. - */ - if (!(old_infomask & HEAP_XMAX_INVALID)) - { - TransactionId rawxmax; - bool needwait; - - rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data); - if (old_infomask & HEAP_XMAX_IS_MULTI) - { - int nmembers; - int i; - MultiXactMember *members; - - /* - * We don't need a test for pg_upgrade'd tuples: this is only - * applied to tuples after the first in an update chain. Said - * first tuple in the chain may well be locked-in-9.2-and- - * pg_upgraded, but that one was already locked by our caller, - * not us; and any subsequent ones cannot be because our - * caller must necessarily have obtained a snapshot later than - * the pg_upgrade itself. - */ - Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask)); - - nmembers = GetMultiXactIdMembers(rawxmax, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); - for (i = 0; i < nmembers; i++) - { - result = test_lockmode_for_conflict(members[i].status, - members[i].xid, - mode, - &mytup, - &needwait); - - /* - * If the tuple was already locked by ourselves in a - * previous iteration of this (say tdeheap_lock_tuple was - * forced to restart the locking loop because of a change - * in xmax), then we hold the lock already on this tuple - * version and we don't need to do anything; and this is - * not an error condition either. We just need to skip - * this tuple and continue locking the next version in the - * update chain. 
- */ - if (result == TM_SelfModified) - { - pfree(members); - goto next; - } - - if (needwait) - { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - XactLockTableWait(members[i].xid, rel, - &mytup.t_self, - XLTW_LockUpdated); - pfree(members); - goto l4; - } - if (result != TM_Ok) - { - pfree(members); - goto out_locked; - } - } - if (members) - pfree(members); - } - else - { - MultiXactStatus status; - - /* - * For a non-multi Xmax, we first need to compute the - * corresponding MultiXactStatus by using the infomask bits. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) - { - if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) - status = MultiXactStatusForKeyShare; - else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) - status = MultiXactStatusForShare; - else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) - { - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusForUpdate; - else - status = MultiXactStatusForNoKeyUpdate; - } - else - { - /* - * LOCK_ONLY present alone (a pg_upgraded tuple marked - * as share-locked in the old cluster) shouldn't be - * seen in the middle of an update chain. - */ - elog(ERROR, "invalid lock status in tuple"); - } - } - else - { - /* it's an update, but which kind? */ - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusUpdate; - else - status = MultiXactStatusNoKeyUpdate; - } - - result = test_lockmode_for_conflict(status, rawxmax, mode, - &mytup, &needwait); - - /* - * If the tuple was already locked by ourselves in a previous - * iteration of this (say tdeheap_lock_tuple was forced to - * restart the locking loop because of a change in xmax), then - * we hold the lock already on this tuple version and we don't - * need to do anything; and this is not an error condition - * either. We just need to skip this tuple and continue - * locking the next version in the update chain. - */ - if (result == TM_SelfModified) - goto next; - - if (needwait) - { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - XactLockTableWait(rawxmax, rel, &mytup.t_self, - XLTW_LockUpdated); - goto l4; - } - if (result != TM_Ok) - { - goto out_locked; - } - } - } - - /* compute the new Xmax and infomask values for the tuple ... */ - compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2, - xid, mode, false, - &new_xmax, &new_infomask, &new_infomask2); - - if (PageIsAllVisible(BufferGetPage(buf)) && - tdeheap_visibilitymap_clear(rel, block, vmbuffer, - VISIBILITYMAP_ALL_FROZEN)) - cleared_all_frozen = true; - - START_CRIT_SECTION(); - - /* ... and set them */ - HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); - mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; - mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; - mytup.t_data->t_infomask |= new_infomask; - mytup.t_data->t_infomask2 |= new_infomask2; - - MarkBufferDirty(buf); - - /* XLOG stuff */ - if (RelationNeedsWAL(rel)) - { - xl_tdeheap_lock_updated xlrec; - XLogRecPtr recptr; - Page page = BufferGetPage(buf); - - XLogBeginInsert(); - XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - - xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self); - xlrec.xmax = new_xmax; - xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); - xlrec.flags = - cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; - - XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated); - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - -next: - /* if we find the end of update chain, we're done. 
*/ - if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || - HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) || - ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || - HeapTupleHeaderIsOnlyLocked(mytup.t_data)) - { - result = TM_Ok; - goto out_locked; - } - - /* tail recursion */ - priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); - ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); - UnlockReleaseBuffer(buf); - } - - result = TM_Ok; - -out_locked: - UnlockReleaseBuffer(buf); - -out_unlocked: - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - - return result; -} - -/* - * tdeheap_lock_updated_tuple - * Follow update chain when locking an updated tuple, acquiring locks (row - * marks) on the updated versions. - * - * The initial tuple is assumed to be already locked. - * - * This function doesn't check visibility, it just unconditionally marks the - * tuple(s) as locked. If any tuple in the updated chain is being deleted - * concurrently (or updated with the key being modified), sleep until the - * transaction doing it is finished. - * - * Note that we don't acquire heavyweight tuple locks on the tuples we walk - * when we have to wait for other transactions to release them, as opposed to - * what tdeheap_lock_tuple does. The reason is that having more than one - * transaction walking the chain is probably uncommon enough that risk of - * starvation is not likely: one of the preconditions for being here is that - * the snapshot in use predates the update that created this tuple (because we - * started at an earlier version of the tuple), but at the same time such a - * transaction cannot be using repeatable read or serializable isolation - * levels, because that would lead to a serializability failure. - */ -static TM_Result -tdeheap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, - TransactionId xid, LockTupleMode mode) -{ - /* - * If the tuple has not been updated, or has moved into another partition - * (effectively a delete) stop here. - */ - if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) && - !ItemPointerEquals(&tuple->t_self, ctid)) - { - /* - * If this is the first possibly-multixact-able operation in the - * current transaction, set my per-backend OldestMemberMXactId - * setting. We can be certain that the transaction will never become a - * member of any older MultiXactIds than that. (We have to do this - * even if we end up just using our own TransactionId below, since - * some other backend could incorporate our XID into a MultiXact - * immediately afterwards.) - */ - MultiXactIdSetOldestMember(); - - return tdeheap_lock_updated_tuple_rec(rel, ctid, xid, mode); - } - - /* nothing to lock */ - return TM_Ok; -} - -/* - * tdeheap_finish_speculative - mark speculative insertion as successful - * - * To successfully finish a speculative insertion we have to clear speculative - * token from tuple. To do so the t_ctid field, which will contain a - * speculative token value, is modified in place to point to the tuple itself, - * which is characteristic of a newly inserted ordinary tuple. - * - * NB: It is not ok to commit without either finishing or aborting a - * speculative insertion. We could treat speculative tuples of committed - * transactions implicitly as completed, but then we would have to be prepared - * to deal with speculative tokens on committed tuples. That wouldn't be - * difficult - no-one looks at the ctid field of a tuple with invalid xmax - - * but clearing the token at completion isn't very expensive either. 
- * An explicit confirmation WAL record also makes logical decoding simpler. - */ -void -tdeheap_finish_speculative(Relation relation, ItemPointer tid) -{ - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleHeader htup; - - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); - - offnum = ItemPointerGetOffsetNumber(tid); - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(ERROR, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - /* NO EREPORT(ERROR) from here till changes are logged */ - START_CRIT_SECTION(); - - Assert(HeapTupleHeaderIsSpeculative(htup)); - - MarkBufferDirty(buffer); - - /* - * Replace the speculative insertion token with a real t_ctid, pointing to - * itself like it does on regular tuples. - */ - htup->t_ctid = *tid; - - /* XLOG stuff */ - if (RelationNeedsWAL(relation)) - { - xl_tdeheap_confirm xlrec; - XLogRecPtr recptr; - - xlrec.offnum = ItemPointerGetOffsetNumber(tid); - - XLogBeginInsert(); - - /* We want the same filtering on this as on a plain insert */ - XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - - XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - UnlockReleaseBuffer(buffer); -} - -/* - * tdeheap_abort_speculative - kill a speculatively inserted tuple - * - * Marks a tuple that was speculatively inserted in the same command as dead, - * by setting its xmin as invalid. That makes it immediately appear as dead - * to all transactions, including our own. In particular, it makes - * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend - * inserting a duplicate key value won't unnecessarily wait for our whole - * transaction to finish (it'll just wait for our speculative insertion to - * finish). - * - * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks - * that arise due to a mutual dependency that is not user visible. By - * definition, unprincipled deadlocks cannot be prevented by the user - * reordering lock acquisition in client code, because the implementation level - * lock acquisitions are not under the user's direct control. If speculative - * inserters did not take this precaution, then under high concurrency they - * could deadlock with each other, which would not be acceptable. - * - * This is somewhat redundant with tdeheap_delete, but we prefer to have a - * dedicated routine with stripped down requirements. Note that this is also - * used to delete the TOAST tuples created during speculative insertion. - * - * This routine does not affect logical decoding as it only looks at - * confirmation records. - */ -void -tdeheap_abort_speculative(Relation relation, ItemPointer tid) -{ - TransactionId xid = GetCurrentTransactionId(); - ItemId lp; - HeapTupleData tp; - Page page; - BlockNumber block; - Buffer buffer; - TransactionId prune_xid; - - Assert(ItemPointerIsValid(tid)); - - block = ItemPointerGetBlockNumber(tid); - buffer = ReadBuffer(relation, block); - page = BufferGetPage(buffer); - - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * Page can't be all visible, we just inserted into it, and are still - * running. 
- */ - Assert(!PageIsAllVisible(page)); - - lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); - Assert(ItemIdIsNormal(lp)); - - tp.t_tableOid = RelationGetRelid(relation); - tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); - tp.t_len = ItemIdGetLength(lp); - tp.t_self = *tid; - - /* - * Sanity check that the tuple really is a speculatively inserted tuple, - * inserted by us. - */ - if (tp.t_data->t_choice.t_heap.t_xmin != xid) - elog(ERROR, "attempted to kill a tuple inserted by another transaction"); - if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data))) - elog(ERROR, "attempted to kill a non-speculative tuple"); - Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); - - /* - * No need to check for serializable conflicts here. There is never a - * need for a combo CID, either. No need to extract replica identity, or - * do anything special with infomask bits. - */ - - START_CRIT_SECTION(); - - /* - * The tuple will become DEAD immediately. Flag that this page is a - * candidate for pruning by setting xmin to TransactionXmin. While not - * immediately prunable, it is the oldest xid we can cheaply determine - * that's safe against wraparound / being older than the table's - * relfrozenxid. To defend against the unlikely case of a new relation - * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid - * if so (vacuum can't subsequently move relfrozenxid to beyond - * TransactionXmin, so there's no race here). - */ - Assert(TransactionIdIsValid(TransactionXmin)); - if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid)) - prune_xid = relation->rd_rel->relfrozenxid; - else - prune_xid = TransactionXmin; - PageSetPrunable(page, prune_xid); - - /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; - - /* - * Set the tuple header xmin to InvalidTransactionId. This makes the - * tuple immediately invisible everyone. (In particular, to any - * transactions waiting on the speculative token, woken up later.) - */ - HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); - - /* Clear the speculative insertion token too */ - tp.t_data->t_ctid = tp.t_self; - - MarkBufferDirty(buffer); - - /* - * XLOG stuff - * - * The WAL records generated here match tdeheap_delete(). The same recovery - * routines are used. 
- */ - if (RelationNeedsWAL(relation)) - { - xl_tdeheap_delete xlrec; - XLogRecPtr recptr; - - xlrec.flags = XLH_DELETE_IS_SUPER; - xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, - tp.t_data->t_infomask2); - xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); - xlrec.xmax = xid; - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - - /* No replica identity & replication origin logged */ - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - if (HeapTupleHasExternal(&tp)) - { - Assert(!IsToastRelation(relation)); - tdeheap_toast_delete(relation, &tp, true); - } - - /* - * Never need to mark tuple for invalidation, since catalogs don't support - * speculative insertion - */ - - /* Now we can release the buffer */ - ReleaseBuffer(buffer); - - /* count deletion, as we counted the insertion too */ - pgstat_count_tdeheap_delete(relation); -} - -/* - * tdeheap_inplace_update - update a tuple "in place" (ie, overwrite it) - * - * Overwriting violates both MVCC and transactional safety, so the uses - * of this function in Postgres are extremely limited. Nonetheless we - * find some places to use it. - * - * The tuple cannot change size, and therefore it's reasonable to assume - * that its null bitmap (if any) doesn't change either. So we just - * overwrite the data portion of the tuple without touching the null - * bitmap or any of the header fields. - * - * tuple is an in-memory tuple structure containing the data to be written - * over the target tuple. Also, tuple->t_self identifies the target tuple. - * - * Note that the tuple updated here had better not come directly from the - * syscache if the relation has a toast relation as this tuple could - * include toast values that have been expanded, causing a failure here. - */ -void -tdeheap_inplace_update(Relation relation, HeapTuple tuple) -{ - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleHeader htup; - uint32 oldlen; - uint32 newlen; - - /* - * For now, we don't allow parallel updates. Unlike a regular update, - * this should never create a combo CID, so it might be possible to relax - * this restriction, but not without more thought and testing. It's not - * clear that it would be useful, anyway. 
- */ - if (IsInParallelMode()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TRANSACTION_STATE), - errmsg("cannot update tuples during a parallel operation"))); - - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self))); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); - - offnum = ItemPointerGetOffsetNumber(&(tuple->t_self)); - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(ERROR, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - oldlen = ItemIdGetLength(lp) - htup->t_hoff; - newlen = tuple->t_len - tuple->t_data->t_hoff; - if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff) - elog(ERROR, "wrong tuple length"); - - /* NO EREPORT(ERROR) from here till changes are logged */ - START_CRIT_SECTION(); - - memcpy((char *) htup + htup->t_hoff, - (char *) tuple->t_data + tuple->t_data->t_hoff, - newlen); - - MarkBufferDirty(buffer); - - /* XLOG stuff */ - if (RelationNeedsWAL(relation)) - { - xl_tdeheap_inplace xlrec; - XLogRecPtr recptr; - - xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapInplace); - - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen); - - /* inplace updates aren't decoded atm, don't log the origin */ - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - UnlockReleaseBuffer(buffer); - - /* - * Send out shared cache inval if necessary. Note that because we only - * pass the new version of the tuple, this mustn't be used for any - * operations that could change catcache lookup keys. But we aren't - * bothering with index updates either, so that's true a fortiori. - */ - if (!IsBootstrapProcessingMode()) - CacheInvalidateHeapTuple(relation, tuple, NULL); -} - -#define FRM_NOOP 0x0001 -#define FRM_INVALIDATE_XMAX 0x0002 -#define FRM_RETURN_IS_XID 0x0004 -#define FRM_RETURN_IS_MULTI 0x0008 -#define FRM_MARK_COMMITTED 0x0010 - -/* - * FreezeMultiXactId - * Determine what to do during freezing when a tuple is marked by a - * MultiXactId. - * - * "flags" is an output value; it's used to tell caller what to do on return. - * "pagefrz" is an input/output value, used to manage page level freezing. - * - * Possible values that we can set in "flags": - * FRM_NOOP - * don't do anything -- keep existing Xmax - * FRM_INVALIDATE_XMAX - * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag. - * FRM_RETURN_IS_XID - * The Xid return value is a single update Xid to set as xmax. - * FRM_MARK_COMMITTED - * Xmax can be marked as HEAP_XMAX_COMMITTED - * FRM_RETURN_IS_MULTI - * The return value is a new MultiXactId to set as new Xmax. - * (caller must obtain proper infomask bits using GetMultiXactIdHintBits) - * - * Caller delegates control of page freezing to us. In practice we always - * force freezing of caller's page unless FRM_NOOP processing is indicated. - * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff - * can never be left behind. We freely choose when and how to process each - * Multi, without ever violating the cutoff postconditions for freezing. - * - * It's useful to remove Multis on a proactive timeline (relative to freezing - * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. 
It can also - * be cheaper in the short run, for us, since we too can avoid SLRU buffer - * misses through eager processing. - * - * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only - * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice. - * This can usually be put off, which is usually enough to avoid it altogether. - * Allocating new multis during VACUUM should be avoided on general principle; - * only VACUUM can advance relminmxid, so allocating new Multis here comes with - * its own special risks. - * - * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers - * using tdeheap_tuple_should_freeze when we haven't forced page-level freezing. - * - * NB: Caller should avoid needlessly calling tdeheap_tuple_should_freeze when we - * have already forced page-level freezing, since that might incur the same - * SLRU buffer misses that we specifically intended to avoid by freezing. - */ -static TransactionId -FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, - const struct VacuumCutoffs *cutoffs, uint16 *flags, - HeapPageFreeze *pagefrz) -{ - TransactionId newxmax; - MultiXactMember *members; - int nmembers; - bool need_replace; - int nnewmembers; - MultiXactMember *newmembers; - bool has_lockers; - TransactionId update_xid; - bool update_committed; - TransactionId FreezePageRelfrozenXid; - - *flags = 0; - - /* We should only be called in Multis */ - Assert(t_infomask & HEAP_XMAX_IS_MULTI); - - if (!MultiXactIdIsValid(multi) || - HEAP_LOCKED_UPGRADED(t_infomask)) - { - *flags |= FRM_INVALIDATE_XMAX; - pagefrz->freeze_required = true; - return InvalidTransactionId; - } - else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("found multixact %u from before relminmxid %u", - multi, cutoffs->relminmxid))); - else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact)) - { - TransactionId update_xact; - - /* - * This old multi cannot possibly have members still running, but - * verify just in case. If it was a locker only, it can be removed - * without any further consideration; but if it contained an update, - * we might need to preserve it. - */ - if (MultiXactIdIsRunning(multi, - HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running", - multi, cutoffs->OldestMxact))); - - if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)) - { - *flags |= FRM_INVALIDATE_XMAX; - pagefrz->freeze_required = true; - return InvalidTransactionId; - } - - /* replace multi with single XID for its updater? */ - update_xact = MultiXactIdGetUpdateXid(multi, t_infomask); - if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u", - multi, update_xact, - cutoffs->relfrozenxid))); - else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin)) - { - /* - * Updater XID has to have aborted (otherwise the tuple would have - * been pruned away instead, since updater XID is < OldestXmin). - * Just remove xmax. 
- */ - if (TransactionIdDidCommit(update_xact)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u", - multi, update_xact, - cutoffs->OldestXmin))); - *flags |= FRM_INVALIDATE_XMAX; - pagefrz->freeze_required = true; - return InvalidTransactionId; - } - - /* Have to keep updater XID as new xmax */ - *flags |= FRM_RETURN_IS_XID; - pagefrz->freeze_required = true; - return update_xact; - } - - /* - * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we - * need to walk the whole members array to figure out what to do, if - * anything. - */ - nmembers = - GetMultiXactIdMembers(multi, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)); - if (nmembers <= 0) - { - /* Nothing worth keeping */ - *flags |= FRM_INVALIDATE_XMAX; - pagefrz->freeze_required = true; - return InvalidTransactionId; - } - - /* - * The FRM_NOOP case is the only case where we might need to ratchet back - * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only - * case where our caller might ratchet back its NoFreezePageRelfrozenXid - * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi. - * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid - * trackers managed by VACUUM being ratcheting back by xmax to the degree - * required to make it safe to leave xmax undisturbed, independent of - * whether or not page freezing is triggered somewhere else. - * - * Our policy is to force freezing in every case other than FRM_NOOP, - * which obviates the need to maintain either set of trackers, anywhere. - * Every other case will reliably execute a freeze plan for xmax that - * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or - * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen. - * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with - * OldestXmin/OldestMxact, so later values never need to be tracked here.) - */ - need_replace = false; - FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid; - for (int i = 0; i < nmembers; i++) - { - TransactionId xid = members[i].xid; - - Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid)); - - if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) - { - /* Can't violate the FreezeLimit postcondition */ - need_replace = true; - break; - } - if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid)) - FreezePageRelfrozenXid = xid; - } - - /* Can't violate the MultiXactCutoff postcondition, either */ - if (!need_replace) - need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff); - - if (!need_replace) - { - /* - * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or - * both together to make it safe to retain this particular multi after - * freezing its page - */ - *flags |= FRM_NOOP; - pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid; - if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid)) - pagefrz->FreezePageRelminMxid = multi; - pfree(members); - return multi; - } - - /* - * Do a more thorough second pass over the multi to figure out which - * member XIDs actually need to be kept. Checking the precise status of - * individual members might even show that we don't need to keep anything. - * That is quite possible even though the Multi must be >= OldestMxact, - * since our second pass only keeps member XIDs when it's truly necessary; - * even member XIDs >= OldestXmin often won't be kept by second pass. 
- */ - nnewmembers = 0; - newmembers = palloc(sizeof(MultiXactMember) * nmembers); - has_lockers = false; - update_xid = InvalidTransactionId; - update_committed = false; - - /* - * Determine whether to keep each member xid, or to ignore it instead - */ - for (int i = 0; i < nmembers; i++) - { - TransactionId xid = members[i].xid; - MultiXactStatus mstatus = members[i].status; - - Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid)); - - if (!ISUPDATE_from_mxstatus(mstatus)) - { - /* - * Locker XID (not updater XID). We only keep lockers that are - * still running. - */ - if (TransactionIdIsCurrentTransactionId(xid) || - TransactionIdIsInProgress(xid)) - { - if (TransactionIdPrecedes(xid, cutoffs->OldestXmin)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u", - multi, xid, - cutoffs->OldestXmin))); - newmembers[nnewmembers++] = members[i]; - has_lockers = true; - } - - continue; - } - - /* - * Updater XID (not locker XID). Should we keep it? - * - * Since the tuple wasn't totally removed when vacuum pruned, the - * update Xid cannot possibly be older than OldestXmin cutoff unless - * the updater XID aborted. If the updater transaction is known - * aborted or crashed then it's okay to ignore it, otherwise not. - * - * In any case the Multi should never contain two updaters, whatever - * their individual commit status. Check for that first, in passing. - */ - if (TransactionIdIsValid(update_xid)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("multixact %u has two or more updating members", - multi), - errdetail_internal("First updater XID=%u second updater XID=%u.", - update_xid, xid))); - - /* - * As with all tuple visibility routines, it's critical to test - * TransactionIdIsInProgress before TransactionIdDidCommit, because of - * race conditions explained in detail in heapam_visibility.c. - */ - if (TransactionIdIsCurrentTransactionId(xid) || - TransactionIdIsInProgress(xid)) - update_xid = xid; - else if (TransactionIdDidCommit(xid)) - { - /* - * The transaction committed, so we can tell caller to set - * HEAP_XMAX_COMMITTED. (We can only do this because we know the - * transaction is not running.) - */ - update_committed = true; - update_xid = xid; - } - else - { - /* - * Not in progress, not committed -- must be aborted or crashed; - * we can ignore it. - */ - continue; - } - - /* - * We determined that updater must be kept -- add it to pending new - * members list - */ - if (TransactionIdPrecedes(xid, cutoffs->OldestXmin)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u", - multi, xid, cutoffs->OldestXmin))); - newmembers[nnewmembers++] = members[i]; - } - - pfree(members); - - /* - * Determine what to do with caller's multi based on information gathered - * during our second pass - */ - if (nnewmembers == 0) - { - /* Nothing worth keeping */ - *flags |= FRM_INVALIDATE_XMAX; - newxmax = InvalidTransactionId; - } - else if (TransactionIdIsValid(update_xid) && !has_lockers) - { - /* - * If there's a single member and it's an update, pass it back alone - * without creating a new Multi. (XXX we could do this when there's a - * single remaining locker, too, but that would complicate the API too - * much; moreover, the case with the single updater is more - * interesting, because those are longer-lived.) 
- */ - Assert(nnewmembers == 1); - *flags |= FRM_RETURN_IS_XID; - if (update_committed) - *flags |= FRM_MARK_COMMITTED; - newxmax = update_xid; - } - else - { - /* - * Create a new multixact with the surviving members of the previous - * one, to set as new Xmax in the tuple - */ - newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers); - *flags |= FRM_RETURN_IS_MULTI; - } - - pfree(newmembers); - - pagefrz->freeze_required = true; - return newxmax; -} - -/* - * tdeheap_prepare_freeze_tuple - * - * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) - * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so, - * setup enough state (in the *frz output argument) to enable caller to - * process this tuple as part of freezing its page, and return true. Return - * false if nothing can be changed about the tuple right now. - * - * Also sets *totally_frozen to true if the tuple will be totally frozen once - * caller executes returned freeze plan (or if the tuple was already totally - * frozen by an earlier VACUUM). This indicates that there are no remaining - * XIDs or MultiXactIds that will need to be processed by a future VACUUM. - * - * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every - * tuple that we returned true for, and call tdeheap_freeze_execute_prepared to - * execute freezing. Caller must initialize pagefrz fields for page as a - * whole before first call here for each heap page. - * - * VACUUM caller decides on whether or not to freeze the page as a whole. - * We'll often prepare freeze plans for a page that caller just discards. - * However, VACUUM doesn't always get to make a choice; it must freeze when - * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and - * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure - * that VACUUM always follows that rule. - * - * We sometimes force freezing of xmax MultiXactId values long before it is - * strictly necessary to do so just to ensure the FreezeLimit postcondition. - * It's worth processing MultiXactIds proactively when it is cheap to do so, - * and it's convenient to make that happen by piggy-backing it on the "force - * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds - * because it is expensive right now (though only when it's still possible to - * do so without violating the FreezeLimit/MultiXactCutoff postcondition). - * - * It is assumed that the caller has checked the tuple with - * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD - * (else we should be removing the tuple, not freezing it). - * - * NB: This function has side effects: it might allocate a new MultiXactId. - * It will be set as tuple's new xmax when our *frz output is processed within - * tdeheap_execute_freeze_tuple later on. If the tuple is in a shared buffer - * then caller had better have an exclusive lock on it already. 
- */ -bool -tdeheap_prepare_freeze_tuple(HeapTupleHeader tuple, - const struct VacuumCutoffs *cutoffs, - HeapPageFreeze *pagefrz, - HeapTupleFreeze *frz, bool *totally_frozen) -{ - bool xmin_already_frozen = false, - xmax_already_frozen = false; - bool freeze_xmin = false, - replace_xvac = false, - replace_xmax = false, - freeze_xmax = false; - TransactionId xid; - - frz->xmax = HeapTupleHeaderGetRawXmax(tuple); - frz->t_infomask2 = tuple->t_infomask2; - frz->t_infomask = tuple->t_infomask; - frz->frzflags = 0; - frz->checkflags = 0; - - /* - * Process xmin, while keeping track of whether it's already frozen, or - * will become frozen iff our freeze plan is executed by caller (could be - * neither). - */ - xid = HeapTupleHeaderGetXmin(tuple); - if (!TransactionIdIsNormal(xid)) - xmin_already_frozen = true; - else - { - if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("found xmin %u from before relfrozenxid %u", - xid, cutoffs->relfrozenxid))); - - /* Will set freeze_xmin flags in freeze plan below */ - freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin); - - /* Verify that xmin committed if and when freeze plan is executed */ - if (freeze_xmin) - frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED; - } - - /* - * Old-style VACUUM FULL is gone, but we have to process xvac for as long - * as we support having MOVED_OFF/MOVED_IN tuples in the database - */ - xid = HeapTupleHeaderGetXvac(tuple); - if (TransactionIdIsNormal(xid)) - { - Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); - Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin)); - - /* - * For Xvac, we always freeze proactively. This allows totally_frozen - * tracking to ignore xvac. - */ - replace_xvac = pagefrz->freeze_required = true; - - /* Will set replace_xvac flags in freeze plan below */ - } - - /* Now process xmax */ - xid = frz->xmax; - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - /* Raw xmax is a MultiXactId */ - TransactionId newxmax; - uint16 flags; - - /* - * We will either remove xmax completely (in the "freeze_xmax" path), - * process xmax by replacing it (in the "replace_xmax" path), or - * perform no-op xmax processing. The only constraint is that the - * FreezeLimit/MultiXactCutoff postcondition must never be violated. - */ - newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs, - &flags, pagefrz); - - if (flags & FRM_NOOP) - { - /* - * xmax is a MultiXactId, and nothing about it changes for now. - * This is the only case where 'freeze_required' won't have been - * set for us by FreezeMultiXactId, as well as the only case where - * neither freeze_xmax nor replace_xmax are set (given a multi). - * - * This is a no-op, but the call to FreezeMultiXactId might have - * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers - * for us (the "freeze page" variants, specifically). That'll - * make it safe for our caller to freeze the page later on, while - * leaving this particular xmax undisturbed. - * - * FreezeMultiXactId is _not_ responsible for the "no freeze" - * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our - * job. A call to tdeheap_tuple_should_freeze for this same tuple - * will take place below if 'freeze_required' isn't set already. - * (This repeats work from FreezeMultiXactId, but allows "no - * freeze" tracker maintenance to happen in only one place.) 
- */ - Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff)); - Assert(MultiXactIdIsValid(newxmax) && xid == newxmax); - } - else if (flags & FRM_RETURN_IS_XID) - { - /* - * xmax will become an updater Xid (original MultiXact's updater - * member Xid will be carried forward as a simple Xid in Xmax). - */ - Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin)); - - /* - * NB -- some of these transformations are only valid because we - * know the return Xid is a tuple updater (i.e. not merely a - * locker.) Also note that the only reason we don't explicitly - * worry about HEAP_KEYS_UPDATED is because it lives in - * t_infomask2 rather than t_infomask. - */ - frz->t_infomask &= ~HEAP_XMAX_BITS; - frz->xmax = newxmax; - if (flags & FRM_MARK_COMMITTED) - frz->t_infomask |= HEAP_XMAX_COMMITTED; - replace_xmax = true; - } - else if (flags & FRM_RETURN_IS_MULTI) - { - uint16 newbits; - uint16 newbits2; - - /* - * xmax is an old MultiXactId that we have to replace with a new - * MultiXactId, to carry forward two or more original member XIDs. - */ - Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact)); - - /* - * We can't use GetMultiXactIdHintBits directly on the new multi - * here; that routine initializes the masks to all zeroes, which - * would lose other bits we need. Doing it this way ensures all - * unrelated bits remain untouched. - */ - frz->t_infomask &= ~HEAP_XMAX_BITS; - frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; - GetMultiXactIdHintBits(newxmax, &newbits, &newbits2); - frz->t_infomask |= newbits; - frz->t_infomask2 |= newbits2; - frz->xmax = newxmax; - replace_xmax = true; - } - else - { - /* - * Freeze plan for tuple "freezes xmax" in the strictest sense: - * it'll leave nothing in xmax (neither an Xid nor a MultiXactId). - */ - Assert(flags & FRM_INVALIDATE_XMAX); - Assert(!TransactionIdIsValid(newxmax)); - - /* Will set freeze_xmax flags in freeze plan below */ - freeze_xmax = true; - } - - /* MultiXactId processing forces freezing (barring FRM_NOOP case) */ - Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax)); - } - else if (TransactionIdIsNormal(xid)) - { - /* Raw xmax is normal XID */ - if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("found xmax %u from before relfrozenxid %u", - xid, cutoffs->relfrozenxid))); - - /* Will set freeze_xmax flags in freeze plan below */ - freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin); - - /* - * Verify that xmax aborted if and when freeze plan is executed, - * provided it's from an update. (A lock-only xmax can be removed - * independent of this, since the lock is released at xact end.) - */ - if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED; - } - else if (!TransactionIdIsValid(xid)) - { - /* Raw xmax is InvalidTransactionId XID */ - Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0); - xmax_already_frozen = true; - } - else - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi", - xid, tuple->t_infomask))); - - if (freeze_xmin) - { - Assert(!xmin_already_frozen); - - frz->t_infomask |= HEAP_XMIN_FROZEN; - } - if (replace_xvac) - { - /* - * If a MOVED_OFF tuple is not dead, the xvac transaction must have - * failed; whereas a non-dead MOVED_IN tuple must mean the xvac - * transaction succeeded. 
- */ - Assert(pagefrz->freeze_required); - if (tuple->t_infomask & HEAP_MOVED_OFF) - frz->frzflags |= XLH_INVALID_XVAC; - else - frz->frzflags |= XLH_FREEZE_XVAC; - } - if (replace_xmax) - { - Assert(!xmax_already_frozen && !freeze_xmax); - Assert(pagefrz->freeze_required); - - /* Already set replace_xmax flags in freeze plan earlier */ - } - if (freeze_xmax) - { - Assert(!xmax_already_frozen && !replace_xmax); - - frz->xmax = InvalidTransactionId; - - /* - * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED + - * LOCKED. Normalize to INVALID just to be sure no one gets confused. - * Also get rid of the HEAP_KEYS_UPDATED bit. - */ - frz->t_infomask &= ~HEAP_XMAX_BITS; - frz->t_infomask |= HEAP_XMAX_INVALID; - frz->t_infomask2 &= ~HEAP_HOT_UPDATED; - frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; - } - - /* - * Determine if this tuple is already totally frozen, or will become - * totally frozen (provided caller executes freeze plans for the page) - */ - *totally_frozen = ((freeze_xmin || xmin_already_frozen) && - (freeze_xmax || xmax_already_frozen)); - - if (!pagefrz->freeze_required && !(xmin_already_frozen && - xmax_already_frozen)) - { - /* - * So far no previous tuple from the page made freezing mandatory. - * Does this tuple force caller to freeze the entire page? - */ - pagefrz->freeze_required = - tdeheap_tuple_should_freeze(tuple, cutoffs, - &pagefrz->NoFreezePageRelfrozenXid, - &pagefrz->NoFreezePageRelminMxid); - } - - /* Tell caller if this tuple has a usable freeze plan set in *frz */ - return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax; -} - -/* - * tdeheap_execute_freeze_tuple - * Execute the prepared freezing of a tuple with caller's freeze plan. - * - * Caller is responsible for ensuring that no other backend can access the - * storage underlying this tuple, either by holding an exclusive lock on the - * buffer containing it (which is what lazy VACUUM does), or by having it be - * in private storage (which is what CLUSTER and friends do). - */ -static inline void -tdeheap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz) -{ - HeapTupleHeaderSetXmax(tuple, frz->xmax); - - if (frz->frzflags & XLH_FREEZE_XVAC) - HeapTupleHeaderSetXvac(tuple, FrozenTransactionId); - - if (frz->frzflags & XLH_INVALID_XVAC) - HeapTupleHeaderSetXvac(tuple, InvalidTransactionId); - - tuple->t_infomask = frz->t_infomask; - tuple->t_infomask2 = frz->t_infomask2; -} - -/* - * tdeheap_freeze_execute_prepared - * - * Executes freezing of one or more heap tuples on a page on behalf of caller. - * Caller passes an array of tuple plans from tdeheap_prepare_freeze_tuple. - * Caller must set 'offset' in each plan for us. Note that we destructively - * sort caller's tuples array in-place, so caller had better be done with it. - * - * WAL-logs the changes so that VACUUM can advance the rel's relfrozenxid - * later on without any risk of unsafe pg_xact lookups, even following a hard - * crash (or when querying from a standby). We represent freezing by setting - * infomask bits in tuple headers, but this shouldn't be thought of as a hint. - * See section on buffer access rules in src/backend/storage/buffer/README. - */ -void -tdeheap_freeze_execute_prepared(Relation rel, Buffer buffer, - TransactionId snapshotConflictHorizon, - HeapTupleFreeze *tuples, int ntuples) -{ - Page page = BufferGetPage(buffer); - - Assert(ntuples > 0); - - /* - * Perform xmin/xmax XID status sanity checks before critical section. 
- * - * tdeheap_prepare_freeze_tuple doesn't perform these checks directly because - * pg_xact lookups are relatively expensive. They shouldn't be repeated - * by successive VACUUMs that each decide against freezing the same page. - */ - for (int i = 0; i < ntuples; i++) - { - HeapTupleFreeze *frz = tuples + i; - ItemId itemid = PageGetItemId(page, frz->offset); - HeapTupleHeader htup; - - htup = (HeapTupleHeader) PageGetItem(page, itemid); - - /* Deliberately avoid relying on tuple hint bits here */ - if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED) - { - TransactionId xmin = HeapTupleHeaderGetRawXmin(htup); - - Assert(!HeapTupleHeaderXminFrozen(htup)); - if (unlikely(!TransactionIdDidCommit(xmin))) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("uncommitted xmin %u needs to be frozen", - xmin))); - } - - /* - * TransactionIdDidAbort won't work reliably in the presence of XIDs - * left behind by transactions that were in progress during a crash, - * so we can only check that xmax didn't commit - */ - if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED) - { - TransactionId xmax = HeapTupleHeaderGetRawXmax(htup); - - Assert(TransactionIdIsNormal(xmax)); - if (unlikely(TransactionIdDidCommit(xmax))) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("cannot freeze committed xmax %u", - xmax))); - } - } - - START_CRIT_SECTION(); - - for (int i = 0; i < ntuples; i++) - { - HeapTupleFreeze *frz = tuples + i; - ItemId itemid = PageGetItemId(page, frz->offset); - HeapTupleHeader htup; - - htup = (HeapTupleHeader) PageGetItem(page, itemid); - tdeheap_execute_freeze_tuple(htup, frz); - } - - MarkBufferDirty(buffer); - - /* Now WAL-log freezing if necessary */ - if (RelationNeedsWAL(rel)) - { - xl_tdeheap_freeze_plan plans[MaxHeapTuplesPerPage]; - OffsetNumber offsets[MaxHeapTuplesPerPage]; - int nplans; - xl_tdeheap_freeze_page xlrec; - XLogRecPtr recptr; - - /* Prepare deduplicated representation for use in WAL record */ - nplans = tdeheap_log_freeze_plan(tuples, ntuples, plans, offsets); - - xlrec.snapshotConflictHorizon = snapshotConflictHorizon; - xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(rel); - xlrec.nplans = nplans; - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage); - - /* - * The freeze plan array and offset array are not actually in the - * buffer, but pretend that they are. When XLogInsert stores the - * whole buffer, the arrays need not be stored too. 
- */ - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) plans, - nplans * sizeof(xl_tdeheap_freeze_plan)); - XLogRegisterBufData(0, (char *) offsets, - ntuples * sizeof(OffsetNumber)); - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); -} - -/* - * Comparator used to deduplicate XLOG_HEAP2_FREEZE_PAGE freeze plans - */ -static int -tdeheap_log_freeze_cmp(const void *arg1, const void *arg2) -{ - HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1; - HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2; - - if (frz1->xmax < frz2->xmax) - return -1; - else if (frz1->xmax > frz2->xmax) - return 1; - - if (frz1->t_infomask2 < frz2->t_infomask2) - return -1; - else if (frz1->t_infomask2 > frz2->t_infomask2) - return 1; - - if (frz1->t_infomask < frz2->t_infomask) - return -1; - else if (frz1->t_infomask > frz2->t_infomask) - return 1; - - if (frz1->frzflags < frz2->frzflags) - return -1; - else if (frz1->frzflags > frz2->frzflags) - return 1; - - /* - * tdeheap_log_freeze_eq would consider these tuple-wise plans to be equal. - * (So the tuples will share a single canonical freeze plan.) - * - * We tiebreak on page offset number to keep each freeze plan's page - * offset number array individually sorted. (Unnecessary, but be tidy.) - */ - if (frz1->offset < frz2->offset) - return -1; - else if (frz1->offset > frz2->offset) - return 1; - - Assert(false); - return 0; -} - -/* - * Compare fields that describe actions required to freeze tuple with caller's - * open plan. If everything matches then the frz tuple plan is equivalent to - * caller's plan. - */ -static inline bool -tdeheap_log_freeze_eq(xl_tdeheap_freeze_plan *plan, HeapTupleFreeze *frz) -{ - if (plan->xmax == frz->xmax && - plan->t_infomask2 == frz->t_infomask2 && - plan->t_infomask == frz->t_infomask && - plan->frzflags == frz->frzflags) - return true; - - /* Caller must call tdeheap_log_freeze_new_plan again for frz */ - return false; -} - -/* - * Start new plan initialized using tuple-level actions. At least one tuple - * will have steps required to freeze described by caller's plan during REDO. - */ -static inline void -tdeheap_log_freeze_new_plan(xl_tdeheap_freeze_plan *plan, HeapTupleFreeze *frz) -{ - plan->xmax = frz->xmax; - plan->t_infomask2 = frz->t_infomask2; - plan->t_infomask = frz->t_infomask; - plan->frzflags = frz->frzflags; - plan->ntuples = 1; /* for now */ -} - -/* - * Deduplicate tuple-based freeze plans so that each distinct set of - * processing steps is only stored once in XLOG_HEAP2_FREEZE_PAGE records. - * Called during original execution of freezing (for logged relations). - * - * Return value is number of plans set in *plans_out for caller. Also writes - * an array of offset numbers into *offsets_out output argument for caller - * (actually there is one array per freeze plan, but that's not of immediate - * concern to our caller). 
- */ -static int -tdeheap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, - xl_tdeheap_freeze_plan *plans_out, - OffsetNumber *offsets_out) -{ - int nplans = 0; - - /* Sort tuple-based freeze plans in the order required to deduplicate */ - qsort(tuples, ntuples, sizeof(HeapTupleFreeze), tdeheap_log_freeze_cmp); - - for (int i = 0; i < ntuples; i++) - { - HeapTupleFreeze *frz = tuples + i; - - if (i == 0) - { - /* New canonical freeze plan starting with first tup */ - tdeheap_log_freeze_new_plan(plans_out, frz); - nplans++; - } - else if (tdeheap_log_freeze_eq(plans_out, frz)) - { - /* tup matches open canonical plan -- include tup in it */ - Assert(offsets_out[i - 1] < frz->offset); - plans_out->ntuples++; - } - else - { - /* Tup doesn't match current plan -- done with it now */ - plans_out++; - - /* New canonical freeze plan starting with this tup */ - tdeheap_log_freeze_new_plan(plans_out, frz); - nplans++; - } - - /* - * Save page offset number in dedicated buffer in passing. - * - * REDO routine relies on the record's offset numbers array grouping - * offset numbers by freeze plan. The sort order within each grouping - * is ascending offset number order, just to keep things tidy. - */ - offsets_out[i] = frz->offset; - } - - Assert(nplans > 0 && nplans <= ntuples); - - return nplans; -} - -/* - * tdeheap_freeze_tuple - * Freeze tuple in place, without WAL logging. - * - * Useful for callers like CLUSTER that perform their own WAL logging. - */ -bool -tdeheap_freeze_tuple(HeapTupleHeader tuple, - TransactionId relfrozenxid, TransactionId relminmxid, - TransactionId FreezeLimit, TransactionId MultiXactCutoff) -{ - HeapTupleFreeze frz; - bool do_freeze; - bool totally_frozen; - struct VacuumCutoffs cutoffs; - HeapPageFreeze pagefrz; - - cutoffs.relfrozenxid = relfrozenxid; - cutoffs.relminmxid = relminmxid; - cutoffs.OldestXmin = FreezeLimit; - cutoffs.OldestMxact = MultiXactCutoff; - cutoffs.FreezeLimit = FreezeLimit; - cutoffs.MultiXactCutoff = MultiXactCutoff; - - pagefrz.freeze_required = true; - pagefrz.FreezePageRelfrozenXid = FreezeLimit; - pagefrz.FreezePageRelminMxid = MultiXactCutoff; - pagefrz.NoFreezePageRelfrozenXid = FreezeLimit; - pagefrz.NoFreezePageRelminMxid = MultiXactCutoff; - - do_freeze = tdeheap_prepare_freeze_tuple(tuple, &cutoffs, - &pagefrz, &frz, &totally_frozen); - - /* - * Note that because this is not a WAL-logged operation, we don't need to - * fill in the offset in the freeze record. - */ - - if (do_freeze) - tdeheap_execute_freeze_tuple(tuple, &frz); - return do_freeze; -} - -/* - * For a given MultiXactId, return the hint bits that should be set in the - * tuple's infomask. - * - * Normally this should be called for a multixact that was just created, and - * so is on our local cache, so the GetMembers call is fast. - */ -static void -GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, - uint16 *new_infomask2) -{ - int nmembers; - MultiXactMember *members; - int i; - uint16 bits = HEAP_XMAX_IS_MULTI; - uint16 bits2 = 0; - bool has_update = false; - LockTupleMode strongest = LockTupleKeyShare; - - /* - * We only use this in multis we just created, so they cannot be values - * pre-pg_upgrade. - */ - nmembers = GetMultiXactIdMembers(multi, &members, false, false); - - for (i = 0; i < nmembers; i++) - { - LockTupleMode mode; - - /* - * Remember the strongest lock mode held by any member of the - * multixact. 
- */ - mode = TUPLOCK_from_mxstatus(members[i].status); - if (mode > strongest) - strongest = mode; - - /* See what other bits we need */ - switch (members[i].status) - { - case MultiXactStatusForKeyShare: - case MultiXactStatusForShare: - case MultiXactStatusForNoKeyUpdate: - break; - - case MultiXactStatusForUpdate: - bits2 |= HEAP_KEYS_UPDATED; - break; - - case MultiXactStatusNoKeyUpdate: - has_update = true; - break; - - case MultiXactStatusUpdate: - bits2 |= HEAP_KEYS_UPDATED; - has_update = true; - break; - } - } - - if (strongest == LockTupleExclusive || - strongest == LockTupleNoKeyExclusive) - bits |= HEAP_XMAX_EXCL_LOCK; - else if (strongest == LockTupleShare) - bits |= HEAP_XMAX_SHR_LOCK; - else if (strongest == LockTupleKeyShare) - bits |= HEAP_XMAX_KEYSHR_LOCK; - - if (!has_update) - bits |= HEAP_XMAX_LOCK_ONLY; - - if (nmembers > 0) - pfree(members); - - *new_infomask = bits; - *new_infomask2 = bits2; -} - -/* - * MultiXactIdGetUpdateXid - * - * Given a multixact Xmax and corresponding infomask, which does not have the - * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating - * transaction. - * - * Caller is expected to check the status of the updating transaction, if - * necessary. - */ -static TransactionId -MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask) -{ - TransactionId update_xact = InvalidTransactionId; - MultiXactMember *members; - int nmembers; - - Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY)); - Assert(t_infomask & HEAP_XMAX_IS_MULTI); - - /* - * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from - * pre-pg_upgrade. - */ - nmembers = GetMultiXactIdMembers(xmax, &members, false, false); - - if (nmembers > 0) - { - int i; - - for (i = 0; i < nmembers; i++) - { - /* Ignore lockers */ - if (!ISUPDATE_from_mxstatus(members[i].status)) - continue; - - /* there can be at most one updater */ - Assert(update_xact == InvalidTransactionId); - update_xact = members[i].xid; -#ifndef USE_ASSERT_CHECKING - - /* - * in an assert-enabled build, walk the whole array to ensure - * there's no other updater. - */ - break; -#endif - } - - pfree(members); - } - - return update_xact; -} - -/* - * HeapTupleGetUpdateXid - * As above, but use a HeapTupleHeader - * - * See also HeapTupleHeaderGetUpdateXid, which can be used without previously - * checking the hint bits. - */ -TransactionId -HeapTupleGetUpdateXid(HeapTupleHeader tuple) -{ - return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple), - tuple->t_infomask); -} - -/* - * Does the given multixact conflict with the current transaction grabbing a - * tuple lock of the given strength? - * - * The passed infomask pairs up with the given multixact in the tuple header. - * - * If current_is_member is not NULL, it is set to 'true' if the current - * transaction is a member of the given multixact. 
- */ -static bool -DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, - LockTupleMode lockmode, bool *current_is_member) -{ - int nmembers; - MultiXactMember *members; - bool result = false; - LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock; - - if (HEAP_LOCKED_UPGRADED(infomask)) - return false; - - nmembers = GetMultiXactIdMembers(multi, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(infomask)); - if (nmembers >= 0) - { - int i; - - for (i = 0; i < nmembers; i++) - { - TransactionId memxid; - LOCKMODE memlockmode; - - if (result && (current_is_member == NULL || *current_is_member)) - break; - - memlockmode = LOCKMODE_from_mxstatus(members[i].status); - - /* ignore members from current xact (but track their presence) */ - memxid = members[i].xid; - if (TransactionIdIsCurrentTransactionId(memxid)) - { - if (current_is_member != NULL) - *current_is_member = true; - continue; - } - else if (result) - continue; - - /* ignore members that don't conflict with the lock we want */ - if (!DoLockModesConflict(memlockmode, wanted)) - continue; - - if (ISUPDATE_from_mxstatus(members[i].status)) - { - /* ignore aborted updaters */ - if (TransactionIdDidAbort(memxid)) - continue; - } - else - { - /* ignore lockers-only that are no longer in progress */ - if (!TransactionIdIsInProgress(memxid)) - continue; - } - - /* - * Whatever remains are either live lockers that conflict with our - * wanted lock, and updaters that are not aborted. Those conflict - * with what we want. Set up to return true, but keep going to - * look for the current transaction among the multixact members, - * if needed. - */ - result = true; - } - pfree(members); - } - - return result; -} - -/* - * Do_MultiXactIdWait - * Actual implementation for the two functions below. - * - * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is - * needed to ensure we only sleep on conflicting members, and the infomask is - * used to optimize multixact access in case it's a lock-only multi); 'nowait' - * indicates whether to use conditional lock acquisition, to allow callers to - * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up - * context information for error messages. 'remaining', if not NULL, receives - * the number of members that are still running, including any (non-aborted) - * subtransactions of our own transaction. - * - * We do this by sleeping on each member using XactLockTableWait. Any - * members that belong to the current backend are *not* waited for, however; - * this would not merely be useless but would lead to Assert failure inside - * XactLockTableWait. By the time this returns, it is certain that all - * transactions *of other backends* that were members of the MultiXactId - * that conflict with the requested status are dead (and no new ones can have - * been added, since it is not legal to add members to an existing - * MultiXactId). - * - * But by the time we finish sleeping, someone else may have changed the Xmax - * of the containing tuple, so the caller needs to iterate on us somehow. - * - * Note that in case we return false, the number of remaining members is - * not to be trusted. - */ -static bool -Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, - uint16 infomask, bool nowait, - Relation rel, ItemPointer ctid, XLTW_Oper oper, - int *remaining) -{ - bool result = true; - MultiXactMember *members; - int nmembers; - int remain = 0; - - /* for pre-pg_upgrade tuples, no need to sleep at all */ - nmembers = HEAP_LOCKED_UPGRADED(infomask) ? 
-1 : - GetMultiXactIdMembers(multi, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(infomask)); - - if (nmembers >= 0) - { - int i; - - for (i = 0; i < nmembers; i++) - { - TransactionId memxid = members[i].xid; - MultiXactStatus memstatus = members[i].status; - - if (TransactionIdIsCurrentTransactionId(memxid)) - { - remain++; - continue; - } - - if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus), - LOCKMODE_from_mxstatus(status))) - { - if (remaining && TransactionIdIsInProgress(memxid)) - remain++; - continue; - } - - /* - * This member conflicts with our multi, so we have to sleep (or - * return failure, if asked to avoid waiting.) - * - * Note that we don't set up an error context callback ourselves, - * but instead we pass the info down to XactLockTableWait. This - * might seem a bit wasteful because the context is set up and - * tore down for each member of the multixact, but in reality it - * should be barely noticeable, and it avoids duplicate code. - */ - if (nowait) - { - result = ConditionalXactLockTableWait(memxid); - if (!result) - break; - } - else - XactLockTableWait(memxid, rel, ctid, oper); - } - - pfree(members); - } - - if (remaining) - *remaining = remain; - - return result; -} - -/* - * MultiXactIdWait - * Sleep on a MultiXactId. - * - * By the time we finish sleeping, someone else may have changed the Xmax - * of the containing tuple, so the caller needs to iterate on us somehow. - * - * We return (in *remaining, if not NULL) the number of members that are still - * running, including any (non-aborted) subtransactions of our own transaction. - */ -static void -MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, - Relation rel, ItemPointer ctid, XLTW_Oper oper, - int *remaining) -{ - (void) Do_MultiXactIdWait(multi, status, infomask, false, - rel, ctid, oper, remaining); -} - -/* - * ConditionalMultiXactIdWait - * As above, but only lock if we can get the lock without blocking. - * - * By the time we finish sleeping, someone else may have changed the Xmax - * of the containing tuple, so the caller needs to iterate on us somehow. - * - * If the multixact is now all gone, return true. Returns false if some - * transactions might still be running. - * - * We return (in *remaining, if not NULL) the number of members that are still - * running, including any (non-aborted) subtransactions of our own transaction. - */ -static bool -ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, - uint16 infomask, Relation rel, int *remaining) -{ - return Do_MultiXactIdWait(multi, status, infomask, true, - rel, NULL, XLTW_None, remaining); -} - -/* - * tdeheap_tuple_needs_eventual_freeze - * - * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) - * will eventually require freezing (if tuple isn't removed by pruning first). - */ -bool -tdeheap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) -{ - TransactionId xid; - - /* - * If xmin is a normal transaction ID, this tuple is definitely not - * frozen. - */ - xid = HeapTupleHeaderGetXmin(tuple); - if (TransactionIdIsNormal(xid)) - return true; - - /* - * If xmax is a valid xact or multixact, this tuple is also not frozen. 
- */ - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - MultiXactId multi; - - multi = HeapTupleHeaderGetRawXmax(tuple); - if (MultiXactIdIsValid(multi)) - return true; - } - else - { - xid = HeapTupleHeaderGetRawXmax(tuple); - if (TransactionIdIsNormal(xid)) - return true; - } - - if (tuple->t_infomask & HEAP_MOVED) - { - xid = HeapTupleHeaderGetXvac(tuple); - if (TransactionIdIsNormal(xid)) - return true; - } - - return false; -} - -/* - * tdeheap_tuple_should_freeze - * - * Return value indicates if tdeheap_prepare_freeze_tuple sibling function would - * (or should) force freezing of the heap page that contains caller's tuple. - * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing. - * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs. - * - * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output - * arguments help VACUUM track the oldest extant XID/MXID remaining in rel. - * Our working assumption is that caller won't decide to freeze this tuple. - * It's up to caller to only ratchet back its own top-level trackers after the - * point that it fully commits to not freezing the tuple/page in question. - */ -bool -tdeheap_tuple_should_freeze(HeapTupleHeader tuple, - const struct VacuumCutoffs *cutoffs, - TransactionId *NoFreezePageRelfrozenXid, - MultiXactId *NoFreezePageRelminMxid) -{ - TransactionId xid; - MultiXactId multi; - bool freeze = false; - - /* First deal with xmin */ - xid = HeapTupleHeaderGetXmin(tuple); - if (TransactionIdIsNormal(xid)) - { - Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); - if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) - *NoFreezePageRelfrozenXid = xid; - if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) - freeze = true; - } - - /* Now deal with xmax */ - xid = InvalidTransactionId; - multi = InvalidMultiXactId; - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - multi = HeapTupleHeaderGetRawXmax(tuple); - else - xid = HeapTupleHeaderGetRawXmax(tuple); - - if (TransactionIdIsNormal(xid)) - { - Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); - /* xmax is a non-permanent XID */ - if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) - *NoFreezePageRelfrozenXid = xid; - if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) - freeze = true; - } - else if (!MultiXactIdIsValid(multi)) - { - /* xmax is a permanent XID or invalid MultiXactId/XID */ - } - else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask)) - { - /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */ - if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid)) - *NoFreezePageRelminMxid = multi; - /* tdeheap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */ - freeze = true; - } - else - { - /* xmax is a MultiXactId that may have an updater XID */ - MultiXactMember *members; - int nmembers; - - Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi)); - if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid)) - *NoFreezePageRelminMxid = multi; - if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff)) - freeze = true; - - /* need to check whether any member of the mxact is old */ - nmembers = GetMultiXactIdMembers(multi, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); - - for (int i = 0; i < nmembers; i++) - { - xid = members[i].xid; - Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); - if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) - *NoFreezePageRelfrozenXid = xid; - if (TransactionIdPrecedes(xid, 
cutoffs->FreezeLimit)) - freeze = true; - } - if (nmembers > 0) - pfree(members); - } - - if (tuple->t_infomask & HEAP_MOVED) - { - xid = HeapTupleHeaderGetXvac(tuple); - if (TransactionIdIsNormal(xid)) - { - Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); - if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) - *NoFreezePageRelfrozenXid = xid; - /* tdeheap_prepare_freeze_tuple forces xvac freezing */ - freeze = true; - } - } - - return freeze; -} - -/* - * Maintain snapshotConflictHorizon for caller by ratcheting forward its value - * using any committed XIDs contained in 'tuple', an obsolescent heap tuple - * that caller is in the process of physically removing, e.g. via HOT pruning - * or index deletion. - * - * Caller must initialize its value to InvalidTransactionId, which is - * generally interpreted as "definitely no need for a recovery conflict". - * Final value must reflect all heap tuples that caller will physically remove - * (or remove TID references to) via its ongoing pruning/deletion operation. - * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from - * caller's WAL record) by REDO routine when it replays caller's operation. - */ -void -HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, - TransactionId *snapshotConflictHorizon) -{ - TransactionId xmin = HeapTupleHeaderGetXmin(tuple); - TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (tuple->t_infomask & HEAP_MOVED) - { - if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac)) - *snapshotConflictHorizon = xvac; - } - - /* - * Ignore tuples inserted by an aborted transaction or if the tuple was - * updated/deleted by the inserting transaction. - * - * Look for a committed hint bit, or if no xmin bit is set, check clog. - */ - if (HeapTupleHeaderXminCommitted(tuple) || - (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin))) - { - if (xmax != xmin && - TransactionIdFollows(xmax, *snapshotConflictHorizon)) - *snapshotConflictHorizon = xmax; - } -} - -#ifdef USE_PREFETCH -/* - * Helper function for tdeheap_index_delete_tuples. Issues prefetch requests for - * prefetch_count buffers. The prefetch_state keeps track of all the buffers - * we can prefetch, and which have already been prefetched; each call to this - * function picks up where the previous call left off. - * - * Note: we expect the deltids array to be sorted in an order that groups TIDs - * by heap block, with all TIDs for each block appearing together in exactly - * one group. - */ -static void -index_delete_prefetch_buffer(Relation rel, - IndexDeletePrefetchState *prefetch_state, - int prefetch_count) -{ - BlockNumber cur_hblkno = prefetch_state->cur_hblkno; - int count = 0; - int i; - int ndeltids = prefetch_state->ndeltids; - TM_IndexDelete *deltids = prefetch_state->deltids; - - for (i = prefetch_state->next_item; - i < ndeltids && count < prefetch_count; - i++) - { - ItemPointer htid = &deltids[i].tid; - - if (cur_hblkno == InvalidBlockNumber || - ItemPointerGetBlockNumber(htid) != cur_hblkno) - { - cur_hblkno = ItemPointerGetBlockNumber(htid); - PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno); - count++; - } - } - - /* - * Save the prefetch position so that next time we can continue from that - * position. - */ - prefetch_state->next_item = i; - prefetch_state->cur_hblkno = cur_hblkno; -} -#endif - -/* - * Helper function for tdeheap_index_delete_tuples. 
Checks for index corruption - * involving an invalid TID in index AM caller's index page. - * - * This is an ideal place for these checks. The index AM must hold a buffer - * lock on the index page containing the TIDs we examine here, so we don't - * have to worry about concurrent VACUUMs at all. We can be sure that the - * index is corrupt when htid points directly to an LP_UNUSED item or - * heap-only tuple, which is not the case during standard index scans. - */ -static inline void -index_delete_check_htid(TM_IndexDeleteOp *delstate, - Page page, OffsetNumber maxoff, - ItemPointer htid, TM_IndexStatus *istatus) -{ - OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid); - ItemId iid; - - Assert(OffsetNumberIsValid(istatus->idxoffnum)); - - if (unlikely(indexpagehoffnum > maxoff)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"", - ItemPointerGetBlockNumber(htid), - indexpagehoffnum, - istatus->idxoffnum, delstate->iblknum, - RelationGetRelationName(delstate->irel)))); - - iid = PageGetItemId(page, indexpagehoffnum); - if (unlikely(!ItemIdIsUsed(iid))) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"", - ItemPointerGetBlockNumber(htid), - indexpagehoffnum, - istatus->idxoffnum, delstate->iblknum, - RelationGetRelationName(delstate->irel)))); - - if (ItemIdHasStorage(iid)) - { - HeapTupleHeader htup; - - Assert(ItemIdIsNormal(iid)); - htup = (HeapTupleHeader) PageGetItem(page, iid); - - if (unlikely(HeapTupleHeaderIsHeapOnly(htup))) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"", - ItemPointerGetBlockNumber(htid), - indexpagehoffnum, - istatus->idxoffnum, delstate->iblknum, - RelationGetRelationName(delstate->irel)))); - } -} - -/* - * heapam implementation of tableam's index_delete_tuples interface. - * - * This helper function is called by index AMs during index tuple deletion. - * See tableam header comments for an explanation of the interface implemented - * here and a general theory of operation. Note that each call here is either - * a simple index deletion call, or a bottom-up index deletion call. - * - * It's possible for this to generate a fair amount of I/O, since we may be - * deleting hundreds of tuples from a single index block. To amortize that - * cost to some degree, this uses prefetching and combines repeat accesses to - * the same heap block. 
- */ -TransactionId -tdeheap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) -{ - /* Initial assumption is that earlier pruning took care of conflict */ - TransactionId snapshotConflictHorizon = InvalidTransactionId; - BlockNumber blkno = InvalidBlockNumber; - Buffer buf = InvalidBuffer; - Page page = NULL; - OffsetNumber maxoff = InvalidOffsetNumber; - TransactionId priorXmax; -#ifdef USE_PREFETCH - IndexDeletePrefetchState prefetch_state; - int prefetch_distance; -#endif - SnapshotData SnapshotNonVacuumable; - int finalndeltids = 0, - nblocksaccessed = 0; - - /* State that's only used in bottom-up index deletion case */ - int nblocksfavorable = 0; - int curtargetfreespace = delstate->bottomupfreespace, - lastfreespace = 0, - actualfreespace = 0; - bool bottomup_final_block = false; - - InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel)); - - /* Sort caller's deltids array by TID for further processing */ - index_delete_sort(delstate); - - /* - * Bottom-up case: resort deltids array in an order attuned to where the - * greatest number of promising TIDs are to be found, and determine how - * many blocks from the start of sorted array should be considered - * favorable. This will also shrink the deltids array in order to - * eliminate completely unfavorable blocks up front. - */ - if (delstate->bottomup) - nblocksfavorable = bottomup_sort_and_shrink(delstate); - -#ifdef USE_PREFETCH - /* Initialize prefetch state. */ - prefetch_state.cur_hblkno = InvalidBlockNumber; - prefetch_state.next_item = 0; - prefetch_state.ndeltids = delstate->ndeltids; - prefetch_state.deltids = delstate->deltids; - - /* - * Determine the prefetch distance that we will attempt to maintain. - * - * Since the caller holds a buffer lock somewhere in rel, we'd better make - * sure that isn't a catalog relation before we call code that does - * syscache lookups, to avoid risk of deadlock. - */ - if (IsCatalogRelation(rel)) - prefetch_distance = maintenance_io_concurrency; - else - prefetch_distance = - get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); - - /* Cap initial prefetch distance for bottom-up deletion caller */ - if (delstate->bottomup) - { - Assert(nblocksfavorable >= 1); - Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS); - prefetch_distance = Min(prefetch_distance, nblocksfavorable); - } - - /* Start prefetching. */ - index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance); -#endif - - /* Iterate over deltids, determine which to delete, check their horizon */ - Assert(delstate->ndeltids > 0); - for (int i = 0; i < delstate->ndeltids; i++) - { - TM_IndexDelete *ideltid = &delstate->deltids[i]; - TM_IndexStatus *istatus = delstate->status + ideltid->id; - ItemPointer htid = &ideltid->tid; - OffsetNumber offnum; - - /* - * Read buffer, and perform required extra steps each time a new block - * is encountered. Avoid refetching if it's the same block as the one - * from the last htid. - */ - if (blkno == InvalidBlockNumber || - ItemPointerGetBlockNumber(htid) != blkno) - { - /* - * Consider giving up early for bottom-up index deletion caller - * first. (Only prefetch next-next block afterwards, when it - * becomes clear that we're at least going to access the next - * block in line.) - * - * Sometimes the first block frees so much space for bottom-up - * caller that the deletion process can end without accessing any - * more blocks. It is usually necessary to access 2 or 3 blocks - * per bottom-up deletion operation, though. 
- */ - if (delstate->bottomup) - { - /* - * We often allow caller to delete a few additional items - * whose entries we reached after the point that space target - * from caller was satisfied. The cost of accessing the page - * was already paid at that point, so it made sense to finish - * it off. When that happened, we finalize everything here - * (by finishing off the whole bottom-up deletion operation - * without needlessly paying the cost of accessing any more - * blocks). - */ - if (bottomup_final_block) - break; - - /* - * Give up when we didn't enable our caller to free any - * additional space as a result of processing the page that we - * just finished up with. This rule is the main way in which - * we keep the cost of bottom-up deletion under control. - */ - if (nblocksaccessed >= 1 && actualfreespace == lastfreespace) - break; - lastfreespace = actualfreespace; /* for next time */ - - /* - * Deletion operation (which is bottom-up) will definitely - * access the next block in line. Prepare for that now. - * - * Decay target free space so that we don't hang on for too - * long with a marginal case. (Space target is only truly - * helpful when it allows us to recognize that we don't need - * to access more than 1 or 2 blocks to satisfy caller due to - * agreeable workload characteristics.) - * - * We are a bit more patient when we encounter contiguous - * blocks, though: these are treated as favorable blocks. The - * decay process is only applied when the next block in line - * is not a favorable/contiguous block. This is not an - * exception to the general rule; we still insist on finding - * at least one deletable item per block accessed. See - * bottomup_nblocksfavorable() for full details of the theory - * behind favorable blocks and heap block locality in general. - * - * Note: The first block in line is always treated as a - * favorable block, so the earliest possible point that the - * decay can be applied is just before we access the second - * block in line. The Assert() verifies this for us. - */ - Assert(nblocksaccessed > 0 || nblocksfavorable > 0); - if (nblocksfavorable > 0) - nblocksfavorable--; - else - curtargetfreespace /= 2; - } - - /* release old buffer */ - if (BufferIsValid(buf)) - UnlockReleaseBuffer(buf); - - blkno = ItemPointerGetBlockNumber(htid); - buf = ReadBuffer(rel, blkno); - nblocksaccessed++; - Assert(!delstate->bottomup || - nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS); - -#ifdef USE_PREFETCH - - /* - * To maintain the prefetch distance, prefetch one more page for - * each page we read. - */ - index_delete_prefetch_buffer(rel, &prefetch_state, 1); -#endif - - LockBuffer(buf, BUFFER_LOCK_SHARE); - - page = BufferGetPage(buf); - maxoff = PageGetMaxOffsetNumber(page); - } - - /* - * In passing, detect index corruption involving an index page with a - * TID that points to a location in the heap that couldn't possibly be - * correct. We only do this with actual TIDs from caller's index page - * (not items reached by traversing through a HOT chain). - */ - index_delete_check_htid(delstate, page, maxoff, htid, istatus); - - if (istatus->knowndeletable) - Assert(!delstate->bottomup && !istatus->promising); - else - { - ItemPointerData tmp = *htid; - HeapTupleData heapTuple; - - /* Are any tuples from this HOT chain non-vacuumable? 
*/ - if (tdeheap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable, - &heapTuple, NULL, true)) - continue; /* can't delete entry */ - - /* Caller will delete, since whole HOT chain is vacuumable */ - istatus->knowndeletable = true; - - /* Maintain index free space info for bottom-up deletion case */ - if (delstate->bottomup) - { - Assert(istatus->freespace > 0); - actualfreespace += istatus->freespace; - if (actualfreespace >= curtargetfreespace) - bottomup_final_block = true; - } - } - - /* - * Maintain snapshotConflictHorizon value for deletion operation as a - * whole by advancing current value using heap tuple headers. This is - * loosely based on the logic for pruning a HOT chain. - */ - offnum = ItemPointerGetOffsetNumber(htid); - priorXmax = InvalidTransactionId; /* cannot check first XMIN */ - for (;;) - { - ItemId lp; - HeapTupleHeader htup; - - /* Sanity check (pure paranoia) */ - if (offnum < FirstOffsetNumber) - break; - - /* - * An offset past the end of page's line pointer array is possible - * when the array was truncated - */ - if (offnum > maxoff) - break; - - lp = PageGetItemId(page, offnum); - if (ItemIdIsRedirected(lp)) - { - offnum = ItemIdGetRedirect(lp); - continue; - } - - /* - * We'll often encounter LP_DEAD line pointers (especially with an - * entry marked knowndeletable by our caller up front). No heap - * tuple headers get examined for an htid that leads us to an - * LP_DEAD item. This is okay because the earlier pruning - * operation that made the line pointer LP_DEAD in the first place - * must have considered the original tuple header as part of - * generating its own snapshotConflictHorizon value. - * - * Relying on XLOG_HEAP2_PRUNE records like this is the same - * strategy that index vacuuming uses in all cases. Index VACUUM - * WAL records don't even have a snapshotConflictHorizon field of - * their own for this reason. - */ - if (!ItemIdIsNormal(lp)) - break; - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - /* - * Check the tuple XMIN against prior XMAX, if any - */ - if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) - break; - - HeapTupleHeaderAdvanceConflictHorizon(htup, - &snapshotConflictHorizon); - - /* - * If the tuple is not HOT-updated, then we are at the end of this - * HOT-chain. No need to visit later tuples from the same update - * chain (they get their own index entries) -- just move on to - * next htid from index AM caller. - */ - if (!HeapTupleHeaderIsHotUpdated(htup)) - break; - - /* Advance to next HOT chain member */ - Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno); - offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); - } - - /* Enable further/final shrinking of deltids for caller */ - finalndeltids = i + 1; - } - - UnlockReleaseBuffer(buf); - - /* - * Shrink deltids array to exclude non-deletable entries at the end. This - * is not just a minor optimization. Final deltids array size might be - * zero for a bottom-up caller. Index AM is explicitly allowed to rely on - * ndeltids being zero in all cases with zero total deletable entries. 
- */ - Assert(finalndeltids > 0 || delstate->bottomup); - delstate->ndeltids = finalndeltids; - - return snapshotConflictHorizon; -} - -/* - * Specialized inlineable comparison function for index_delete_sort() - */ -static inline int -index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2) -{ - ItemPointer tid1 = &deltid1->tid; - ItemPointer tid2 = &deltid2->tid; - - { - BlockNumber blk1 = ItemPointerGetBlockNumber(tid1); - BlockNumber blk2 = ItemPointerGetBlockNumber(tid2); - - if (blk1 != blk2) - return (blk1 < blk2) ? -1 : 1; - } - { - OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1); - OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2); - - if (pos1 != pos2) - return (pos1 < pos2) ? -1 : 1; - } - - Assert(false); - - return 0; -} - -/* - * Sort deltids array from delstate by TID. This prepares it for further - * processing by tdeheap_index_delete_tuples(). - * - * This operation becomes a noticeable consumer of CPU cycles with some - * workloads, so we go to the trouble of specialization/micro optimization. - * We use shellsort for this because it's easy to specialize, compiles to - * relatively few instructions, and is adaptive to presorted inputs/subsets - * (which are typical here). - */ -static void -index_delete_sort(TM_IndexDeleteOp *delstate) -{ - TM_IndexDelete *deltids = delstate->deltids; - int ndeltids = delstate->ndeltids; - int low = 0; - - /* - * Shellsort gap sequence (taken from Sedgewick-Incerpi paper). - * - * This implementation is fast with array sizes up to ~4500. This covers - * all supported BLCKSZ values. - */ - const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1}; - - /* Think carefully before changing anything here -- keep swaps cheap */ - StaticAssertDecl(sizeof(TM_IndexDelete) <= 8, - "element size exceeds 8 bytes"); - - for (int g = 0; g < lengthof(gaps); g++) - { - for (int hi = gaps[g], i = low + hi; i < ndeltids; i++) - { - TM_IndexDelete d = deltids[i]; - int j = i; - - while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0) - { - deltids[j] = deltids[j - hi]; - j -= hi; - } - deltids[j] = d; - } - } -} - -/* - * Returns how many blocks should be considered favorable/contiguous for a - * bottom-up index deletion pass. This is a number of heap blocks that starts - * from and includes the first block in line. - * - * There is always at least one favorable block during bottom-up index - * deletion. In the worst case (i.e. with totally random heap blocks) the - * first block in line (the only favorable block) can be thought of as a - * degenerate array of contiguous blocks that consists of a single block. - * tdeheap_index_delete_tuples() will expect this. - * - * Caller passes blockgroups, a description of the final order that deltids - * will be sorted in for tdeheap_index_delete_tuples() bottom-up index deletion - * processing. Note that deltids need not actually be sorted just yet (caller - * only passes deltids to us so that we can interpret blockgroups). - * - * You might guess that the existence of contiguous blocks cannot matter much, - * since in general the main factor that determines which blocks we visit is - * the number of promising TIDs, which is a fixed hint from the index AM. - * We're not really targeting the general case, though -- the actual goal is - * to adapt our behavior to a wide variety of naturally occurring conditions. - * The effects of most of the heuristics we apply are only noticeable in the - * aggregate, over time and across many _related_ bottom-up index deletion - * passes. 
- * - * Deeming certain blocks favorable allows heapam to recognize and adapt to - * workloads where heap blocks visited during bottom-up index deletion can be - * accessed contiguously, in the sense that each newly visited block is the - * neighbor of the block that bottom-up deletion just finished processing (or - * close enough to it). It will likely be cheaper to access more favorable - * blocks sooner rather than later (e.g. in this pass, not across a series of - * related bottom-up passes). Either way it is probably only a matter of time - * (or a matter of further correlated version churn) before all blocks that - * appear together as a single large batch of favorable blocks get accessed by - * _some_ bottom-up pass. Large batches of favorable blocks tend to either - * appear almost constantly or not even once (it all depends on per-index - * workload characteristics). - * - * Note that the blockgroups sort order applies a power-of-two bucketing - * scheme that creates opportunities for contiguous groups of blocks to get - * batched together, at least with workloads that are naturally amenable to - * being driven by heap block locality. This doesn't just enhance the spatial - * locality of bottom-up heap block processing in the obvious way. It also - * enables temporal locality of access, since sorting by heap block number - * naturally tends to make the bottom-up processing order deterministic. - * - * Consider the following example to get a sense of how temporal locality - * might matter: There is a heap relation with several indexes, each of which - * is low to medium cardinality. It is subject to constant non-HOT updates. - * The updates are skewed (in one part of the primary key, perhaps). None of - * the indexes are logically modified by the UPDATE statements (if they were - * then bottom-up index deletion would not be triggered in the first place). - * Naturally, each new round of index tuples (for each heap tuple that gets a - * tdeheap_update() call) will have the same heap TID in each and every index. - * Since these indexes are low cardinality and never get logically modified, - * heapam processing during bottom-up deletion passes will access heap blocks - * in approximately sequential order. Temporal locality of access occurs due - * to bottom-up deletion passes behaving very similarly across each of the - * indexes at any given moment. This keeps the number of buffer misses needed - * to visit heap blocks to a minimum. - */ -static int -bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, - TM_IndexDelete *deltids) -{ - int64 lastblock = -1; - int nblocksfavorable = 0; - - Assert(nblockgroups >= 1); - Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS); - - /* - * We tolerate heap blocks that will be accessed only slightly out of - * physical order. Small blips occur when a pair of almost-contiguous - * blocks happen to fall into different buckets (perhaps due only to a - * small difference in npromisingtids that the bucketing scheme didn't - * quite manage to ignore). We effectively ignore these blips by applying - * a small tolerance. The precise tolerance we use is a little arbitrary, - * but it works well enough in practice. 
- */ - for (int b = 0; b < nblockgroups; b++) - { - IndexDeleteCounts *group = blockgroups + b; - TM_IndexDelete *firstdtid = deltids + group->ifirsttid; - BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid); - - if (lastblock != -1 && - ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS || - (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS)) - break; - - nblocksfavorable++; - lastblock = block; - } - - /* Always indicate that there is at least 1 favorable block */ - Assert(nblocksfavorable >= 1); - - return nblocksfavorable; -} - -/* - * qsort comparison function for bottomup_sort_and_shrink() - */ -static int -bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2) -{ - const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1; - const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2; - - /* - * Most significant field is npromisingtids (which we invert the order of - * so as to sort in desc order). - * - * Caller should have already normalized npromisingtids fields into - * power-of-two values (buckets). - */ - if (group1->npromisingtids > group2->npromisingtids) - return -1; - if (group1->npromisingtids < group2->npromisingtids) - return 1; - - /* - * Tiebreak: desc ntids sort order. - * - * We cannot expect power-of-two values for ntids fields. We should - * behave as if they were already rounded up for us instead. - */ - if (group1->ntids != group2->ntids) - { - uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids); - uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids); - - if (ntids1 > ntids2) - return -1; - if (ntids1 < ntids2) - return 1; - } - - /* - * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for - * block in deltids array) order. - * - * This is equivalent to sorting in ascending heap block number order - * (among otherwise equal subsets of the array). This approach allows us - * to avoid accessing the out-of-line TID. (We rely on the assumption - * that the deltids array was sorted in ascending heap TID order when - * these offsets to the first TID from each heap block group were formed.) - */ - if (group1->ifirsttid > group2->ifirsttid) - return 1; - if (group1->ifirsttid < group2->ifirsttid) - return -1; - - pg_unreachable(); - - return 0; -} - -/* - * tdeheap_index_delete_tuples() helper function for bottom-up deletion callers. - * - * Sorts deltids array in the order needed for useful processing by bottom-up - * deletion. The array should already be sorted in TID order when we're - * called. The sort process groups heap TIDs from deltids into heap block - * groupings. Earlier/more-promising groups/blocks are usually those that are - * known to have the most "promising" TIDs. - * - * Sets new size of deltids array (ndeltids) in state. deltids will only have - * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we - * return. This often means that deltids will be shrunk to a small fraction - * of its original size (we eliminate many heap blocks from consideration for - * caller up front). - * - * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable() - * for a definition and full details. 
- */ -static int -bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) -{ - IndexDeleteCounts *blockgroups; - TM_IndexDelete *reordereddeltids; - BlockNumber curblock = InvalidBlockNumber; - int nblockgroups = 0; - int ncopied = 0; - int nblocksfavorable = 0; - - Assert(delstate->bottomup); - Assert(delstate->ndeltids > 0); - - /* Calculate per-heap-block count of TIDs */ - blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids); - for (int i = 0; i < delstate->ndeltids; i++) - { - TM_IndexDelete *ideltid = &delstate->deltids[i]; - TM_IndexStatus *istatus = delstate->status + ideltid->id; - ItemPointer htid = &ideltid->tid; - bool promising = istatus->promising; - - if (curblock != ItemPointerGetBlockNumber(htid)) - { - /* New block group */ - nblockgroups++; - - Assert(curblock < ItemPointerGetBlockNumber(htid) || - !BlockNumberIsValid(curblock)); - - curblock = ItemPointerGetBlockNumber(htid); - blockgroups[nblockgroups - 1].ifirsttid = i; - blockgroups[nblockgroups - 1].ntids = 1; - blockgroups[nblockgroups - 1].npromisingtids = 0; - } - else - { - blockgroups[nblockgroups - 1].ntids++; - } - - if (promising) - blockgroups[nblockgroups - 1].npromisingtids++; - } - - /* - * We're about ready to sort block groups to determine the optimal order - * for visiting heap blocks. But before we do, round the number of - * promising tuples for each block group up to the next power-of-two, - * unless it is very low (less than 4), in which case we round up to 4. - * npromisingtids is far too noisy to trust when choosing between a pair - * of block groups that both have very low values. - * - * This scheme divides heap blocks/block groups into buckets. Each bucket - * contains blocks that have _approximately_ the same number of promising - * TIDs as each other. The goal is to ignore relatively small differences - * in the total number of promising entries, so that the whole process can - * give a little weight to heapam factors (like heap block locality) - * instead. This isn't a trade-off, really -- we have nothing to lose. It - * would be foolish to interpret small differences in npromisingtids - * values as anything more than noise. - * - * We tiebreak on nhtids when sorting block group subsets that have the - * same npromisingtids, but this has the same issues as npromisingtids, - * and so nhtids is subject to the same power-of-two bucketing scheme. The - * only reason that we don't fix nhtids in the same way here too is that - * we'll need accurate nhtids values after the sort. We handle nhtids - * bucketization dynamically instead (in the sort comparator). - * - * See bottomup_nblocksfavorable() for a full explanation of when and how - * heap locality/favorable blocks can significantly influence when and how - * heap blocks are accessed. 
- */ - for (int b = 0; b < nblockgroups; b++) - { - IndexDeleteCounts *group = blockgroups + b; - - /* Better off falling back on nhtids with low npromisingtids */ - if (group->npromisingtids <= 4) - group->npromisingtids = 4; - else - group->npromisingtids = - pg_nextpower2_32((uint32) group->npromisingtids); - } - - /* Sort groups and rearrange caller's deltids array */ - qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts), - bottomup_sort_and_shrink_cmp); - reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete)); - - nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups); - /* Determine number of favorable blocks at the start of final deltids */ - nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups, - delstate->deltids); - - for (int b = 0; b < nblockgroups; b++) - { - IndexDeleteCounts *group = blockgroups + b; - TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid; - - memcpy(reordereddeltids + ncopied, firstdtid, - sizeof(TM_IndexDelete) * group->ntids); - ncopied += group->ntids; - } - - /* Copy final grouped and sorted TIDs back into start of caller's array */ - memcpy(delstate->deltids, reordereddeltids, - sizeof(TM_IndexDelete) * ncopied); - delstate->ndeltids = ncopied; - - pfree(reordereddeltids); - pfree(blockgroups); - - return nblocksfavorable; -} - -/* - * Perform XLogInsert for a heap-visible operation. 'block' is the block - * being marked all-visible, and vm_buffer is the buffer containing the - * corresponding visibility map block. Both should have already been modified - * and dirtied. - * - * snapshotConflictHorizon comes from the largest xmin on the page being - * marked all-visible. REDO routine uses it to generate recovery conflicts. - * - * If checksums or wal_log_hints are enabled, we may also generate a full-page - * image of tdeheap_buffer. Otherwise, we optimize away the FPI (by specifying - * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not* - * update the heap page's LSN. - */ -XLogRecPtr -log_tdeheap_visible(Relation rel, Buffer tdeheap_buffer, Buffer vm_buffer, - TransactionId snapshotConflictHorizon, uint8 vmflags) -{ - xl_tdeheap_visible xlrec; - XLogRecPtr recptr; - uint8 flags; - - Assert(BufferIsValid (tdeheap_buffer)); - Assert(BufferIsValid(vm_buffer)); - - xlrec.snapshotConflictHorizon = snapshotConflictHorizon; - xlrec.flags = vmflags; - if (RelationIsAccessibleInLogicalDecoding(rel)) - xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL; - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapVisible); - - XLogRegisterBuffer(0, vm_buffer, 0); - - flags = REGBUF_STANDARD; - if (!XLogHintBitIsNeeded()) - flags |= REGBUF_NO_IMAGE; - XLogRegisterBuffer(1, tdeheap_buffer, flags); - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); - - return recptr; -} - -/* - * Perform XLogInsert for a heap-update operation. Caller must already - * have modified the buffer(s) and marked them dirty. 
- */ -static XLogRecPtr -log_tdeheap_update(Relation reln, Buffer oldbuf, - Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, - HeapTuple old_key_tuple, - bool all_visible_cleared, bool new_all_visible_cleared) -{ - xl_tdeheap_update xlrec; - xl_tdeheap_header xlhdr; - xl_tdeheap_header xlhdr_idx; - uint8 info; - uint16 prefix_suffix[2]; - uint16 prefixlen = 0, - suffixlen = 0; - XLogRecPtr recptr; - Page page = BufferGetPage(newbuf); - PageHeader phdr = (PageHeader) page; - bool need_tuple_data = RelationIsLogicallyLogged(reln); - bool init; - int bufflags; - - /* Caller should not call me on a non-WAL-logged relation */ - Assert(RelationNeedsWAL(reln)); - - XLogBeginInsert(); - - if (HeapTupleIsHeapOnly(newtup)) - info = XLOG_HEAP_HOT_UPDATE; - else - info = XLOG_HEAP_UPDATE; - - /* - * If the old and new tuple are on the same page, we only need to log the - * parts of the new tuple that were changed. That saves on the amount of - * WAL we need to write. Currently, we just count any unchanged bytes in - * the beginning and end of the tuple. That's quick to check, and - * perfectly covers the common case that only one field is updated. - * - * We could do this even if the old and new tuple are on different pages, - * but only if we don't make a full-page image of the old page, which is - * difficult to know in advance. Also, if the old tuple is corrupt for - * some reason, it would allow the corruption to propagate the new page, - * so it seems best to avoid. Under the general assumption that most - * updates tend to create the new tuple version on the same page, there - * isn't much to be gained by doing this across pages anyway. - * - * Skip this if we're taking a full-page image of the new page, as we - * don't include the new tuple in the WAL record in that case. Also - * disable if wal_level='logical', as logical decoding needs to be able to - * read the new tuple in whole from the WAL record alone. - */ - if (oldbuf == newbuf && !need_tuple_data && - !XLogCheckBufferNeedsBackup(newbuf)) - { - char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff; - char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff; - int oldlen = oldtup->t_len - oldtup->t_data->t_hoff; - int newlen = newtup->t_len - newtup->t_data->t_hoff; - - /* Check for common prefix between old and new tuple */ - for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++) - { - if (newp[prefixlen] != oldp[prefixlen]) - break; - } - - /* - * Storing the length of the prefix takes 2 bytes, so we need to save - * at least 3 bytes or there's no point. - */ - if (prefixlen < 3) - prefixlen = 0; - - /* Same for suffix */ - for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++) - { - if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1]) - break; - } - if (suffixlen < 3) - suffixlen = 0; - } - - /* Prepare main WAL data chain */ - xlrec.flags = 0; - if (all_visible_cleared) - xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED; - if (new_all_visible_cleared) - xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED; - if (prefixlen > 0) - xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD; - if (suffixlen > 0) - xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD; - if (need_tuple_data) - { - xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE; - if (old_key_tuple) - { - if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL) - xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE; - else - xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; - } - } - - /* If new tuple is the single and first tuple on page... 
*/ - if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && - PageGetMaxOffsetNumber(page) == FirstOffsetNumber) - { - info |= XLOG_HEAP_INIT_PAGE; - init = true; - } - else - init = false; - - /* Prepare WAL data for the old page */ - xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self); - xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); - xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, - oldtup->t_data->t_infomask2); - - /* Prepare WAL data for the new page */ - xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); - xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); - - bufflags = REGBUF_STANDARD; - if (init) - bufflags |= REGBUF_WILL_INIT; - if (need_tuple_data) - bufflags |= REGBUF_KEEP_DATA; - - XLogRegisterBuffer(0, newbuf, bufflags); - if (oldbuf != newbuf) - XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); - - XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate); - - /* - * Prepare WAL data for the new tuple. - */ - if (prefixlen > 0 || suffixlen > 0) - { - if (prefixlen > 0 && suffixlen > 0) - { - prefix_suffix[0] = prefixlen; - prefix_suffix[1] = suffixlen; - XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2); - } - else if (prefixlen > 0) - { - XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16)); - } - else - { - XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16)); - } - } - - xlhdr.t_infomask2 = newtup->t_data->t_infomask2; - xlhdr.t_infomask = newtup->t_data->t_infomask; - xlhdr.t_hoff = newtup->t_data->t_hoff; - Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); - - /* - * PG73FORMAT: write bitmap [+ padding] [+ oid] + data - * - * The 'data' doesn't include the common prefix or suffix. - */ - /* We write an encrypted newtuple data from the buffer */ - XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); - if (prefixlen == 0) - { - XLogRegisterBufData(0, - ((char *) phdr) + phdr->pd_upper + SizeofHeapTupleHeader, - newtup->t_len - SizeofHeapTupleHeader - suffixlen); - } - else - { - /* - * Have to write the null bitmap and data after the common prefix as - * two separate rdata entries. - */ - /* bitmap [+ padding] [+ oid] */ - if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0) - { - XLogRegisterBufData(0, - ((char *) phdr) + phdr->pd_upper + SizeofHeapTupleHeader, - newtup->t_data->t_hoff - SizeofHeapTupleHeader); - } - - /* data after common prefix */ - XLogRegisterBufData(0, - ((char *) phdr) + phdr->pd_upper + newtup->t_data->t_hoff + prefixlen, - newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen); - } - - /* We need to log a tuple identity */ - if (need_tuple_data && old_key_tuple) - { - /* don't really need this, but its more comfy to decode */ - xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; - xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; - xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; - - XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); - - /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ - XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, - old_key_tuple->t_len - SizeofHeapTupleHeader); - } - - /* filtering by origin on a row level is much more efficient */ - XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - - recptr = XLogInsert(RM_HEAP_ID, info); - - return recptr; -} - -/* - * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record - * - * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog - * tuples. 
- */ -static XLogRecPtr -log_tdeheap_new_cid(Relation relation, HeapTuple tup) -{ - xl_tdeheap_new_cid xlrec; - - XLogRecPtr recptr; - HeapTupleHeader hdr = tup->t_data; - - Assert(ItemPointerIsValid(&tup->t_self)); - Assert(tup->t_tableOid != InvalidOid); - - xlrec.top_xid = GetTopTransactionId(); - xlrec.target_locator = relation->rd_locator; - xlrec.target_tid = tup->t_self; - - /* - * If the tuple got inserted & deleted in the same TX we definitely have a - * combo CID, set cmin and cmax. - */ - if (hdr->t_infomask & HEAP_COMBOCID) - { - Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID)); - Assert(!HeapTupleHeaderXminInvalid(hdr)); - xlrec.cmin = HeapTupleHeaderGetCmin(hdr); - xlrec.cmax = HeapTupleHeaderGetCmax(hdr); - xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr); - } - /* No combo CID, so only cmin or cmax can be set by this TX */ - else - { - /* - * Tuple inserted. - * - * We need to check for LOCK ONLY because multixacts might be - * transferred to the new tuple in case of FOR KEY SHARE updates in - * which case there will be an xmax, although the tuple just got - * inserted. - */ - if (hdr->t_infomask & HEAP_XMAX_INVALID || - HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask)) - { - xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr); - xlrec.cmax = InvalidCommandId; - } - /* Tuple from a different tx updated or deleted. */ - else - { - xlrec.cmin = InvalidCommandId; - xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr); - } - xlrec.combocid = InvalidCommandId; - } - - /* - * Note that we don't need to register the buffer here, because this - * operation does not modify the page. The insert/update/delete that - * called us certainly did, but that's WAL-logged separately. - */ - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid); - - /* will be looked at irrespective of origin */ - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID); - - return recptr; -} - -/* - * Build a heap tuple representing the configured REPLICA IDENTITY to represent - * the old tuple in an UPDATE or DELETE. - * - * Returns NULL if there's no need to log an identity or if there's no suitable - * key defined. - * - * Pass key_required true if any replica identity columns changed value, or if - * any of them have any external data. Delete must always pass true. - * - * *copy is set to true if the returned tuple is a modified copy rather than - * the same tuple that was passed in. - */ -static HeapTuple -ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, - bool *copy) -{ - TupleDesc desc = RelationGetDescr(relation); - char replident = relation->rd_rel->relreplident; - Bitmapset *idattrs; - HeapTuple key_tuple; - bool nulls[MaxHeapAttributeNumber]; - Datum values[MaxHeapAttributeNumber]; - - *copy = false; - - if (!RelationIsLogicallyLogged(relation)) - return NULL; - - if (replident == REPLICA_IDENTITY_NOTHING) - return NULL; - - if (replident == REPLICA_IDENTITY_FULL) - { - /* - * When logging the entire old tuple, it very well could contain - * toasted columns. If so, force them to be inlined. - */ - if (HeapTupleHasExternal(tp)) - { - *copy = true; - tp = toast_flatten_tuple(tp, desc); - } - return tp; - } - - /* if the key isn't required and we're only logging the key, we're done */ - if (!key_required) - return NULL; - - /* find out the replica identity columns */ - idattrs = RelationGetIndexAttrBitmap(relation, - INDEX_ATTR_BITMAP_IDENTITY_KEY); - - /* - * If there's no defined replica identity columns, treat as !key_required. 
- * (This case should not be reachable from tdeheap_update, since that should - * calculate key_required accurately. But tdeheap_delete just passes - * constant true for key_required, so we can hit this case in deletes.) - */ - if (bms_is_empty(idattrs)) - return NULL; - - /* - * Construct a new tuple containing only the replica identity columns, - * with nulls elsewhere. While we're at it, assert that the replica - * identity columns aren't null. - */ - tdeheap_deform_tuple(tp, desc, values, nulls); - - for (int i = 0; i < desc->natts; i++) - { - if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, - idattrs)) - Assert(!nulls[i]); - else - nulls[i] = true; - } - - key_tuple = tdeheap_form_tuple(desc, values, nulls); - *copy = true; - - bms_free(idattrs); - - /* - * If the tuple, which by here only contains indexed columns, still has - * toasted columns, force them to be inlined. This is somewhat unlikely - * since there's limits on the size of indexed columns, so we don't - * duplicate toast_flatten_tuple()s functionality in the above loop over - * the indexed columns, even if it would be more efficient. - */ - if (HeapTupleHasExternal(key_tuple)) - { - HeapTuple oldtup = key_tuple; - - key_tuple = toast_flatten_tuple(oldtup, desc); - tdeheap_freetuple(oldtup); - } - - return key_tuple; -} - -/* - * Handles XLOG_HEAP2_PRUNE record type. - * - * Acquires a full cleanup lock. - */ -static void -tdeheap_xlog_prune(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_prune *xlrec = (xl_tdeheap_prune *) XLogRecGetData(record); - Buffer buffer; - RelFileLocator rlocator; - BlockNumber blkno; - XLogRedoAction action; - - XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); - - /* - * We're about to remove tuples. In Hot Standby mode, ensure that there's - * no queries running for which the removed tuples are still visible. - */ - if (InHotStandby) - ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, - xlrec->isCatalogRel, - rlocator); - - /* - * If we have a full-page image, restore it (using a cleanup lock) and - * we're done. - */ - action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, - &buffer); - if (action == BLK_NEEDS_REDO) - { - Page page = (Page) BufferGetPage(buffer); - OffsetNumber *end; - OffsetNumber *redirected; - OffsetNumber *nowdead; - OffsetNumber *nowunused; - int nredirected; - int ndead; - int nunused; - Size datalen; - Relation reln; - - redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); - - nredirected = xlrec->nredirected; - ndead = xlrec->ndead; - end = (OffsetNumber *) ((char *) redirected + datalen); - nowdead = redirected + (nredirected * 2); - nowunused = nowdead + ndead; - nunused = (end - nowunused); - Assert(nunused >= 0); - - /* Update all line pointers per the record, and repair fragmentation */ - reln = CreateFakeRelcacheEntry(rlocator); - tdeheap_page_prune_execute(reln, buffer, - redirected, nredirected, - nowdead, ndead, - nowunused, nunused); - - /* - * Note: we don't worry about updating the page's prunability hints. - * At worst this will cause an extra prune cycle to occur soon. 
- */ - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - - if (BufferIsValid(buffer)) - { - Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); - - UnlockReleaseBuffer(buffer); - - /* - * After pruning records from a page, it's useful to update the FSM - * about it, as it may cause the page become target for insertions - * later even if vacuum decides not to visit it (which is possible if - * gets marked all-visible.) - * - * Do this regardless of a full-page image being applied, since the - * FSM data is not in the page anyway. - */ - XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); - } -} - -/* - * Handles XLOG_HEAP2_VACUUM record type. - * - * Acquires an ordinary exclusive lock only. - */ -static void -tdeheap_xlog_vacuum(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_vacuum *xlrec = (xl_tdeheap_vacuum *) XLogRecGetData(record); - Buffer buffer; - BlockNumber blkno; - XLogRedoAction action; - - /* - * If we have a full-page image, restore it (without using a cleanup lock) - * and we're done. - */ - action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false, - &buffer); - if (action == BLK_NEEDS_REDO) - { - Page page = (Page) BufferGetPage(buffer); - OffsetNumber *nowunused; - Size datalen; - OffsetNumber *offnum; - - nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); - - /* Shouldn't be a record unless there's something to do */ - Assert(xlrec->nunused > 0); - - /* Update all now-unused line pointers */ - offnum = nowunused; - for (int i = 0; i < xlrec->nunused; i++) - { - OffsetNumber off = *offnum++; - ItemId lp = PageGetItemId(page, off); - - Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp)); - ItemIdSetUnused(lp); - } - - /* Attempt to truncate line pointer array now */ - PageTruncateLinePointerArray(page); - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - - if (BufferIsValid(buffer)) - { - Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); - RelFileLocator rlocator; - - XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); - - UnlockReleaseBuffer(buffer); - - /* - * After vacuuming LP_DEAD items from a page, it's useful to update - * the FSM about it, as it may cause the page become target for - * insertions later even if vacuum decides not to visit it (which is - * possible if gets marked all-visible.) - * - * Do this regardless of a full-page image being applied, since the - * FSM data is not in the page anyway. - */ - XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); - } -} - -/* - * Replay XLOG_HEAP2_VISIBLE record. - * - * The critical integrity requirement here is that we must never end up with - * a situation where the visibility map bit is set, and the page-level - * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent - * page modification would fail to clear the visibility map bit. 
- */ -static void -tdeheap_xlog_visible(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_visible *xlrec = (xl_tdeheap_visible *) XLogRecGetData(record); - Buffer vmbuffer = InvalidBuffer; - Buffer buffer; - Page page; - RelFileLocator rlocator; - BlockNumber blkno; - XLogRedoAction action; - - Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags); - - XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno); - - /* - * If there are any Hot Standby transactions running that have an xmin - * horizon old enough that this page isn't all-visible for them, they - * might incorrectly decide that an index-only scan can skip a heap fetch. - * - * NB: It might be better to throw some kind of "soft" conflict here that - * forces any index-only scan that is in flight to perform heap fetches, - * rather than killing the transaction outright. - */ - if (InHotStandby) - ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, - xlrec->flags & VISIBILITYMAP_XLOG_CATALOG_REL, - rlocator); - - /* - * Read the heap page, if it still exists. If the heap file has dropped or - * truncated later in recovery, we don't need to update the page, but we'd - * better still update the visibility map. - */ - action = XLogReadBufferForRedo(record, 1, &buffer); - if (action == BLK_NEEDS_REDO) - { - /* - * We don't bump the LSN of the heap page when setting the visibility - * map bit (unless checksums or wal_hint_bits is enabled, in which - * case we must). This exposes us to torn page hazards, but since - * we're not inspecting the existing page contents in any way, we - * don't care. - */ - page = BufferGetPage(buffer); - - PageSetAllVisible(page); - - if (XLogHintBitIsNeeded()) - PageSetLSN(page, lsn); - - MarkBufferDirty(buffer); - } - else if (action == BLK_RESTORED) - { - /* - * If heap block was backed up, we already restored it and there's - * nothing more to do. (This can only happen with checksums or - * wal_log_hints enabled.) - */ - } - - if (BufferIsValid(buffer)) - { - Size space = PageGetFreeSpace(BufferGetPage(buffer)); - - UnlockReleaseBuffer(buffer); - - /* - * Since FSM is not WAL-logged and only updated heuristically, it - * easily becomes stale in standbys. If the standby is later promoted - * and runs VACUUM, it will skip updating individual free space - * figures for pages that became all-visible (or all-frozen, depending - * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum - * propagates too optimistic free space values to upper FSM layers; - * later inserters try to use such pages only to find out that they - * are unusable. This can cause long stalls when there are many such - * pages. - * - * Forestall those problems by updating FSM's idea about a page that - * is becoming all-visible or all-frozen. - * - * Do this regardless of a full-page image being applied, since the - * FSM data is not in the page anyway. - */ - if (xlrec->flags & VISIBILITYMAP_VALID_BITS) - XLogRecordPageWithFreeSpace(rlocator, blkno, space); - } - - /* - * Even if we skipped the heap page update due to the LSN interlock, it's - * still safe to update the visibility map. Any WAL record that clears - * the visibility map bit does so before checking the page LSN, so any - * bits that need to be cleared will still be cleared. 
- */ - if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false, - &vmbuffer) == BLK_NEEDS_REDO) - { - Page vmpage = BufferGetPage(vmbuffer); - Relation reln; - uint8 vmbits; - - /* initialize the page if it was read as zeros */ - if (PageIsNew(vmpage)) - PageInit(vmpage, BLCKSZ, 0); - - /* remove VISIBILITYMAP_XLOG_* */ - vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS; - - /* - * XLogReadBufferForRedoExtended locked the buffer. But - * tdeheap_visibilitymap_set will handle locking itself. - */ - LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); - - reln = CreateFakeRelcacheEntry(rlocator); - tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); - - tdeheap_visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, - xlrec->snapshotConflictHorizon, vmbits); - - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - else if (BufferIsValid(vmbuffer)) - UnlockReleaseBuffer(vmbuffer); -} - -/* - * Replay XLOG_HEAP2_FREEZE_PAGE records - */ -static void -tdeheap_xlog_freeze_page(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_freeze_page *xlrec = (xl_tdeheap_freeze_page *) XLogRecGetData(record); - Buffer buffer; - - /* - * In Hot Standby mode, ensure that there's no queries running which still - * consider the frozen xids as running. - */ - if (InHotStandby) - { - RelFileLocator rlocator; - - XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL); - ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, - xlrec->isCatalogRel, - rlocator); - } - - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) - { - Page page = BufferGetPage(buffer); - xl_tdeheap_freeze_plan *plans; - OffsetNumber *offsets; - int curoff = 0; - - plans = (xl_tdeheap_freeze_plan *) XLogRecGetBlockData(record, 0, NULL); - offsets = (OffsetNumber *) ((char *) plans + - (xlrec->nplans * - sizeof(xl_tdeheap_freeze_plan))); - for (int p = 0; p < xlrec->nplans; p++) - { - HeapTupleFreeze frz; - - /* - * Convert freeze plan representation from WAL record into - * per-tuple format used by tdeheap_execute_freeze_tuple - */ - frz.xmax = plans[p].xmax; - frz.t_infomask2 = plans[p].t_infomask2; - frz.t_infomask = plans[p].t_infomask; - frz.frzflags = plans[p].frzflags; - frz.offset = InvalidOffsetNumber; /* unused, but be tidy */ - - for (int i = 0; i < plans[p].ntuples; i++) - { - OffsetNumber offset = offsets[curoff++]; - ItemId lp; - HeapTupleHeader tuple; - - lp = PageGetItemId(page, offset); - tuple = (HeapTupleHeader) PageGetItem(page, lp); - tdeheap_execute_freeze_tuple(tuple, &frz); - } - } - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); -} - -/* - * Given an "infobits" field from an XLog record, set the correct bits in the - * given infomask and infomask2 for the tuple touched by the record. - * - * (This is the reverse of compute_infobits). 
- */ -static void -fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) -{ - *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | - HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); - *infomask2 &= ~HEAP_KEYS_UPDATED; - - if (infobits & XLHL_XMAX_IS_MULTI) - *infomask |= HEAP_XMAX_IS_MULTI; - if (infobits & XLHL_XMAX_LOCK_ONLY) - *infomask |= HEAP_XMAX_LOCK_ONLY; - if (infobits & XLHL_XMAX_EXCL_LOCK) - *infomask |= HEAP_XMAX_EXCL_LOCK; - /* note HEAP_XMAX_SHR_LOCK isn't considered here */ - if (infobits & XLHL_XMAX_KEYSHR_LOCK) - *infomask |= HEAP_XMAX_KEYSHR_LOCK; - - if (infobits & XLHL_KEYS_UPDATED) - *infomask2 |= HEAP_KEYS_UPDATED; -} - -static void -tdeheap_xlog_delete(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_delete *xlrec = (xl_tdeheap_delete *) XLogRecGetData(record); - Buffer buffer; - Page page; - ItemId lp = NULL; - HeapTupleHeader htup; - BlockNumber blkno; - RelFileLocator target_locator; - ItemPointerData target_tid; - - XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); - ItemPointerSetBlockNumber(&target_tid, blkno); - ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); - - /* - * The visibility map may need to be fixed even if the heap page is - * already up-to-date. - */ - if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) - { - Relation reln = CreateFakeRelcacheEntry(target_locator); - Buffer vmbuffer = InvalidBuffer; - - tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); - tdeheap_visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) - { - page = BufferGetPage(buffer); - - if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) - lp = PageGetItemId(page, xlrec->offnum); - - if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) - elog(PANIC, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; - HeapTupleHeaderClearHotUpdated(htup); - fix_infomask_from_infobits(xlrec->infobits_set, - &htup->t_infomask, &htup->t_infomask2); - if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) - HeapTupleHeaderSetXmax(htup, xlrec->xmax); - else - HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); - - /* Mark the page as a candidate for pruning */ - PageSetPrunable(page, XLogRecGetXid(record)); - - if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) - PageClearAllVisible(page); - - /* Make sure t_ctid is set correctly */ - if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE) - HeapTupleHeaderSetMovedPartitions(htup); - else - htup->t_ctid = target_tid; - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); -} - -static void -tdeheap_xlog_insert(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_insert *xlrec = (xl_tdeheap_insert *) XLogRecGetData(record); - Buffer buffer; - Page page; - union - { - HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; - } tbuf; - HeapTupleHeader htup; - xl_tdeheap_header xlhdr; - uint32 newlen; - Size freespace = 0; - RelFileLocator target_locator; - BlockNumber blkno; - ItemPointerData target_tid; - XLogRedoAction action; - - XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); - ItemPointerSetBlockNumber(&target_tid, blkno); - ItemPointerSetOffsetNumber(&target_tid, 
xlrec->offnum); - - /* - * The visibility map may need to be fixed even if the heap page is - * already up-to-date. - */ - if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) - { - Relation reln = CreateFakeRelcacheEntry(target_locator); - Buffer vmbuffer = InvalidBuffer; - - tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); - tdeheap_visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - - /* - * If we inserted the first and only tuple on the page, re-initialize the - * page from scratch. - */ - if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) - { - buffer = XLogInitBufferForRedo(record, 0); - page = BufferGetPage(buffer); - PageInit(page, BufferGetPageSize(buffer), 0); - action = BLK_NEEDS_REDO; - } - else - action = XLogReadBufferForRedo(record, 0, &buffer); - if (action == BLK_NEEDS_REDO) - { - Size datalen; - char *data; - - page = BufferGetPage(buffer); - - if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum) - elog(PANIC, "invalid max offset number"); - - data = XLogRecGetBlockData(record, 0, &datalen); - - newlen = datalen - SizeOfHeapHeader; - Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize); - memcpy((char *) &xlhdr, data, SizeOfHeapHeader); - data += SizeOfHeapHeader; - - htup = &tbuf.hdr; - MemSet((char *) htup, 0, SizeofHeapTupleHeader); - /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ - memcpy((char *) htup + SizeofHeapTupleHeader, - data, - newlen); - newlen += SizeofHeapTupleHeader; - htup->t_infomask2 = xlhdr.t_infomask2; - htup->t_infomask = xlhdr.t_infomask; - htup->t_hoff = xlhdr.t_hoff; - HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); - htup->t_ctid = target_tid; - - if (TDE_PageAddItem(target_locator, target_locator.spcOid, blkno, page, (Item) htup, newlen, xlrec->offnum, - true, true) == InvalidOffsetNumber) - elog(PANIC, "failed to add tuple"); - - freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ - - PageSetLSN(page, lsn); - - if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) - PageClearAllVisible(page); - - /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ - if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) - PageSetAllVisible(page); - - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); - - /* - * If the page is running low on free space, update the FSM as well. - * Arbitrarily, our definition of "low" is less than 20%. We can't do much - * better than that without knowing the fill-factor for the table. - * - * XXX: Don't do this if the page was restored from full page image. We - * don't bother to update the FSM in that case, it doesn't need to be - * totally accurate anyway. - */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(target_locator, blkno, freespace); -} - -/* - * Handles MULTI_INSERT record type. - */ -static void -tdeheap_xlog_multi_insert(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_multi_insert *xlrec; - RelFileLocator rlocator; - BlockNumber blkno; - Buffer buffer; - Page page; - union - { - HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; - } tbuf; - HeapTupleHeader htup; - uint32 newlen; - Size freespace = 0; - int i; - bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; - XLogRedoAction action; - - /* - * Insertion doesn't overwrite MVCC data, so no conflict processing is - * required. 
- */ - xlrec = (xl_tdeheap_multi_insert *) XLogRecGetData(record); - - XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); - - /* check that the mutually exclusive flags are not both set */ - Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) && - (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET))); - - /* - * The visibility map may need to be fixed even if the heap page is - * already up-to-date. - */ - if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) - { - Relation reln = CreateFakeRelcacheEntry(rlocator); - Buffer vmbuffer = InvalidBuffer; - - tdeheap_visibilitymap_pin(reln, blkno, &vmbuffer); - tdeheap_visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - - if (isinit) - { - buffer = XLogInitBufferForRedo(record, 0); - page = BufferGetPage(buffer); - PageInit(page, BufferGetPageSize(buffer), 0); - action = BLK_NEEDS_REDO; - } - else - action = XLogReadBufferForRedo(record, 0, &buffer); - if (action == BLK_NEEDS_REDO) - { - char *tupdata; - char *endptr; - Size len; - - /* Tuples are stored as block data */ - tupdata = XLogRecGetBlockData(record, 0, &len); - endptr = tupdata + len; - - page = (Page) BufferGetPage(buffer); - - for (i = 0; i < xlrec->ntuples; i++) - { - OffsetNumber offnum; - xl_multi_insert_tuple *xlhdr; - - /* - * If we're reinitializing the page, the tuples are stored in - * order from FirstOffsetNumber. Otherwise there's an array of - * offsets in the WAL record, and the tuples come after that. - */ - if (isinit) - offnum = FirstOffsetNumber + i; - else - offnum = xlrec->offsets[i]; - if (PageGetMaxOffsetNumber(page) + 1 < offnum) - elog(PANIC, "invalid max offset number"); - - xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); - tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple; - - newlen = xlhdr->datalen; - Assert(newlen <= MaxHeapTupleSize); - htup = &tbuf.hdr; - MemSet((char *) htup, 0, SizeofHeapTupleHeader); - /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ - memcpy((char *) htup + SizeofHeapTupleHeader, - (char *) tupdata, - newlen); - tupdata += newlen; - - newlen += SizeofHeapTupleHeader; - htup->t_infomask2 = xlhdr->t_infomask2; - htup->t_infomask = xlhdr->t_infomask; - htup->t_hoff = xlhdr->t_hoff; - HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); - ItemPointerSetBlockNumber(&htup->t_ctid, blkno); - ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); - - offnum = TDE_PageAddItem(rlocator, rlocator.spcOid, blkno, page, (Item) htup, newlen, offnum, true, true); - if (offnum == InvalidOffsetNumber) - elog(PANIC, "failed to add tuple"); - } - if (tupdata != endptr) - elog(PANIC, "total tuple length mismatch"); - - freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ - - PageSetLSN(page, lsn); - - if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) - PageClearAllVisible(page); - - /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ - if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) - PageSetAllVisible(page); - - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); - - /* - * If the page is running low on free space, update the FSM as well. - * Arbitrarily, our definition of "low" is less than 20%. We can't do much - * better than that without knowing the fill-factor for the table. - * - * XXX: Don't do this if the page was restored from full page image. 
We - * don't bother to update the FSM in that case, it doesn't need to be - * totally accurate anyway. - */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); -} - -/* - * Handles UPDATE and HOT_UPDATE - */ -static void -tdeheap_xlog_update(XLogReaderState *record, bool hot_update) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_update *xlrec = (xl_tdeheap_update *) XLogRecGetData(record); - RelFileLocator rlocator; - BlockNumber oldblk; - BlockNumber newblk; - ItemPointerData newtid; - Buffer obuffer, - nbuffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleData oldtup; - HeapTupleHeader htup; - uint16 prefixlen = 0, - suffixlen = 0; - char *newp; - union - { - HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; - } tbuf; - xl_tdeheap_header xlhdr; - uint32 newlen; - Size freespace = 0; - XLogRedoAction oldaction; - XLogRedoAction newaction; - - /* initialize to keep the compiler quiet */ - oldtup.t_data = NULL; - oldtup.t_len = 0; - - XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk); - if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL)) - { - /* HOT updates are never done across pages */ - Assert(!hot_update); - } - else - oldblk = newblk; - - ItemPointerSet(&newtid, newblk, xlrec->new_offnum); - - /* - * The visibility map may need to be fixed even if the heap page is - * already up-to-date. - */ - if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) - { - Relation reln = CreateFakeRelcacheEntry(rlocator); - Buffer vmbuffer = InvalidBuffer; - - tdeheap_visibilitymap_pin(reln, oldblk, &vmbuffer); - tdeheap_visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS); - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - - /* - * In normal operation, it is important to lock the two pages in - * page-number order, to avoid possible deadlocks against other update - * operations going the other way. However, during WAL replay there can - * be no other update happening, so we don't need to worry about that. But - * we *do* need to worry that we don't expose an inconsistent state to Hot - * Standby queries --- so the original page can't be unlocked before we've - * added the new tuple to the new page. - */ - - /* Deal with old tuple version */ - oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 
0 : 1, - &obuffer); - if (oldaction == BLK_NEEDS_REDO) - { - page = BufferGetPage(obuffer); - offnum = xlrec->old_offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(PANIC, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - oldtup.t_data = htup; - oldtup.t_len = ItemIdGetLength(lp); - - htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; - if (hot_update) - HeapTupleHeaderSetHotUpdated(htup); - else - HeapTupleHeaderClearHotUpdated(htup); - fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, - &htup->t_infomask2); - HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); - /* Set forward chain link in t_ctid */ - htup->t_ctid = newtid; - - /* Mark the page as a candidate for pruning */ - PageSetPrunable(page, XLogRecGetXid(record)); - - if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) - PageClearAllVisible(page); - - PageSetLSN(page, lsn); - MarkBufferDirty(obuffer); - } - - /* - * Read the page the new tuple goes into, if different from old. - */ - if (oldblk == newblk) - { - nbuffer = obuffer; - newaction = oldaction; - } - else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) - { - nbuffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(nbuffer); - PageInit(page, BufferGetPageSize(nbuffer), 0); - newaction = BLK_NEEDS_REDO; - } - else - newaction = XLogReadBufferForRedo(record, 0, &nbuffer); - - /* - * The visibility map may need to be fixed even if the heap page is - * already up-to-date. - */ - if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) - { - Relation reln = CreateFakeRelcacheEntry(rlocator); - Buffer vmbuffer = InvalidBuffer; - - tdeheap_visibilitymap_pin(reln, newblk, &vmbuffer); - tdeheap_visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS); - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - - /* Deal with new tuple */ - if (newaction == BLK_NEEDS_REDO) - { - char *recdata; - char *recdata_end; - Size datalen; - Size tuplen; - - recdata = XLogRecGetBlockData(record, 0, &datalen); - recdata_end = recdata + datalen; - - page = BufferGetPage(nbuffer); - - offnum = xlrec->new_offnum; - if (PageGetMaxOffsetNumber(page) + 1 < offnum) - elog(PANIC, "invalid max offset number"); - - if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) - { - Assert(newblk == oldblk); - memcpy(&prefixlen, recdata, sizeof(uint16)); - recdata += sizeof(uint16); - } - if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) - { - Assert(newblk == oldblk); - memcpy(&suffixlen, recdata, sizeof(uint16)); - recdata += sizeof(uint16); - } - - memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader); - recdata += SizeOfHeapHeader; - - tuplen = recdata_end - recdata; - Assert(tuplen <= MaxHeapTupleSize); - - htup = &tbuf.hdr; - MemSet((char *) htup, 0, SizeofHeapTupleHeader); - - /* - * Reconstruct the new tuple using the prefix and/or suffix from the - * old tuple, and the data stored in the WAL record. 
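
As a worked restatement of the length bookkeeping in the reconstruction code that follows (same variable names as the code; prefixlen and suffixlen are zero when the corresponding XLH_UPDATE_*_FROM_OLD flags are absent):

	newlen = SizeofHeapTupleHeader
	       + (xlhdr.t_hoff - SizeofHeapTupleHeader)             /* null bitmap [+ padding], from the WAL record */
	       + prefixlen                                          /* bytes copied from the old tuple version */
	       + (tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader))  /* new column data, from the WAL record */
	       + suffixlen                                          /* bytes copied from the old tuple version */
	       = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen
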
- */ - newp = (char *) htup + SizeofHeapTupleHeader; - if (prefixlen > 0) - { - int len; - - /* copy bitmap [+ padding] [+ oid] from WAL record */ - len = xlhdr.t_hoff - SizeofHeapTupleHeader; - memcpy(newp, recdata, len); - recdata += len; - newp += len; - - /* copy prefix from old tuple */ - memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen); - newp += prefixlen; - - /* copy new tuple data from WAL record */ - len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader); - memcpy(newp, recdata, len); - recdata += len; - newp += len; - } - else - { - /* - * copy bitmap [+ padding] [+ oid] + data from record, all in one - * go - */ - memcpy(newp, recdata, tuplen); - recdata += tuplen; - newp += tuplen; - } - Assert(recdata == recdata_end); - - /* copy suffix from old tuple */ - if (suffixlen > 0) - memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); - - newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen; - htup->t_infomask2 = xlhdr.t_infomask2; - htup->t_infomask = xlhdr.t_infomask; - htup->t_hoff = xlhdr.t_hoff; - - HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); - HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); - /* Make sure there is no forward chain link in t_ctid */ - htup->t_ctid = newtid; - - offnum = TDE_PageAddItem(rlocator, rlocator.spcOid, newblk, page, (Item) htup, newlen, offnum, true, true); - if (offnum == InvalidOffsetNumber) - elog(PANIC, "failed to add tuple"); - - if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) - PageClearAllVisible(page); - - freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ - - PageSetLSN(page, lsn); - MarkBufferDirty(nbuffer); - } - - if (BufferIsValid(nbuffer) && nbuffer != obuffer) - UnlockReleaseBuffer(nbuffer); - if (BufferIsValid(obuffer)) - UnlockReleaseBuffer(obuffer); - - /* - * If the new page is running low on free space, update the FSM as well. - * Arbitrarily, our definition of "low" is less than 20%. We can't do much - * better than that without knowing the fill-factor for the table. - * - * However, don't update the FSM on HOT updates, because after crash - * recovery, either the old or the new tuple will certainly be dead and - * prunable. After pruning, the page will have roughly as much free space - * as it did before the update, assuming the new tuple is about the same - * size as the old one. - * - * XXX: Don't do this if the page was restored from full page image. We - * don't bother to update the FSM in that case, it doesn't need to be - * totally accurate anyway. 
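
As a concrete data point for the "less than 20%" heuristic applied just below (assuming the default 8 kB block size):

	BLCKSZ / 5 = 8192 / 5 = 1638 bytes

so the FSM is refreshed only when fewer than roughly 1.6 kB remain on the page, and only when the block actually needed redo rather than being restored from a full-page image.
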
- */ - if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); -} - -static void -tdeheap_xlog_confirm(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_confirm *xlrec = (xl_tdeheap_confirm *) XLogRecGetData(record); - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleHeader htup; - - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) - { - page = BufferGetPage(buffer); - - offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(PANIC, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - /* - * Confirm tuple as actually inserted - */ - ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum); - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); -} - -static void -tdeheap_xlog_lock(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_lock *xlrec = (xl_tdeheap_lock *) XLogRecGetData(record); - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleHeader htup; - - /* - * The visibility map may need to be fixed even if the heap page is - * already up-to-date. - */ - if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) - { - RelFileLocator rlocator; - Buffer vmbuffer = InvalidBuffer; - BlockNumber block; - Relation reln; - - XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); - reln = CreateFakeRelcacheEntry(rlocator); - - tdeheap_visibilitymap_pin(reln, block, &vmbuffer); - tdeheap_visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); - - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) - { - page = (Page) BufferGetPage(buffer); - - offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(PANIC, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; - fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, - &htup->t_infomask2); - - /* - * Clear relevant update flags, but only if the modified infomask says - * there's no update. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask)) - { - HeapTupleHeaderClearHotUpdated(htup); - /* Make sure there is no forward chain link in t_ctid */ - ItemPointerSet(&htup->t_ctid, - BufferGetBlockNumber(buffer), - offnum); - } - HeapTupleHeaderSetXmax(htup, xlrec->xmax); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); -} - -static void -tdeheap_xlog_lock_updated(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_lock_updated *xlrec; - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleHeader htup; - - xlrec = (xl_tdeheap_lock_updated *) XLogRecGetData(record); - - /* - * The visibility map may need to be fixed even if the heap page is - * already up-to-date. 
- */ - if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) - { - RelFileLocator rlocator; - Buffer vmbuffer = InvalidBuffer; - BlockNumber block; - Relation reln; - - XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); - reln = CreateFakeRelcacheEntry(rlocator); - - tdeheap_visibilitymap_pin(reln, block, &vmbuffer); - tdeheap_visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); - - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) - { - page = BufferGetPage(buffer); - - offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(PANIC, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; - fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, - &htup->t_infomask2); - HeapTupleHeaderSetXmax(htup, xlrec->xmax); - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); -} - -static void -tdeheap_xlog_inplace(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_tdeheap_inplace *xlrec = (xl_tdeheap_inplace *) XLogRecGetData(record); - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleHeader htup; - uint32 oldlen; - Size newlen; - - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) - { - char *newtup = XLogRecGetBlockData(record, 0, &newlen); - - page = BufferGetPage(buffer); - - offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(PANIC, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - oldlen = ItemIdGetLength(lp) - htup->t_hoff; - if (oldlen != newlen) - elog(PANIC, "wrong tuple length"); - - memcpy((char *) htup + htup->t_hoff, newtup, newlen); - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); -} - -void -tdeheap_redo(XLogReaderState *record) -{ - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - - /* - * These operations don't overwrite MVCC data so no conflict processing is - * required. The ones in heap2 rmgr do. - */ - - switch (info & XLOG_HEAP_OPMASK) - { - case XLOG_HEAP_INSERT: - tdeheap_xlog_insert(record); - break; - case XLOG_HEAP_DELETE: - tdeheap_xlog_delete(record); - break; - case XLOG_HEAP_UPDATE: - tdeheap_xlog_update(record, false); - break; - case XLOG_HEAP_TRUNCATE: - - /* - * TRUNCATE is a no-op because the actions are already logged as - * SMGR WAL records. TRUNCATE WAL record only exists for logical - * decoding. 
- */ - break; - case XLOG_HEAP_HOT_UPDATE: - tdeheap_xlog_update(record, true); - break; - case XLOG_HEAP_CONFIRM: - tdeheap_xlog_confirm(record); - break; - case XLOG_HEAP_LOCK: - tdeheap_xlog_lock(record); - break; - case XLOG_HEAP_INPLACE: - tdeheap_xlog_inplace(record); - break; - default: - elog(PANIC, "pg_tde_redo: unknown op code %u", info); - } -} - -void -heapam2_redo(XLogReaderState *record) -{ - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - - switch (info & XLOG_HEAP_OPMASK) - { - case XLOG_HEAP2_PRUNE: - tdeheap_xlog_prune(record); - break; - case XLOG_HEAP2_VACUUM: - tdeheap_xlog_vacuum(record); - break; - case XLOG_HEAP2_FREEZE_PAGE: - tdeheap_xlog_freeze_page(record); - break; - case XLOG_HEAP2_VISIBLE: - tdeheap_xlog_visible(record); - break; - case XLOG_HEAP2_MULTI_INSERT: - tdeheap_xlog_multi_insert(record); - break; - case XLOG_HEAP2_LOCK_UPDATED: - tdeheap_xlog_lock_updated(record); - break; - case XLOG_HEAP2_NEW_CID: - - /* - * Nothing to do on a real replay, only used during logical - * decoding. - */ - break; - case XLOG_HEAP2_REWRITE: - tdeheap_xlog_logical_rewrite(record); - break; - default: - elog(PANIC, "heap2_redo: unknown op code %u", info); - } -} - -/* - * Mask a heap page before performing consistency checks on it. - */ -void -tdeheap_mask(char *pagedata, BlockNumber blkno) -{ - Page page = (Page) pagedata; - OffsetNumber off; - - mask_page_lsn_and_checksum(page); - - mask_page_hint_bits(page); - mask_unused_space(page); - - for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) - { - ItemId iid = PageGetItemId(page, off); - char *page_item; - - page_item = (char *) (page + ItemIdGetOffset(iid)); - - if (ItemIdIsNormal(iid)) - { - HeapTupleHeader page_htup = (HeapTupleHeader) page_item; - - /* - * If xmin of a tuple is not yet frozen, we should ignore - * differences in hint bits, since they can be set without - * emitting WAL. - */ - if (!HeapTupleHeaderXminFrozen(page_htup)) - page_htup->t_infomask &= ~HEAP_XACT_MASK; - else - { - /* Still we need to mask xmax hint bits. */ - page_htup->t_infomask &= ~HEAP_XMAX_INVALID; - page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; - } - - /* - * During replay, we set Command Id to FirstCommandId. Hence, mask - * it. See tdeheap_xlog_insert() for details. - */ - page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; - - /* - * For a speculative tuple, tdeheap_insert() does not set ctid in the - * caller-passed heap tuple itself, leaving the ctid field to - * contain a speculative token value - a per-backend monotonically - * increasing identifier. Besides, it does not WAL-log ctid under - * any circumstances. - * - * During redo, tdeheap_xlog_insert() sets t_ctid to current block - * number and self offset number. It doesn't care about any - * speculative insertions on the primary. Hence, we set t_ctid to - * current block number and self offset number to ignore any - * inconsistency. - */ - if (HeapTupleHeaderIsSpeculative(page_htup)) - ItemPointerSet(&page_htup->t_ctid, blkno, off); - - /* - * NB: Not ignoring ctid changes due to the tuple having moved - * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's - * important information that needs to be in-sync between primary - * and standby, and thus is WAL logged. - */ - } - - /* - * Ignore any padding bytes after the tuple, when the length of the - * item is not MAXALIGNed. 
- */ - if (ItemIdHasStorage(iid)) - { - int len = ItemIdGetLength(iid); - int padlen = MAXALIGN(len) - len; - - if (padlen > 0) - memset(page_item + len, MASK_MARKER, padlen); - } - } -} - -/* - * HeapCheckForSerializableConflictOut - * We are reading a tuple. If it's not visible, there may be a - * rw-conflict out with the inserter. Otherwise, if it is visible to us - * but has been deleted, there may be a rw-conflict out with the deleter. - * - * We will determine the top level xid of the writing transaction with which - * we may be in conflict, and ask CheckForSerializableConflictOut() to check - * for overlap with our own transaction. - * - * This function should be called just about anywhere in heapam.c where a - * tuple has been read. The caller must hold at least a shared lock on the - * buffer, because this function might set hint bits on the tuple. There is - * currently no known reason to call this function from an index AM. - */ -void -HeapCheckForSerializableConflictOut(bool visible, Relation relation, - HeapTuple tuple, Buffer buffer, - Snapshot snapshot) -{ - TransactionId xid; - HTSV_Result htsvResult; - - if (!CheckForSerializableConflictOutNeeded(relation, snapshot)) - return; - - /* - * Check to see whether the tuple has been written to by a concurrent - * transaction, either to create it not visible to us, or to delete it - * while it is visible to us. The "visible" bool indicates whether the - * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else - * is going on with it. - * - * In the event of a concurrently inserted tuple that also happens to have - * been concurrently updated (by a separate transaction), the xmin of the - * tuple will be used -- not the updater's xid. - */ - htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer); - switch (htsvResult) - { - case HEAPTUPLE_LIVE: - if (visible) - return; - xid = HeapTupleHeaderGetXmin(tuple->t_data); - break; - case HEAPTUPLE_RECENTLY_DEAD: - case HEAPTUPLE_DELETE_IN_PROGRESS: - if (visible) - xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); - else - xid = HeapTupleHeaderGetXmin(tuple->t_data); - - if (TransactionIdPrecedes(xid, TransactionXmin)) - { - /* This is like the HEAPTUPLE_DEAD case */ - Assert(!visible); - return; - } - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - xid = HeapTupleHeaderGetXmin(tuple->t_data); - break; - case HEAPTUPLE_DEAD: - Assert(!visible); - return; - default: - - /* - * The only way to get to this default clause is if a new value is - * added to the enum type without adding it to this switch - * statement. That's a bug, so elog. - */ - elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult); - - /* - * In spite of having all enum values covered and calling elog on - * this default, some compilers think this is a code path which - * allows xid to be used below without initialization. Silence - * that warning. - */ - xid = InvalidTransactionId; - } - - Assert(TransactionIdIsValid(xid)); - Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); - - /* - * Find top level xid. Bail out if xid is too early to be a conflict, or - * if it's our own xid. 
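
tdeheap_redo(), heapam2_redo() and tdeheap_mask() above follow the shape of resource-manager callbacks. This hunk does not show how pg_tde wires them into WAL replay; purely for orientation, an extension could register such callbacks through the custom resource manager API available since PostgreSQL 15, along these lines (the rmgr name, the use of RM_EXPERIMENTAL_ID and the assumption that the callbacks are declared in pg_tdeam_xlog.h are illustrative):

#include "postgres.h"
#include "access/rmgr.h"			/* RmgrId, RM_EXPERIMENTAL_ID */
#include "access/xlog_internal.h"	/* RmgrData, RegisterCustomRmgr */
#include "access/pg_tdeam_xlog.h"	/* tdeheap_redo, tdeheap_mask (assumed) */
#include "fmgr.h"

PG_MODULE_MAGIC;

/* Illustrative wiring only; heap2-style records would need their own entry. */
static const RmgrData pg_tde_heap_rmgr_sketch = {
	.rm_name = "pg_tde_heap_sketch",
	.rm_redo = tdeheap_redo,
	.rm_mask = tdeheap_mask,
};

void
_PG_init(void)
{
	RegisterCustomRmgr(RM_EXPERIMENTAL_ID, &pg_tde_heap_rmgr_sketch);
}
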
- */ - if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) - return; - xid = SubTransGetTopmostTransaction(xid); - if (TransactionIdPrecedes(xid, TransactionXmin)) - return; - - CheckForSerializableConflictOut(relation, xid, snapshot); -} diff --git a/src/access/pg_tdeam_handler.c b/src/access/pg_tdeam_handler.c deleted file mode 100644 index f237444b..00000000 --- a/src/access/pg_tdeam_handler.c +++ /dev/null @@ -1,2663 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_tdeam_handler.c - * heap table access method code - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/access/heap/pg_tdeam_handler.c - * - * - * NOTES - * This files wires up the lower level heapam.c et al routines with the - * tableam abstraction. - * - *------------------------------------------------------------------------- - */ - -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include "access/pg_tde_slot.h" - -#include "access/pg_tdeam.h" -#include "access/pg_tdetoast.h" -#include "access/pg_tde_rewrite.h" -#include "access/pg_tde_tdemap.h" - -#include "encryption/enc_tde.h" - -#include "access/genam.h" -#include "access/multixact.h" -#include "access/syncscan.h" -#include "access/tableam.h" -#include "access/tsmapi.h" -#include "access/xact.h" -#include "catalog/catalog.h" -#include "catalog/index.h" -#include "catalog/storage.h" -#include "catalog/storage_xlog.h" -#include "commands/progress.h" -#include "executor/executor.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "storage/bufmgr.h" -#include "storage/bufpage.h" -#include "storage/lmgr.h" -#include "storage/predicate.h" -#include "storage/procarray.h" -#include "storage/smgr.h" -#include "utils/builtins.h" -#include "utils/rel.h" - -PG_FUNCTION_INFO_V1(pg_tdeam_basic_handler); -#ifdef PERCONA_FORK -PG_FUNCTION_INFO_V1(pg_tdeam_handler); -#endif - - -static void reform_and_rewrite_tuple(HeapTuple tuple, - Relation OldHeap, Relation NewHeap, - Datum *values, bool *isnull, RewriteState rwstate); - -static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, - HeapTuple tuple, - OffsetNumber tupoffset); - -static BlockNumber pg_tdeam_scan_get_blocks_done(HeapScanDesc hscan); - -static const TableAmRoutine pg_tdeam_methods; - - -/* ------------------------------------------------------------------------ - * Slot related callbacks for heap AM - * ------------------------------------------------------------------------ - */ - -static const TupleTableSlotOps * -pg_tdeam_slot_callbacks(Relation relation) -{ - return &TTSOpsTDEBufferHeapTuple; -} - - -/* ------------------------------------------------------------------------ - * Index Scan Callbacks for heap AM - * ------------------------------------------------------------------------ - */ - -static IndexFetchTableData * -pg_tdeam_index_fetch_begin(Relation rel) -{ - IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); - - hscan->xs_base.rel = rel; - hscan->xs_cbuf = InvalidBuffer; - - return &hscan->xs_base; -} - -static void -pg_tdeam_index_fetch_reset(IndexFetchTableData *scan) -{ - IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; - - if (BufferIsValid(hscan->xs_cbuf)) - { - ReleaseBuffer(hscan->xs_cbuf); - hscan->xs_cbuf = InvalidBuffer; - } -} - -static void -pg_tdeam_index_fetch_end(IndexFetchTableData *scan) -{ - IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; - 
- pg_tdeam_index_fetch_reset(scan); - - pfree(hscan); -} - -static bool -pg_tdeam_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, - Snapshot snapshot, - TupleTableSlot *slot, - bool *call_again, bool *all_dead) -{ - IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; - BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; - bool got_tdeheap_tuple; - - Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); - - /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ - if (!*call_again) - { - /* Switch to correct buffer if we don't have it already */ - Buffer prev_buf = hscan->xs_cbuf; - - hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, - hscan->xs_base.rel, - ItemPointerGetBlockNumber(tid)); - - /* - * Prune page, but only if we weren't already on this page - */ - if (prev_buf != hscan->xs_cbuf) - tdeheap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); - } - - /* Obtain share-lock on the buffer so we can examine visibility */ - LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE); - got_tdeheap_tuple = tdeheap_hot_search_buffer(tid, - hscan->xs_base.rel, - hscan->xs_cbuf, - snapshot, - &bslot->base.tupdata, - all_dead, - !*call_again); - bslot->base.tupdata.t_self = *tid; - LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK); - - if (got_tdeheap_tuple) - { - /* - * Only in a non-MVCC snapshot can more than one member of the HOT - * chain be visible. - */ - *call_again = !IsMVCCSnapshot(snapshot); - - slot->tts_tableOid = RelationGetRelid(scan->rel); - PGTdeExecStoreBufferHeapTuple(scan->rel, &bslot->base.tupdata, slot, hscan->xs_cbuf); - } - else - { - /* We've reached the end of the HOT chain. */ - *call_again = false; - } - - return got_tdeheap_tuple; -} - - -/* ------------------------------------------------------------------------ - * Callbacks for non-modifying operations on individual tuples for heap AM - * ------------------------------------------------------------------------ - */ - -static bool -pg_tdeam_fetch_row_version(Relation relation, - ItemPointer tid, - Snapshot snapshot, - TupleTableSlot *slot) -{ - BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; - Buffer buffer; - - Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); - - bslot->base.tupdata.t_self = *tid; - if (tdeheap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false)) - { - /* store in slot, transferring existing pin */ - PGTdeExecStorePinnedBufferHeapTuple(relation, &bslot->base.tupdata, slot, buffer); - slot->tts_tableOid = RelationGetRelid(relation); - - return true; - } - - return false; -} - -static bool -pg_tdeam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - - return ItemPointerIsValid(tid) && - ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks; -} - -static bool -pg_tdeam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, - Snapshot snapshot) -{ - BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; - bool res; - - Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); - Assert(BufferIsValid(bslot->buffer)); - - /* - * We need buffer pin and lock to call HeapTupleSatisfiesVisibility. - * Caller should be holding pin, but not lock. - */ - LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE); - res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot, - bslot->buffer); - LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK); - - return res; -} - - -/* ---------------------------------------------------------------------------- - * Functions for manipulations of physical tuples for heap AM. 
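
The slot and index-fetch callbacks above, together with the tuple callbacks that follow, are exposed to the executor through a TableAmRoutine returned by the handler function. An abbreviated sketch of that wiring (field names as in PostgreSQL 16's tableam.h; the real pg_tdeam_methods table populates many more members, and which of the two declared handlers returns it is not shown in this hunk):

#include "postgres.h"
#include "access/tableam.h"
#include "fmgr.h"

static const TableAmRoutine pg_tdeam_methods_sketch = {
	.type = T_TableAmRoutine,

	.slot_callbacks = pg_tdeam_slot_callbacks,

	.index_fetch_begin = pg_tdeam_index_fetch_begin,
	.index_fetch_reset = pg_tdeam_index_fetch_reset,
	.index_fetch_end = pg_tdeam_index_fetch_end,
	.index_fetch_tuple = pg_tdeam_index_fetch_tuple,

	.tuple_fetch_row_version = pg_tdeam_fetch_row_version,
	.tuple_tid_valid = pg_tdeam_tuple_tid_valid,
	.tuple_satisfies_snapshot = pg_tdeam_tuple_satisfies_snapshot,

	.tuple_insert = pg_tdeam_tuple_insert,
	.tuple_delete = pg_tdeam_tuple_delete,
	.tuple_update = pg_tdeam_tuple_update,
	.tuple_lock = pg_tdeam_tuple_lock,
};

PG_FUNCTION_INFO_V1(pg_tdeam_handler_sketch);

Datum
pg_tdeam_handler_sketch(PG_FUNCTION_ARGS)
{
	PG_RETURN_POINTER(&pg_tdeam_methods_sketch);
}
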
- * ---------------------------------------------------------------------------- - */ - -static void -pg_tdeam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, - int options, BulkInsertState bistate) -{ - bool shouldFree = true; - HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); - - /* Update the tuple with table oid */ - slot->tts_tableOid = RelationGetRelid(relation); - tuple->t_tableOid = slot->tts_tableOid; - - /* Perform the insertion, and copy the resulting ItemPointer */ - tdeheap_insert(relation, tuple, cid, options, bistate); - ItemPointerCopy(&tuple->t_self, &slot->tts_tid); - - if (shouldFree) - pfree(tuple); -} - -static void -pg_tdeam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot, - CommandId cid, int options, - BulkInsertState bistate, uint32 specToken) -{ - bool shouldFree = true; - HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); - - /* Update the tuple with table oid */ - slot->tts_tableOid = RelationGetRelid(relation); - tuple->t_tableOid = slot->tts_tableOid; - - HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken); - options |= HEAP_INSERT_SPECULATIVE; - - /* Perform the insertion, and copy the resulting ItemPointer */ - tdeheap_insert(relation, tuple, cid, options, bistate); - ItemPointerCopy(&tuple->t_self, &slot->tts_tid); - - if (shouldFree) - pfree(tuple); -} - -static void -pg_tdeam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, - uint32 specToken, bool succeeded) -{ - bool shouldFree = true; - HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); - - /* adjust the tuple's state accordingly */ - if (succeeded) - tdeheap_finish_speculative(relation, &slot->tts_tid); - else - tdeheap_abort_speculative(relation, &slot->tts_tid); - - if (shouldFree) - pfree(tuple); -} - -static TM_Result -pg_tdeam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) -{ - /* - * Currently Deleting of index tuples are handled at vacuum, in case if - * the storage itself is cleaning the dead tuples by itself, it is the - * time to call the index tuple deletion also. - */ - return tdeheap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); -} - - -static TM_Result -pg_tdeam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, - CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, - LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes) -{ - bool shouldFree = true; - HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); - TM_Result result; - - /* Update the tuple with table oid */ - slot->tts_tableOid = RelationGetRelid(relation); - tuple->t_tableOid = slot->tts_tableOid; - - result = tdeheap_update(relation, otid, tuple, cid, crosscheck, wait, - tmfd, lockmode, update_indexes); - ItemPointerCopy(&tuple->t_self, &slot->tts_tid); - - /* - * Decide whether new index entries are needed for the tuple - * - * Note: tdeheap_update returns the tid (location) of the new tuple in the - * t_self field. - * - * If the update is not HOT, we must update all indexes. If the update is - * HOT, it could be that we updated summarized columns, so we either - * update only summarized indexes, or none at all. 
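
For reference, the update_indexes values that the comment above and the assertions just below rely on come from the TU_UpdateIndexes enum, roughly as defined in PostgreSQL 16's tableam.h:

typedef enum TU_UpdateIndexes
{
	TU_None,			/* no index entries needed, e.g. a plain HOT update */
	TU_All,				/* non-HOT update: insert new entries into all indexes */
	TU_Summarizing		/* HOT update that modified summarized (e.g. BRIN) columns */
} TU_UpdateIndexes;

tdeheap_update() reports one of these through its update_indexes out parameter; the assertions that follow only cross-check it against whether the new tuple ended up heap-only.
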
- */ - if (result != TM_Ok) - { - Assert(*update_indexes == TU_None); - *update_indexes = TU_None; - } - else if (!HeapTupleIsHeapOnly(tuple)) - Assert(*update_indexes == TU_All); - else - Assert((*update_indexes == TU_Summarizing) || - (*update_indexes == TU_None)); - - if (shouldFree) - pfree(tuple); - - return result; -} - -static TM_Result -pg_tdeam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, - TupleTableSlot *slot, CommandId cid, LockTupleMode mode, - LockWaitPolicy wait_policy, uint8 flags, - TM_FailureData *tmfd) -{ - BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; - TM_Result result; - Buffer buffer; - HeapTuple tuple = &bslot->base.tupdata; - bool follow_updates; - - follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; - tmfd->traversed = false; - - Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); - -tuple_lock_retry: - tuple->t_self = *tid; - result = tdeheap_lock_tuple(relation, tuple, cid, mode, wait_policy, - follow_updates, &buffer, tmfd); - - if (result == TM_Updated && - (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) - { - /* Should not encounter speculative tuple on recheck */ - Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); - - ReleaseBuffer(buffer); - - if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) - { - SnapshotData SnapshotDirty; - TransactionId priorXmax; - - /* it was updated, so look at the updated version */ - *tid = tmfd->ctid; - /* updated row should have xmin matching this xmax */ - priorXmax = tmfd->xmax; - - /* signal that a tuple later in the chain is getting locked */ - tmfd->traversed = true; - - /* - * fetch target tuple - * - * Loop here to deal with updated or busy tuples - */ - InitDirtySnapshot(SnapshotDirty); - for (;;) - { - if (ItemPointerIndicatesMovedPartitions(tid)) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); - - tuple->t_self = *tid; - if (tdeheap_fetch(relation, &SnapshotDirty, tuple, &buffer, true)) - { - /* - * If xmin isn't what we're expecting, the slot must have - * been recycled and reused for an unrelated tuple. This - * implies that the latest version of the row was deleted, - * so we need do nothing. (Should be safe to examine xmin - * without getting buffer's content lock. We assume - * reading a TransactionId to be atomic, and Xmin never - * changes in an existing tuple, except to invalid or - * frozen, and neither of those can match priorXmax.) - */ - if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), - priorXmax)) - { - ReleaseBuffer(buffer); - return TM_Deleted; - } - - /* otherwise xmin should not be dirty... */ - if (TransactionIdIsValid(SnapshotDirty.xmin)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"", - SnapshotDirty.xmin, - ItemPointerGetBlockNumber(&tuple->t_self), - ItemPointerGetOffsetNumber(&tuple->t_self), - RelationGetRelationName(relation)))); - - /* - * If tuple is being updated by other transaction then we - * have to wait for its commit/abort, or die trying. 
- */ - if (TransactionIdIsValid(SnapshotDirty.xmax)) - { - ReleaseBuffer(buffer); - switch (wait_policy) - { - case LockWaitBlock: - XactLockTableWait(SnapshotDirty.xmax, - relation, &tuple->t_self, - XLTW_FetchUpdated); - break; - case LockWaitSkip: - if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) - /* skip instead of waiting */ - return TM_WouldBlock; - break; - case LockWaitError: - if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - break; - } - continue; /* loop back to repeat tdeheap_fetch */ - } - - /* - * If tuple was inserted by our own transaction, we have - * to check cmin against cid: cmin >= current CID means - * our command cannot see the tuple, so we should ignore - * it. Otherwise tdeheap_lock_tuple() will throw an error, - * and so would any later attempt to update or delete the - * tuple. (We need not check cmax because - * HeapTupleSatisfiesDirty will consider a tuple deleted - * by our transaction dead, regardless of cmax.) We just - * checked that priorXmax == xmin, so we can test that - * variable instead of doing HeapTupleHeaderGetXmin again. - */ - if (TransactionIdIsCurrentTransactionId(priorXmax) && - HeapTupleHeaderGetCmin(tuple->t_data) >= cid) - { - tmfd->xmax = priorXmax; - - /* - * Cmin is the problematic value, so store that. See - * above. - */ - tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data); - ReleaseBuffer(buffer); - return TM_SelfModified; - } - - /* - * This is a live tuple, so try to lock it again. - */ - ReleaseBuffer(buffer); - goto tuple_lock_retry; - } - - /* - * If the referenced slot was actually empty, the latest - * version of the row must have been deleted, so we need do - * nothing. - */ - if (tuple->t_data == NULL) - { - Assert(!BufferIsValid(buffer)); - return TM_Deleted; - } - - /* - * As above, if xmin isn't what we're expecting, do nothing. - */ - if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), - priorXmax)) - { - ReleaseBuffer(buffer); - return TM_Deleted; - } - - /* - * If we get here, the tuple was found but failed - * SnapshotDirty. Assuming the xmin is either a committed xact - * or our own xact (as it certainly should be if we're trying - * to modify the tuple), this must mean that the row was - * updated or deleted by either a committed xact or our own - * xact. If it was deleted, we can ignore it; if it was - * updated then chain up to the next version and repeat the - * whole process. - * - * As above, it should be safe to examine xmax and t_ctid - * without the buffer content lock, because they can't be - * changing. We'd better hold a buffer pin though. 
- */ - if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) - { - /* deleted, so forget about it */ - ReleaseBuffer(buffer); - return TM_Deleted; - } - - /* updated, so look at the updated row */ - *tid = tuple->t_data->t_ctid; - /* updated row should have xmin matching this xmax */ - priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); - ReleaseBuffer(buffer); - /* loop back to fetch next in chain */ - } - } - else - { - /* tuple was deleted, so give up */ - return TM_Deleted; - } - } - - slot->tts_tableOid = RelationGetRelid(relation); - tuple->t_tableOid = slot->tts_tableOid; - - /* store in slot, transferring existing pin */ - PGTdeExecStorePinnedBufferHeapTuple(relation, tuple, slot, buffer); - - return result; -} - - -/* ------------------------------------------------------------------------ - * DDL related callbacks for heap AM. - * ------------------------------------------------------------------------ - */ - -static void -pg_tdeam_relation_set_new_filelocator(Relation rel, - const RelFileLocator *newrlocator, - char persistence, - TransactionId *freezeXid, - MultiXactId *minmulti) -{ - SMgrRelation srel; - - /* - * Initialize to the minimum XID that could put tuples in the table. We - * know that no xacts older than RecentXmin are still running, so that - * will do. - */ - *freezeXid = RecentXmin; - - /* - * Similarly, initialize the minimum Multixact to the first value that - * could possibly be stored in tuples in the table. Running transactions - * could reuse values from their local cache, so we are careful to - * consider all currently running multis. - * - * XXX this could be refined further, but is it worth the hassle? - */ - *minmulti = GetOldestMultiXactId(); - - srel = RelationCreateStorage(*newrlocator, persistence, true); - - /* - * If required, set up an init fork for an unlogged table so that it can - * be correctly reinitialized on restart. An immediate sync is required - * even if the page has been logged, because the write did not go through - * shared_buffers and therefore a concurrent checkpoint may have moved the - * redo pointer past our xlog record. Recovery may as well remove it - * while replaying, for example, XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE - * record. Therefore, logging is necessary even if wal_level=minimal. - */ - if (persistence == RELPERSISTENCE_UNLOGGED) - { - Assert(rel->rd_rel->relkind == RELKIND_RELATION || - rel->rd_rel->relkind == RELKIND_MATVIEW || - rel->rd_rel->relkind == RELKIND_TOASTVALUE); - smgrcreate(srel, INIT_FORKNUM, false); - log_smgrcreate(newrlocator, INIT_FORKNUM); - smgrimmedsync(srel, INIT_FORKNUM); - } - - smgrclose(srel); - - /* Update TDE filemap */ - if (rel->rd_rel->relkind == RELKIND_RELATION || - rel->rd_rel->relkind == RELKIND_MATVIEW || - rel->rd_rel->relkind == RELKIND_TOASTVALUE) - { - ereport(DEBUG1, - (errmsg("creating key file for relation %s", RelationGetRelationName(rel)))); - - pg_tde_create_key_map_entry(newrlocator); - } -} - -static void -pg_tdeam_relation_nontransactional_truncate(Relation rel) -{ - RelationTruncate(rel, 0); -} - -static void -pg_tdeam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) -{ - SMgrRelation dstrel; - - dstrel = smgropen(*newrlocator, rel->rd_backend); - - /* - * Since we copy the file directly without looking at the shared buffers, - * we'd better first flush out any pages of the source relation that are - * in shared buffers. We assume no new changes will be made while we are - * holding exclusive lock on the rel. 
- */ - FlushRelationBuffers(rel); - - /* - * Create and copy all forks of the relation, and schedule unlinking of - * old physical files. - * - * NOTE: any conflict in relfilenumber value will be caught in - * RelationCreateStorage(). - */ - RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true); - - /* copy main fork */ - RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM, - rel->rd_rel->relpersistence); - - /* copy those extra forks that exist */ - for (ForkNumber forkNum = MAIN_FORKNUM + 1; - forkNum <= MAX_FORKNUM; forkNum++) - { - if (smgrexists(RelationGetSmgr(rel), forkNum)) - { - smgrcreate(dstrel, forkNum, false); - - /* - * WAL log creation if the relation is persistent, or this is the - * init fork of an unlogged relation. - */ - if (RelationIsPermanent(rel) || - (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && - forkNum == INIT_FORKNUM)) - log_smgrcreate(newrlocator, forkNum); - RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum, - rel->rd_rel->relpersistence); - } - } - - - /* drop old relation, and close new one */ - RelationDropStorage(rel); - smgrclose(dstrel); -} - -static void -pg_tdeam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, - Relation OldIndex, bool use_sort, - TransactionId OldestXmin, - TransactionId *xid_cutoff, - MultiXactId *multi_cutoff, - double *num_tuples, - double *tups_vacuumed, - double *tups_recently_dead) -{ - RewriteState rwstate; - IndexScanDesc indexScan; - TableScanDesc tableScan; - HeapScanDesc heapScan; - bool is_system_catalog; - Tuplesortstate *tuplesort; - TupleDesc oldTupDesc = RelationGetDescr(OldHeap); - TupleDesc newTupDesc = RelationGetDescr(NewHeap); - TupleTableSlot *slot; - int natts; - Datum *values; - bool *isnull; - BufferHeapTupleTableSlot *hslot; - BlockNumber prev_cblock = InvalidBlockNumber; - - /* Remember if it's a system catalog */ - is_system_catalog = IsSystemRelation(OldHeap); - - /* - * Valid smgr_targblock implies something already wrote to the relation. - * This may be harmless, but this function hasn't planned for it. - */ - Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); - - /* Preallocate values/isnull arrays */ - natts = newTupDesc->natts; - values = (Datum *) palloc(natts * sizeof(Datum)); - isnull = (bool *) palloc(natts * sizeof(bool)); - - /* Initialize the rewrite operation */ - rwstate = begin_tdeheap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff, - *multi_cutoff); - - - /* Set up sorting if wanted */ - if (use_sort) - tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, - maintenance_work_mem, - NULL, TUPLESORT_NONE); - else - tuplesort = NULL; - - /* - * Prepare to scan the OldHeap. To ensure we see recently-dead tuples - * that still need to be copied, we scan with SnapshotAny and use - * HeapTupleSatisfiesVacuum for the visibility test. 
- */ - if (OldIndex != NULL && !use_sort) - { - const int ci_index[] = { - PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_INDEX_RELID - }; - int64 ci_val[2]; - - /* Set phase and OIDOldIndex to columns */ - ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; - ci_val[1] = RelationGetRelid(OldIndex); - pgstat_progress_update_multi_param(2, ci_index, ci_val); - - tableScan = NULL; - heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); - index_rescan(indexScan, NULL, 0, NULL, 0); - } - else - { - /* In scan-and-sort mode and also VACUUM FULL, set phase */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); - - tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); - heapScan = (HeapScanDesc) tableScan; - indexScan = NULL; - - /* Set total heap blocks */ - pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, - heapScan->rs_nblocks); - } - - slot = table_slot_create(OldHeap, NULL); - hslot = (BufferHeapTupleTableSlot *) slot; - - /* - * Scan through the OldHeap, either in OldIndex order or sequentially; - * copy each tuple into the NewHeap, or transiently to the tuplesort - * module. Note that we don't bother sorting dead tuples (they won't get - * to the new table anyway). - */ - for (;;) - { - HeapTuple tuple; - Buffer buf; - bool isdead; - - CHECK_FOR_INTERRUPTS(); - - if (indexScan != NULL) - { - if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) - break; - - /* Since we used no scan keys, should never need to recheck */ - if (indexScan->xs_recheck) - elog(ERROR, "CLUSTER does not support lossy index conditions"); - } - else - { - if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot)) - { - /* - * If the last pages of the scan were empty, we would go to - * the next phase while tdeheap_blks_scanned != tdeheap_blks_total. - * Instead, to ensure that tdeheap_blks_scanned is equivalent to - * tdeheap_blks_total after the table scan phase, this parameter - * is manually updated to the correct value when the table - * scan finishes. - */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, - heapScan->rs_nblocks); - break; - } - - /* - * In scan-and-sort mode and also VACUUM FULL, set heap blocks - * scanned - * - * Note that heapScan may start at an offset and wrap around, i.e. - * rs_startblock may be >0, and rs_cblock may end with a number - * below rs_startblock. To prevent showing this wraparound to the - * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks). - */ - if (prev_cblock != heapScan->rs_cblock) - { - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, - (heapScan->rs_cblock + - heapScan->rs_nblocks - - heapScan->rs_startblock - ) % heapScan->rs_nblocks + 1); - prev_cblock = heapScan->rs_cblock; - } - } - - tuple = ExecFetchSlotHeapTuple(slot, false, NULL); - buf = hslot->buffer; - - LockBuffer(buf, BUFFER_LOCK_SHARE); - - switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) - { - case HEAPTUPLE_DEAD: - /* Definitely dead */ - isdead = true; - break; - case HEAPTUPLE_RECENTLY_DEAD: - *tups_recently_dead += 1; - /* fall through */ - case HEAPTUPLE_LIVE: - /* Live or recently dead, must copy it */ - isdead = false; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - - /* - * Since we hold exclusive lock on the relation, normally the - * only way to see this is if it was inserted earlier in our - * own transaction. However, it can happen in system - * catalogs, since we tend to release write lock before commit - * there. 
Give a warning if neither case applies; but in any - * case we had better copy it. - */ - if (!is_system_catalog && - !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) - elog(WARNING, "concurrent insert in progress within table \"%s\"", - RelationGetRelationName(OldHeap)); - /* treat as live */ - isdead = false; - break; - case HEAPTUPLE_DELETE_IN_PROGRESS: - - /* - * Similar situation to INSERT_IN_PROGRESS case. - */ - if (!is_system_catalog && - !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) - elog(WARNING, "concurrent delete in progress within table \"%s\"", - RelationGetRelationName(OldHeap)); - /* treat as recently dead */ - *tups_recently_dead += 1; - isdead = false; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - isdead = false; /* keep compiler quiet */ - break; - } - - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - - if (isdead) - { - *tups_vacuumed += 1; - /* heap rewrite module still needs to see it... */ - if (rewrite_tdeheap_dead_tuple(rwstate, tuple)) - { - /* A previous recently-dead tuple is now known dead */ - *tups_vacuumed += 1; - *tups_recently_dead -= 1; - } - continue; - } - - *num_tuples += 1; - if (tuplesort != NULL) - { - tuplesort_putheaptuple(tuplesort, tuple); - - /* - * In scan-and-sort mode, report increase in number of tuples - * scanned - */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, - *num_tuples); - } - else - { - const int ct_index[] = { - PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, - PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN - }; - int64 ct_val[2]; - - reform_and_rewrite_tuple(tuple, OldHeap, NewHeap, - values, isnull, rwstate); - - /* - * In indexscan mode and also VACUUM FULL, report increase in - * number of tuples scanned and written - */ - ct_val[0] = *num_tuples; - ct_val[1] = *num_tuples; - pgstat_progress_update_multi_param(2, ct_index, ct_val); - } - } - - if (indexScan != NULL) - index_endscan(indexScan); - if (tableScan != NULL) - table_endscan(tableScan); - if (slot) - ExecDropSingleTupleTableSlot(slot); - - /* - * In scan-and-sort mode, complete the sort, then read out all live tuples - * from the tuplestore and write them to the new relation. - */ - if (tuplesort != NULL) - { - double n_tuples = 0; - - /* Report that we are now sorting tuples */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_SORT_TUPLES); - - tuplesort_performsort(tuplesort); - - /* Report that we are now writing new heap */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); - - for (;;) - { - HeapTuple tuple; - - CHECK_FOR_INTERRUPTS(); - - tuple = tuplesort_getheaptuple(tuplesort, true); - if (tuple == NULL) - break; - - n_tuples += 1; - reform_and_rewrite_tuple(tuple, - OldHeap, NewHeap, - values, isnull, - rwstate); - /* Report n_tuples */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, - n_tuples); - } - - tuplesort_end(tuplesort); - } - - /* Write out any remaining tuples, and fsync if needed */ - end_tdeheap_rewrite(rwstate); - - /* Clean up */ - pfree(values); - pfree(isnull); -} - -static bool -pg_tdeam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, - BufferAccessStrategy bstrategy) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - - /* - * We must maintain a pin on the target page's buffer to ensure that - * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from - * under us. Hence, pin the page until we are done looking at it. 
We - * also choose to hold sharelock on the buffer throughout --- we could - * release and re-acquire sharelock for each tuple, but since we aren't - * doing much work per tuple, the extra lock traffic is probably better - * avoided. - */ - hscan->rs_cblock = blockno; - hscan->rs_cindex = FirstOffsetNumber; - hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, - blockno, RBM_NORMAL, bstrategy); - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - - /* in heap all blocks can contain tuples, so always return true */ - return true; -} - -static bool -pg_tdeam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, - double *liverows, double *deadrows, - TupleTableSlot *slot) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - Page targpage; - OffsetNumber maxoffset; - BufferHeapTupleTableSlot *hslot; - - Assert(TTS_IS_TDE_BUFFERTUPLE(slot)); - - hslot = (BufferHeapTupleTableSlot *) slot; - targpage = BufferGetPage(hscan->rs_cbuf); - maxoffset = PageGetMaxOffsetNumber(targpage); - - /* Inner loop over all tuples on the selected page */ - for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++) - { - ItemId itemid; - HeapTuple targtuple = &hslot->base.tupdata; - bool sample_it = false; - - itemid = PageGetItemId(targpage, hscan->rs_cindex); - - /* - * We ignore unused and redirect line pointers. DEAD line pointers - * should be counted as dead, because we need vacuum to run to get rid - * of them. Note that this rule agrees with the way that - * tdeheap_page_prune() counts things. - */ - if (!ItemIdIsNormal(itemid)) - { - if (ItemIdIsDead(itemid)) - *deadrows += 1; - continue; - } - - ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex); - - targtuple->t_tableOid = RelationGetRelid(scan->rs_rd); - targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); - targtuple->t_len = ItemIdGetLength(itemid); - - switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, - hscan->rs_cbuf)) - { - case HEAPTUPLE_LIVE: - sample_it = true; - *liverows += 1; - break; - - case HEAPTUPLE_DEAD: - case HEAPTUPLE_RECENTLY_DEAD: - /* Count dead and recently-dead rows */ - *deadrows += 1; - break; - - case HEAPTUPLE_INSERT_IN_PROGRESS: - - /* - * Insert-in-progress rows are not counted. We assume that - * when the inserting transaction commits or aborts, it will - * send a stats message to increment the proper count. This - * works right only if that transaction ends after we finish - * analyzing the table; if things happen in the other order, - * its stats update will be overwritten by ours. However, the - * error will be large only if the other transaction runs long - * enough to insert many tuples, so assuming it will finish - * after us is the safer option. - * - * A special case is that the inserting transaction might be - * our own. In this case we should count and sample the row, - * to accommodate users who load a table and analyze it in one - * transaction. (pgstat_report_analyze has to adjust the - * numbers we report to the cumulative stats system to make - * this come out right.) - */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data))) - { - sample_it = true; - *liverows += 1; - } - break; - - case HEAPTUPLE_DELETE_IN_PROGRESS: - - /* - * We count and sample delete-in-progress rows the same as - * live ones, so that the stats counters come out right if the - * deleting transaction commits after us, per the same - * reasoning given above. 
- * - * If the delete was done by our own transaction, however, we - * must count the row as dead to make pgstat_report_analyze's - * stats adjustments come out right. (Note: this works out - * properly when the row was both inserted and deleted in our - * xact.) - * - * The net effect of these choices is that we act as though an - * IN_PROGRESS transaction hasn't happened yet, except if it - * is our own transaction, which we assume has happened. - * - * This approach ensures that we behave sanely if we see both - * the pre-image and post-image rows for a row being updated - * by a concurrent transaction: we will sample the pre-image - * but not the post-image. We also get sane results if the - * concurrent transaction never commits. - */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data))) - *deadrows += 1; - else - { - sample_it = true; - *liverows += 1; - } - break; - - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - - if (sample_it) - { - PGTdeExecStoreBufferHeapTuple(scan->rs_rd, targtuple, slot, hscan->rs_cbuf); - hscan->rs_cindex++; - - /* note that we leave the buffer locked here! */ - return true; - } - } - - /* Now release the lock and pin on the page */ - UnlockReleaseBuffer(hscan->rs_cbuf); - hscan->rs_cbuf = InvalidBuffer; - TdeSlotForgetDecryptedTuple(slot); - /* also prevent old slot contents from having pin on page */ - ExecClearTuple(slot); - - return false; -} - -static double -pg_tdeam_index_build_range_scan(Relation heapRelation, - Relation indexRelation, - IndexInfo *indexInfo, - bool allow_sync, - bool anyvisible, - bool progress, - BlockNumber start_blockno, - BlockNumber numblocks, - IndexBuildCallback callback, - void *callback_state, - TableScanDesc scan) -{ - HeapScanDesc hscan; - bool is_system_catalog; - bool checking_uniqueness; - HeapTuple heapTuple; - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - double reltuples; - ExprState *predicate; - TupleTableSlot *slot; - EState *estate; - ExprContext *econtext; - Snapshot snapshot; - bool need_unregister_snapshot = false; - TransactionId OldestXmin; - BlockNumber previous_blkno = InvalidBlockNumber; - BlockNumber root_blkno = InvalidBlockNumber; - OffsetNumber root_offsets[MaxHeapTuplesPerPage]; - - /* - * sanity checks - */ - Assert(OidIsValid(indexRelation->rd_rel->relam)); - - /* Remember if it's a system catalog */ - is_system_catalog = IsSystemRelation(heapRelation); - - /* See whether we're verifying uniqueness/exclusion properties */ - checking_uniqueness = (indexInfo->ii_Unique || - indexInfo->ii_ExclusionOps != NULL); - - /* - * "Any visible" mode is not compatible with uniqueness checks; make sure - * only one of those is requested. - */ - Assert(!(anyvisible && checking_uniqueness)); - - /* - * Need an EState for evaluation of index expressions and partial-index - * predicates. Also a slot to hold the current tuple. - */ - estate = CreateExecutorState(); - econtext = GetPerTupleExprContext(estate); - slot = table_slot_create(heapRelation, NULL); - - /* Arrange for econtext's scan tuple to be the tuple under test */ - econtext->ecxt_scantuple = slot; - - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - - /* - * Prepare for scan of the base relation. In a normal index build, we use - * SnapshotAny because we must retrieve all tuples and do our own time - * qual checks (because we have to index RECENTLY_DEAD tuples). 
In a - * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. - */ - OldestXmin = InvalidTransactionId; - - /* okay to ignore lazy VACUUMs here */ - if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) - OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); - - if (!scan) - { - /* - * Serial index build. - * - * Must begin our own heap scan in this case. We may also need to - * register a snapshot whose lifetime is under our direct control. - */ - if (!TransactionIdIsValid(OldestXmin)) - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - need_unregister_snapshot = true; - } - else - snapshot = SnapshotAny; - - scan = table_beginscan_strat(heapRelation, /* relation */ - snapshot, /* snapshot */ - 0, /* number of keys */ - NULL, /* scan key */ - true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? */ - } - else - { - /* - * Parallel index build. - * - * Parallel case never registers/unregisters own snapshot. Snapshot - * is taken from parallel heap scan, and is SnapshotAny or an MVCC - * snapshot, based on same criteria as serial case. - */ - Assert(!IsBootstrapProcessingMode()); - Assert(allow_sync); - snapshot = scan->rs_snapshot; - } - - hscan = (HeapScanDesc) scan; - - /* - * Must have called GetOldestNonRemovableTransactionId() if using - * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially - * worth checking this for parallel builds, since ambuild routines that - * support parallel builds must work these details out for themselves.) - */ - Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); - Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : - !TransactionIdIsValid(OldestXmin)); - Assert(snapshot == SnapshotAny || !anyvisible); - - /* Publish number of blocks to scan */ - if (progress) - { - BlockNumber nblocks; - - if (hscan->rs_base.rs_parallel != NULL) - { - ParallelBlockTableScanDesc pbscan; - - pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; - nblocks = pbscan->phs_nblocks; - } - else - nblocks = hscan->rs_nblocks; - - pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, - nblocks); - } - - /* set our scan endpoints */ - if (!allow_sync) - tdeheap_setscanlimits(scan, start_blockno, numblocks); - else - { - /* syncscan can only be requested on whole relation */ - Assert(start_blockno == 0); - Assert(numblocks == InvalidBlockNumber); - } - - reltuples = 0; - - /* - * Scan all tuples in the base relation. - */ - while ((heapTuple = tdeheap_getnext(scan, ForwardScanDirection)) != NULL) - { - bool tupleIsAlive; - - CHECK_FOR_INTERRUPTS(); - - /* Report scan progress, if asked to. */ - if (progress) - { - BlockNumber blocks_done = pg_tdeam_scan_get_blocks_done(hscan); - - if (blocks_done != previous_blkno) - { - pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, - blocks_done); - previous_blkno = blocks_done; - } - } - - /* - * When dealing with a HOT-chain of updated tuples, we want to index - * the values of the live tuple (if any), but index it under the TID - * of the chain's root tuple. This approach is necessary to preserve - * the HOT-chain structure in the heap. So we need to be able to find - * the root item offset for every tuple that's in a HOT-chain. When - * first reaching a new page of the relation, call - * tdeheap_get_root_tuples() to build a map of root item offsets on the - * page. - * - * It might look unsafe to use this information across buffer - * lock/unlock. 
However, we hold ShareLock on the table so no - * ordinary insert/update/delete should occur; and we hold pin on the - * buffer continuously while visiting the page, so no pruning - * operation can occur either. - * - * In cases with only ShareUpdateExclusiveLock on the table, it's - * possible for some HOT tuples to appear that we didn't know about - * when we first read the page. To handle that case, we re-obtain the - * list of root offsets when a HOT tuple points to a root item that we - * don't know about. - * - * Also, although our opinions about tuple liveness could change while - * we scan the page (due to concurrent transaction commits/aborts), - * the chain root locations won't, so this info doesn't need to be - * rebuilt after waiting for another transaction. - * - * Note the implied assumption that there is no more than one live - * tuple per HOT-chain --- else we could create more than one index - * entry pointing to the same root tuple. - */ - if (hscan->rs_cblock != root_blkno) - { - Page page = BufferGetPage(hscan->rs_cbuf); - - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - tdeheap_get_root_tuples(page, root_offsets); - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - root_blkno = hscan->rs_cblock; - } - - if (snapshot == SnapshotAny) - { - /* do our own time qual check */ - bool indexIt; - TransactionId xwait; - - recheck: - - /* - * We could possibly get away with not locking the buffer here, - * since caller should hold ShareLock on the relation, but let's - * be conservative about it. (This remark is still correct even - * with HOT-pruning: our pin on the buffer prevents pruning.) - */ - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - - /* - * The criteria for counting a tuple as live in this block need to - * match what analyze.c's pg_tdeam_scan_analyze_next_tuple() does, - * otherwise CREATE INDEX and ANALYZE may produce wildly different - * reltuples values, e.g. when there are many recently-dead - * tuples. - */ - switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, - hscan->rs_cbuf)) - { - case HEAPTUPLE_DEAD: - /* Definitely dead, we can ignore it */ - indexIt = false; - tupleIsAlive = false; - break; - case HEAPTUPLE_LIVE: - /* Normal case, index and unique-check it */ - indexIt = true; - tupleIsAlive = true; - /* Count it as live, too */ - reltuples += 1; - break; - case HEAPTUPLE_RECENTLY_DEAD: - - /* - * If tuple is recently deleted then we must index it - * anyway to preserve MVCC semantics. (Pre-existing - * transactions could try to use the index after we finish - * building it, and may need to see such tuples.) - * - * However, if it was HOT-updated then we must only index - * the live tuple at the end of the HOT-chain. Since this - * breaks semantics for pre-existing snapshots, mark the - * index as unusable for them. - * - * We don't count recently-dead tuples in reltuples, even - * if we index them; see pg_tdeam_scan_analyze_next_tuple(). - */ - if (HeapTupleIsHotUpdated(heapTuple)) - { - indexIt = false; - /* mark the index as unsafe for old snapshots */ - indexInfo->ii_BrokenHotChain = true; - } - else - indexIt = true; - /* In any case, exclude the tuple from unique-checking */ - tupleIsAlive = false; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - - /* - * In "anyvisible" mode, this tuple is visible and we - * don't need any further checks. 
- */ - if (anyvisible) - { - indexIt = true; - tupleIsAlive = true; - reltuples += 1; - break; - } - - /* - * Since caller should hold ShareLock or better, normally - * the only way to see this is if it was inserted earlier - * in our own transaction. However, it can happen in - * system catalogs, since we tend to release write lock - * before commit there. Give a warning if neither case - * applies. - */ - xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); - if (!TransactionIdIsCurrentTransactionId(xwait)) - { - if (!is_system_catalog) - elog(WARNING, "concurrent insert in progress within table \"%s\"", - RelationGetRelationName(heapRelation)); - - /* - * If we are performing uniqueness checks, indexing - * such a tuple could lead to a bogus uniqueness - * failure. In that case we wait for the inserting - * transaction to finish and check again. - */ - if (checking_uniqueness) - { - /* - * Must drop the lock on the buffer before we wait - */ - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - XactLockTableWait(xwait, heapRelation, - &heapTuple->t_self, - XLTW_InsertIndexUnique); - CHECK_FOR_INTERRUPTS(); - goto recheck; - } - } - else - { - /* - * For consistency with - * pg_tdeam_scan_analyze_next_tuple(), count - * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only - * when inserted by our own transaction. - */ - reltuples += 1; - } - - /* - * We must index such tuples, since if the index build - * commits then they're good. - */ - indexIt = true; - tupleIsAlive = true; - break; - case HEAPTUPLE_DELETE_IN_PROGRESS: - - /* - * As with INSERT_IN_PROGRESS case, this is unexpected - * unless it's our own deletion or a system catalog; but - * in anyvisible mode, this tuple is visible. - */ - if (anyvisible) - { - indexIt = true; - tupleIsAlive = false; - reltuples += 1; - break; - } - - xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); - if (!TransactionIdIsCurrentTransactionId(xwait)) - { - if (!is_system_catalog) - elog(WARNING, "concurrent delete in progress within table \"%s\"", - RelationGetRelationName(heapRelation)); - - /* - * If we are performing uniqueness checks, assuming - * the tuple is dead could lead to missing a - * uniqueness violation. In that case we wait for the - * deleting transaction to finish and check again. - * - * Also, if it's a HOT-updated tuple, we should not - * index it but rather the live tuple at the end of - * the HOT-chain. However, the deleting transaction - * could abort, possibly leaving this tuple as live - * after all, in which case it has to be indexed. The - * only way to know what to do is to wait for the - * deleting transaction to finish and check again. - */ - if (checking_uniqueness || - HeapTupleIsHotUpdated(heapTuple)) - { - /* - * Must drop the lock on the buffer before we wait - */ - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - XactLockTableWait(xwait, heapRelation, - &heapTuple->t_self, - XLTW_InsertIndexUnique); - CHECK_FOR_INTERRUPTS(); - goto recheck; - } - - /* - * Otherwise index it but don't check for uniqueness, - * the same as a RECENTLY_DEAD tuple. - */ - indexIt = true; - - /* - * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, - * if they were not deleted by the current - * transaction. That's what - * pg_tdeam_scan_analyze_next_tuple() does, and we want - * the behavior to be consistent. - */ - reltuples += 1; - } - else if (HeapTupleIsHotUpdated(heapTuple)) - { - /* - * It's a HOT-updated tuple deleted by our own xact. 
- * We can assume the deletion will commit (else the - * index contents don't matter), so treat the same as - * RECENTLY_DEAD HOT-updated tuples. - */ - indexIt = false; - /* mark the index as unsafe for old snapshots */ - indexInfo->ii_BrokenHotChain = true; - } - else - { - /* - * It's a regular tuple deleted by our own xact. Index - * it, but don't check for uniqueness nor count in - * reltuples, the same as a RECENTLY_DEAD tuple. - */ - indexIt = true; - } - /* In any case, exclude the tuple from unique-checking */ - tupleIsAlive = false; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - indexIt = tupleIsAlive = false; /* keep compiler quiet */ - break; - } - - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - if (!indexIt) - continue; - } - else - { - /* tdeheap_getnext did the time qual check */ - tupleIsAlive = true; - reltuples += 1; - } - - MemoryContextReset(econtext->ecxt_per_tuple_memory); - - /* Set up for predicate or expression evaluation */ - PGTdeExecStoreBufferHeapTuple(heapRelation, heapTuple, slot, hscan->rs_cbuf); - - /* - * In a partial index, discard tuples that don't satisfy the - * predicate. - */ - if (predicate != NULL) - { - if (!ExecQual(predicate, econtext)) - continue; - } - - /* - * For the current heap tuple, extract all the attributes we use in - * this index, and note which are null. This also performs evaluation - * of any expressions needed. - */ - FormIndexDatum(indexInfo, - slot, - estate, - values, - isnull); - - /* - * You'd think we should go ahead and build the index tuple here, but - * some index AMs want to do further processing on the data first. So - * pass the values[] and isnull[] arrays, instead. - */ - - if (HeapTupleIsHeapOnly(heapTuple)) - { - /* - * For a heap-only tuple, pretend its TID is that of the root. See - * src/backend/access/heap/README.HOT for discussion. - */ - ItemPointerData tid; - OffsetNumber offnum; - - offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); - - /* - * If a HOT tuple points to a root that we don't know about, - * obtain root items afresh. If that still fails, report it as - * corruption. - */ - if (root_offsets[offnum - 1] == InvalidOffsetNumber) - { - Page page = BufferGetPage(hscan->rs_cbuf); - - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - tdeheap_get_root_tuples(page, root_offsets); - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - } - - if (!OffsetNumberIsValid(root_offsets[offnum - 1])) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", - ItemPointerGetBlockNumber(&heapTuple->t_self), - offnum, - RelationGetRelationName(heapRelation)))); - - ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self), - root_offsets[offnum - 1]); - - /* Call the AM's callback routine to process the tuple */ - callback(indexRelation, &tid, values, isnull, tupleIsAlive, - callback_state); - } - else - { - /* Call the AM's callback routine to process the tuple */ - callback(indexRelation, &heapTuple->t_self, values, isnull, - tupleIsAlive, callback_state); - } - } - - /* Report scan progress one last time. 
*/ - if (progress) - { - BlockNumber blks_done; - - if (hscan->rs_base.rs_parallel != NULL) - { - ParallelBlockTableScanDesc pbscan; - - pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; - blks_done = pbscan->phs_nblocks; - } - else - blks_done = hscan->rs_nblocks; - - pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, - blks_done); - } - - table_endscan(scan); - - /* we can now forget our snapshot, if set and registered by us */ - if (need_unregister_snapshot) - UnregisterSnapshot(snapshot); - - ExecDropSingleTupleTableSlot(slot); - - FreeExecutorState(estate); - - /* These may have been pointing to the now-gone estate */ - indexInfo->ii_ExpressionsState = NIL; - indexInfo->ii_PredicateState = NULL; - - return reltuples; -} - -static void -pg_tdeam_index_validate_scan(Relation heapRelation, - Relation indexRelation, - IndexInfo *indexInfo, - Snapshot snapshot, - ValidateIndexState *state) -{ - TableScanDesc scan; - HeapScanDesc hscan; - HeapTuple heapTuple; - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - ExprState *predicate; - TupleTableSlot *slot; - EState *estate; - ExprContext *econtext; - BlockNumber root_blkno = InvalidBlockNumber; - OffsetNumber root_offsets[MaxHeapTuplesPerPage]; - bool in_index[MaxHeapTuplesPerPage]; - BlockNumber previous_blkno = InvalidBlockNumber; - - /* state variables for the merge */ - ItemPointer indexcursor = NULL; - ItemPointerData decoded; - bool tuplesort_empty = false; - - /* - * sanity checks - */ - Assert(OidIsValid(indexRelation->rd_rel->relam)); - - /* - * Need an EState for evaluation of index expressions and partial-index - * predicates. Also a slot to hold the current tuple. - */ - estate = CreateExecutorState(); - econtext = GetPerTupleExprContext(estate); - slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), - &TTSOpsHeapTuple); - - /* Arrange for econtext's scan tuple to be the tuple under test */ - econtext->ecxt_scantuple = slot; - - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - - /* - * Prepare for scan of the base relation. We need just those tuples - * satisfying the passed-in reference snapshot. We must disable syncscan - * here, because it's critical that we read from block zero forward to - * match the sorted TIDs. - */ - scan = table_beginscan_strat(heapRelation, /* relation */ - snapshot, /* snapshot */ - 0, /* number of keys */ - NULL, /* scan key */ - true, /* buffer access strategy OK */ - false); /* syncscan not OK */ - hscan = (HeapScanDesc) scan; - - pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, - hscan->rs_nblocks); - - /* - * Scan all tuples matching the snapshot. - */ - while ((heapTuple = tdeheap_getnext(scan, ForwardScanDirection)) != NULL) - { - ItemPointer heapcursor = &heapTuple->t_self; - ItemPointerData rootTuple; - OffsetNumber root_offnum; - - CHECK_FOR_INTERRUPTS(); - - state->htups += 1; - - if ((previous_blkno == InvalidBlockNumber) || - (hscan->rs_cblock != previous_blkno)) - { - pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, - hscan->rs_cblock); - previous_blkno = hscan->rs_cblock; - } - - /* - * As commented in table_index_build_scan, we should index heap-only - * tuples under the TIDs of their root tuples; so when we advance onto - * a new heap page, build a map of root item offsets on the page. 
- * - * This complicates merging against the tuplesort output: we will - * visit the live tuples in order by their offsets, but the root - * offsets that we need to compare against the index contents might be - * ordered differently. So we might have to "look back" within the - * tuplesort output, but only within the current page. We handle that - * by keeping a bool array in_index[] showing all the - * already-passed-over tuplesort output TIDs of the current page. We - * clear that array here, when advancing onto a new heap page. - */ - if (hscan->rs_cblock != root_blkno) - { - Page page = BufferGetPage(hscan->rs_cbuf); - - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - tdeheap_get_root_tuples(page, root_offsets); - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - memset(in_index, 0, sizeof(in_index)); - - root_blkno = hscan->rs_cblock; - } - - /* Convert actual tuple TID to root TID */ - rootTuple = *heapcursor; - root_offnum = ItemPointerGetOffsetNumber(heapcursor); - - if (HeapTupleIsHeapOnly(heapTuple)) - { - root_offnum = root_offsets[root_offnum - 1]; - if (!OffsetNumberIsValid(root_offnum)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", - ItemPointerGetBlockNumber(heapcursor), - ItemPointerGetOffsetNumber(heapcursor), - RelationGetRelationName(heapRelation)))); - ItemPointerSetOffsetNumber(&rootTuple, root_offnum); - } - - /* - * "merge" by skipping through the index tuples until we find or pass - * the current root tuple. - */ - while (!tuplesort_empty && - (!indexcursor || - ItemPointerCompare(indexcursor, &rootTuple) < 0)) - { - Datum ts_val; - bool ts_isnull; - - if (indexcursor) - { - /* - * Remember index items seen earlier on the current heap page - */ - if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) - in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; - } - - tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, - false, &ts_val, &ts_isnull, - NULL); - Assert(tuplesort_empty || !ts_isnull); - if (!tuplesort_empty) - { - itemptr_decode(&decoded, DatumGetInt64(ts_val)); - indexcursor = &decoded; - } - else - { - /* Be tidy */ - indexcursor = NULL; - } - } - - /* - * If the tuplesort has overshot *and* we didn't see a match earlier, - * then this tuple is missing from the index, so insert it. - */ - if ((tuplesort_empty || - ItemPointerCompare(indexcursor, &rootTuple) > 0) && - !in_index[root_offnum - 1]) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); - - /* Set up for predicate or expression evaluation */ - ExecStoreHeapTuple(heapTuple, slot, false); - - /* - * In a partial index, discard tuples that don't satisfy the - * predicate. - */ - if (predicate != NULL) - { - if (!ExecQual(predicate, econtext)) - continue; - } - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. This also performs - * evaluation of any expressions needed. - */ - FormIndexDatum(indexInfo, - slot, - estate, - values, - isnull); - - /* - * You'd think we should go ahead and build the index tuple here, - * but some index AMs want to do further processing on the data - * first. So pass the values[] and isnull[] arrays, instead. - */ - - /* - * If the tuple is already committed dead, you might think we - * could suppress uniqueness checking, but this is no longer true - * in the presence of HOT, because the insert is actually a proxy - * for a uniqueness check on the whole HOT-chain. 
That is, the - * tuple we have here could be dead because it was already - * HOT-updated, and if so the updating transaction will not have - * thought it should insert index entries. The index AM will - * check the whole HOT-chain and correctly detect a conflict if - * there is one. - */ - - index_insert(indexRelation, - values, - isnull, - &rootTuple, - heapRelation, - indexInfo->ii_Unique ? - UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - false, - indexInfo); - - state->tups_inserted += 1; - } - } - - table_endscan(scan); - - ExecDropSingleTupleTableSlot(slot); - - FreeExecutorState(estate); - - /* These may have been pointing to the now-gone estate */ - indexInfo->ii_ExpressionsState = NIL; - indexInfo->ii_PredicateState = NULL; -} - -/* - * Return the number of blocks that have been read by this scan since - * starting. This is meant for progress reporting rather than be fully - * accurate: in a parallel scan, workers can be concurrently reading blocks - * further ahead than what we report. - */ -static BlockNumber -pg_tdeam_scan_get_blocks_done(HeapScanDesc hscan) -{ - ParallelBlockTableScanDesc bpscan = NULL; - BlockNumber startblock; - BlockNumber blocks_done; - - if (hscan->rs_base.rs_parallel != NULL) - { - bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; - startblock = bpscan->phs_startblock; - } - else - startblock = hscan->rs_startblock; - - /* - * Might have wrapped around the end of the relation, if startblock was - * not zero. - */ - if (hscan->rs_cblock > startblock) - blocks_done = hscan->rs_cblock - startblock; - else - { - BlockNumber nblocks; - - nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks; - blocks_done = nblocks - startblock + - hscan->rs_cblock; - } - - return blocks_done; -} - - -/* ------------------------------------------------------------------------ - * Miscellaneous callbacks for the heap AM - * ------------------------------------------------------------------------ - */ - -/* - * Check to see whether the table needs a TOAST table. It does only if - * (1) there are any toastable attributes, and (2) the maximum length - * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to - * create a toast table for something like "f1 varchar(20)".) - */ -static bool -pg_tdeam_relation_needs_toast_table(Relation rel) -{ - int32 data_length = 0; - bool maxlength_unknown = false; - bool has_toastable_attrs = false; - TupleDesc tupdesc = rel->rd_att; - int32 tuple_length; - int i; - - for (i = 0; i < tupdesc->natts; i++) - { - Form_pg_attribute att = TupleDescAttr(tupdesc, i); - - if (att->attisdropped) - continue; - data_length = att_align_nominal(data_length, att->attalign); - if (att->attlen > 0) - { - /* Fixed-length types are never toastable */ - data_length += att->attlen; - } - else - { - int32 maxlen = type_maximum_size(att->atttypid, - att->atttypmod); - - if (maxlen < 0) - maxlength_unknown = true; - else - data_length += maxlen; - if (att->attstorage != TYPSTORAGE_PLAIN) - has_toastable_attrs = true; - } - } - if (!has_toastable_attrs) - return false; /* nothing to toast? */ - if (maxlength_unknown) - return true; /* any unlimited-length attrs? */ - tuple_length = MAXALIGN(SizeofHeapTupleHeader + - BITMAPLEN(tupdesc->natts)) + - MAXALIGN(data_length); - return (tuple_length > TOAST_TUPLE_THRESHOLD); -} - -/* - * TOAST tables for heap relations are just heap relations. 
- */ -static Oid -pg_tdeam_relation_toast_am(Relation rel) -{ - return rel->rd_rel->relam; -} - - -/* ------------------------------------------------------------------------ - * Planner related callbacks for the heap AM - * ------------------------------------------------------------------------ - */ - -#define HEAP_OVERHEAD_BYTES_PER_TUPLE \ - (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)) -#define HEAP_USABLE_BYTES_PER_PAGE \ - (BLCKSZ - SizeOfPageHeaderData) - -static void -pg_tdeam_estimate_rel_size(Relation rel, int32 *attr_widths, - BlockNumber *pages, double *tuples, - double *allvisfrac) -{ - table_block_relation_estimate_size(rel, attr_widths, pages, - tuples, allvisfrac, - HEAP_OVERHEAD_BYTES_PER_TUPLE, - HEAP_USABLE_BYTES_PER_PAGE); -} - - -/* ------------------------------------------------------------------------ - * Executor related callbacks for the heap AM - * ------------------------------------------------------------------------ - */ - -static bool -pg_tdeam_scan_bitmap_next_block(TableScanDesc scan, - TBMIterateResult *tbmres) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - BlockNumber block = tbmres->blockno; - Buffer buffer; - Snapshot snapshot; - int ntup; - - hscan->rs_cindex = 0; - hscan->rs_ntuples = 0; - - /* - * Ignore any claimed entries past what we think is the end of the - * relation. It may have been extended after the start of our scan (we - * only hold an AccessShareLock, and it could be inserts from this - * backend). We don't take this optimization in SERIALIZABLE isolation - * though, as we need to examine all invisible tuples reachable by the - * index. - */ - if (!IsolationIsSerializable() && block >= hscan->rs_nblocks) - return false; - - /* - * Acquire pin on the target heap page, trading in any pin we held before. - */ - hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, - scan->rs_rd, - block); - hscan->rs_cblock = block; - buffer = hscan->rs_cbuf; - snapshot = scan->rs_snapshot; - - ntup = 0; - - /* - * Prune and repair fragmentation for the whole page, if possible. - */ - tdeheap_page_prune_opt(scan->rs_rd, buffer); - - /* - * We must hold share lock on the buffer content while examining tuple - * visibility. Afterwards, however, the tuples we have found to be - * visible are guaranteed good as long as we hold the buffer pin. - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); - - /* - * We need two separate strategies for lossy and non-lossy cases. - */ - if (tbmres->ntuples >= 0) - { - /* - * Bitmap is non-lossy, so we just look through the offsets listed in - * tbmres; but we have to follow any HOT chain starting at each such - * offset. - */ - int curslot; - - for (curslot = 0; curslot < tbmres->ntuples; curslot++) - { - OffsetNumber offnum = tbmres->offsets[curslot]; - ItemPointerData tid; - HeapTupleData heapTuple; - - ItemPointerSet(&tid, block, offnum); - if (tdeheap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, - &heapTuple, NULL, true)) - hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); - } - } - else - { - /* - * Bitmap is lossy, so we must examine each line pointer on the page. - * But we can ignore HOT chains, since we'll check each tuple anyway. 
- */ - Page page = BufferGetPage(buffer); - OffsetNumber maxoff = PageGetMaxOffsetNumber(page); - OffsetNumber offnum; - - for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) - { - ItemId lp; - HeapTupleData loctup; - bool valid; - - lp = PageGetItemId(page, offnum); - if (!ItemIdIsNormal(lp)) - continue; - loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); - loctup.t_len = ItemIdGetLength(lp); - loctup.t_tableOid = scan->rs_rd->rd_id; - ItemPointerSet(&loctup.t_self, block, offnum); - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - if (valid) - { - hscan->rs_vistuples[ntup++] = offnum; - PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, - HeapTupleHeaderGetXmin(loctup.t_data)); - } - HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, - buffer, snapshot); - } - } - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - Assert(ntup <= MaxHeapTuplesPerPage); - hscan->rs_ntuples = ntup; - - return ntup > 0; -} - -static bool -pg_tdeam_scan_bitmap_next_tuple(TableScanDesc scan, - TBMIterateResult *tbmres, - TupleTableSlot *slot) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - OffsetNumber targoffset; - Page page; - ItemId lp; - - /* - * Out of range? If so, nothing more to look at on this page - */ - if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples) - return false; - - targoffset = hscan->rs_vistuples[hscan->rs_cindex]; - page = BufferGetPage(hscan->rs_cbuf); - lp = PageGetItemId(page, targoffset); - Assert(ItemIdIsNormal(lp)); - - hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); - hscan->rs_ctup.t_len = ItemIdGetLength(lp); - hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; - ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset); - - pgstat_count_tdeheap_fetch(scan->rs_rd); - - /* - * Set up the result slot to point to this tuple. Note that the slot - * acquires a pin on the buffer. - */ - PGTdeExecStoreBufferHeapTuple(scan->rs_rd, &hscan->rs_ctup, - slot, - hscan->rs_cbuf); - - hscan->rs_cindex++; - - return true; -} - -static bool -pg_tdeam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - TsmRoutine *tsm = scanstate->tsmroutine; - BlockNumber blockno; - - /* return false immediately if relation is empty */ - if (hscan->rs_nblocks == 0) - return false; - - if (tsm->NextSampleBlock) - { - blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks); - hscan->rs_cblock = blockno; - } - else - { - /* scanning table sequentially */ - - if (hscan->rs_cblock == InvalidBlockNumber) - { - Assert(!hscan->rs_inited); - blockno = hscan->rs_startblock; - } - else - { - Assert(hscan->rs_inited); - - blockno = hscan->rs_cblock + 1; - - if (blockno >= hscan->rs_nblocks) - { - /* wrap to beginning of rel, might not have started at 0 */ - blockno = 0; - } - - /* - * Report our new scan position for synchronization purposes. - * - * Note: we do this before checking for end of scan so that the - * final state of the position hint is back at the start of the - * rel. That's not strictly necessary, but otherwise when you run - * the same query multiple times the starting position would shift - * a little bit backwards on every invocation, which is confusing. - * We don't guarantee any specific ordering in general, though. 
- */ - if (scan->rs_flags & SO_ALLOW_SYNC) - ss_report_location(scan->rs_rd, blockno); - - if (blockno == hscan->rs_startblock) - { - blockno = InvalidBlockNumber; - } - } - } - - if (!BlockNumberIsValid(blockno)) - { - if (BufferIsValid(hscan->rs_cbuf)) - ReleaseBuffer(hscan->rs_cbuf); - hscan->rs_cbuf = InvalidBuffer; - hscan->rs_cblock = InvalidBlockNumber; - hscan->rs_inited = false; - - return false; - } - - tdeheapgetpage(scan, blockno); - hscan->rs_inited = true; - - return true; -} - -static bool -pg_tdeam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, - TupleTableSlot *slot) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - TsmRoutine *tsm = scanstate->tsmroutine; - BlockNumber blockno = hscan->rs_cblock; - bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0; - - Page page; - bool all_visible; - OffsetNumber maxoffset; - - /* - * When not using pagemode, we must lock the buffer during tuple - * visibility checks. - */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(hscan->rs_cbuf); - all_visible = PageIsAllVisible(page) && - !scan->rs_snapshot->takenDuringRecovery; - maxoffset = PageGetMaxOffsetNumber(page); - - for (;;) - { - OffsetNumber tupoffset; - - CHECK_FOR_INTERRUPTS(); - - /* Ask the tablesample method which tuples to check on this page. */ - tupoffset = tsm->NextSampleTuple(scanstate, - blockno, - maxoffset); - - if (OffsetNumberIsValid(tupoffset)) - { - ItemId itemid; - bool visible; - HeapTuple tuple = &(hscan->rs_ctup); - - /* Skip invalid tuple pointers. */ - itemid = PageGetItemId(page, tupoffset); - if (!ItemIdIsNormal(itemid)) - continue; - - tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple->t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple->t_self), blockno, tupoffset); - - - if (all_visible) - visible = true; - else - visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf, - tuple, tupoffset); - - /* in pagemode, tdeheapgetpage did this for us */ - if (!pagemode) - HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple, - hscan->rs_cbuf, scan->rs_snapshot); - - /* Try next tuple from same page. */ - if (!visible) - continue; - - /* Found visible tuple, return it. */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - PGTdeExecStoreBufferHeapTuple(scan->rs_rd, tuple, slot, hscan->rs_cbuf); - - /* Count successfully-fetched tuples as heap fetches */ - pgstat_count_tdeheap_getnext(scan->rs_rd); - - return true; - } - else - { - /* - * If we get here, it means we've exhausted the items on this page - * and it's time to move to the next. - */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - /* - * Hack: - * The issue is that, The previous call that would have used the same - * TupleTableSlot would have just deleted the memory context for the slot - * and refrained from calling the clear slot function. So, the slot would - * have the non NULL pointer to the decrypted tuple which is now invalid. - * So, we need to explicitly clear the decrypted tuple pointer before - * calling the clear slot function. - */ - TdeSlotForgetDecryptedTuple(slot); - ExecClearTuple(slot); - return false; - } - } - - Assert(0); -} - - -/* ---------------------------------------------------------------------------- - * Helper functions for the above. 
- * ---------------------------------------------------------------------------- - */ - -/* - * Reconstruct and rewrite the given tuple - * - * We cannot simply copy the tuple as-is, for several reasons: - * - * 1. We'd like to squeeze out the values of any dropped columns, both - * to save space and to ensure we have no corner-case failures. (It's - * possible for example that the new table hasn't got a TOAST table - * and so is unable to store any large values of dropped cols.) - * - * 2. The tuple might not even be legal for the new table; this is - * currently only known to happen as an after-effect of ALTER TABLE - * SET WITHOUT OIDS. - * - * So, we must reconstruct the tuple from component Datums. - */ -static void -reform_and_rewrite_tuple(HeapTuple tuple, - Relation OldHeap, Relation NewHeap, - Datum *values, bool *isnull, RewriteState rwstate) -{ - TupleDesc oldTupDesc = RelationGetDescr(OldHeap); - TupleDesc newTupDesc = RelationGetDescr(NewHeap); - HeapTuple copiedTuple; - int i; - - tdeheap_deform_tuple(tuple, oldTupDesc, values, isnull); - - /* Be sure to null out any dropped columns */ - for (i = 0; i < newTupDesc->natts; i++) - { - if (TupleDescAttr(newTupDesc, i)->attisdropped) - isnull[i] = true; - } - - copiedTuple = tdeheap_form_tuple(newTupDesc, values, isnull); - - /* The heap rewrite module does the rest */ - rewrite_tdeheap_tuple(rwstate, tuple, copiedTuple); - - tdeheap_freetuple(copiedTuple); -} - -/* - * Check visibility of the tuple. - */ -static bool -SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, - HeapTuple tuple, - OffsetNumber tupoffset) -{ - HeapScanDesc hscan = (HeapScanDesc) scan; - - if (scan->rs_flags & SO_ALLOW_PAGEMODE) - { - /* - * In pageatatime mode, tdeheapgetpage() already did visibility checks, - * so just look at the info it left in rs_vistuples[]. - * - * We use a binary search over the known-sorted array. Note: we could - * save some effort if we insisted that NextSampleTuple select tuples - * in increasing order, but it's not clear that there would be enough - * gain to justify the restriction. - */ - int start = 0, - end = hscan->rs_ntuples - 1; - - while (start <= end) - { - int mid = (start + end) / 2; - OffsetNumber curoffset = hscan->rs_vistuples[mid]; - - if (tupoffset == curoffset) - return true; - else if (tupoffset < curoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; - } - else - { - /* Otherwise, we have to check the tuple individually. */ - return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, - buffer); - } -} - - -/* ------------------------------------------------------------------------ - * Definition of the heap table access method. 
- * ------------------------------------------------------------------------ - */ - -static const TableAmRoutine pg_tdeam_methods = { - .type = T_TableAmRoutine, - - .slot_callbacks = pg_tdeam_slot_callbacks, - - .scan_begin = tdeheap_beginscan, - .scan_end = tdeheap_endscan, - .scan_rescan = tdeheap_rescan, - .scan_getnextslot = tdeheap_getnextslot, - - .scan_set_tidrange = tdeheap_set_tidrange, - .scan_getnextslot_tidrange = tdeheap_getnextslot_tidrange, - - .parallelscan_estimate = table_block_parallelscan_estimate, - .parallelscan_initialize = table_block_parallelscan_initialize, - .parallelscan_reinitialize = table_block_parallelscan_reinitialize, - - .index_fetch_begin = pg_tdeam_index_fetch_begin, - .index_fetch_reset = pg_tdeam_index_fetch_reset, - .index_fetch_end = pg_tdeam_index_fetch_end, - .index_fetch_tuple = pg_tdeam_index_fetch_tuple, - - .tuple_insert = pg_tdeam_tuple_insert, - .tuple_insert_speculative = pg_tdeam_tuple_insert_speculative, - .tuple_complete_speculative = pg_tdeam_tuple_complete_speculative, - .multi_insert = tdeheap_multi_insert, - .tuple_delete = pg_tdeam_tuple_delete, - .tuple_update = pg_tdeam_tuple_update, - .tuple_lock = pg_tdeam_tuple_lock, - - .tuple_fetch_row_version = pg_tdeam_fetch_row_version, - .tuple_get_latest_tid = tdeheap_get_latest_tid, - .tuple_tid_valid = pg_tdeam_tuple_tid_valid, - .tuple_satisfies_snapshot = pg_tdeam_tuple_satisfies_snapshot, - .index_delete_tuples = tdeheap_index_delete_tuples, - - .relation_set_new_filelocator = pg_tdeam_relation_set_new_filelocator, - .relation_nontransactional_truncate = pg_tdeam_relation_nontransactional_truncate, - .relation_copy_data = pg_tdeam_relation_copy_data, - .relation_copy_for_cluster = pg_tdeam_relation_copy_for_cluster, - .relation_vacuum = tdeheap_vacuum_rel, - .scan_analyze_next_block = pg_tdeam_scan_analyze_next_block, - .scan_analyze_next_tuple = pg_tdeam_scan_analyze_next_tuple, - .index_build_range_scan = pg_tdeam_index_build_range_scan, - .index_validate_scan = pg_tdeam_index_validate_scan, - - .relation_size = table_block_relation_size, - .relation_needs_toast_table = pg_tdeam_relation_needs_toast_table, - .relation_toast_am = pg_tdeam_relation_toast_am, - .relation_fetch_toast_slice = tdeheap_fetch_toast_slice, - - .relation_estimate_size = pg_tdeam_estimate_rel_size, - - .scan_bitmap_next_block = pg_tdeam_scan_bitmap_next_block, - .scan_bitmap_next_tuple = pg_tdeam_scan_bitmap_next_tuple, - .scan_sample_next_block = pg_tdeam_scan_sample_next_block, - .scan_sample_next_tuple = pg_tdeam_scan_sample_next_tuple -}; - -const TableAmRoutine * -GetPGTdeamTableAmRoutine(void) -{ - return &pg_tdeam_methods; -} - -Datum -pg_tdeam_basic_handler(PG_FUNCTION_ARGS) -{ - PG_RETURN_POINTER(&pg_tdeam_methods); -} - -#ifdef PERCONA_FORK -Datum -pg_tdeam_handler(PG_FUNCTION_ARGS) -{ - PG_RETURN_POINTER(GetHeapamTableAmRoutine()); -} -#endif - -bool -is_tdeheap_rel(Relation rel) -{ - return (rel->rd_tableam == (TableAmRoutine *) &pg_tdeam_methods); -} diff --git a/src/access/pg_tdeam_visibility.c b/src/access/pg_tdeam_visibility.c deleted file mode 100644 index c037e30c..00000000 --- a/src/access/pg_tdeam_visibility.c +++ /dev/null @@ -1,1793 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_tdeam_visibility.c - * Tuple visibility rules for tuples stored in heap. 
- * - * NOTE: all the HeapTupleSatisfies routines will update the tuple's - * "hint" status bits if we see that the inserting or deleting transaction - * has now committed or aborted (and it is safe to set the hint bits). - * If the hint bits are changed, MarkBufferDirtyHint is called on - * the passed-in buffer. The caller must hold not only a pin, but at least - * shared buffer content lock on the buffer containing the tuple. - * - * NOTE: When using a non-MVCC snapshot, we must check - * TransactionIdIsInProgress (which looks in the PGPROC array) before - * TransactionIdDidCommit (which look in pg_xact). Otherwise we have a race - * condition: we might decide that a just-committed transaction crashed, - * because none of the tests succeed. xact.c is careful to record - * commit/abort in pg_xact before it unsets MyProc->xid in the PGPROC array. - * That fixes that problem, but it also means there is a window where - * TransactionIdIsInProgress and TransactionIdDidCommit will both return true. - * If we check only TransactionIdDidCommit, we could consider a tuple - * committed when a later GetSnapshotData call will still think the - * originating transaction is in progress, which leads to application-level - * inconsistency. The upshot is that we gotta check TransactionIdIsInProgress - * first in all code paths, except for a few cases where we are looking at - * subtransactions of our own main transaction and so there can't be any race - * condition. - * - * We can't use TransactionIdDidAbort here because it won't treat transactions - * that were in progress during a crash as aborted. We determine that - * transactions aborted/crashed through process of elimination instead. - * - * When using an MVCC snapshot, we rely on XidInMVCCSnapshot rather than - * TransactionIdIsInProgress, but the logic is otherwise the same: do not - * check pg_xact until after deciding that the xact is no longer in progress. - * - * - * Summary of visibility functions: - * - * HeapTupleSatisfiesMVCC() - * visible to supplied snapshot, excludes current command - * HeapTupleSatisfiesUpdate() - * visible to instant snapshot, with user-supplied command - * counter and more complex result - * HeapTupleSatisfiesSelf() - * visible to instant snapshot and current command - * HeapTupleSatisfiesDirty() - * like HeapTupleSatisfiesSelf(), but includes open transactions - * HeapTupleSatisfiesVacuum() - * visible to any running transaction, used by VACUUM - * HeapTupleSatisfiesNonVacuumable() - * Snapshot-style API for HeapTupleSatisfiesVacuum - * HeapTupleSatisfiesToast() - * visible unless part of interrupted vacuum, used for TOAST - * HeapTupleSatisfiesAny() - * all tuples are visible - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * src/backend/access/heap/pg_tdeam_visibility.c - * - *------------------------------------------------------------------------- - */ - -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include "access/pg_tdeam.h" - -#include "access/htup_details.h" -#include "access/multixact.h" -#include "access/subtrans.h" -#include "access/tableam.h" -#include "access/transam.h" -#include "access/xact.h" -#include "access/xlog.h" -#include "storage/bufmgr.h" -#include "storage/procarray.h" -#include "utils/builtins.h" -#include "utils/combocid.h" -#include "utils/snapmgr.h" - - -/* - * SetHintBits() - * - * Set commit/abort hint bits on a tuple, if appropriate at this time. 
- * - * It is only safe to set a transaction-committed hint bit if we know the - * transaction's commit record is guaranteed to be flushed to disk before the - * buffer, or if the table is temporary or unlogged and will be obliterated by - * a crash anyway. We cannot change the LSN of the page here, because we may - * hold only a share lock on the buffer, so we can only use the LSN to - * interlock this if the buffer's LSN already is newer than the commit LSN; - * otherwise we have to just refrain from setting the hint bit until some - * future re-examination of the tuple. - * - * We can always set hint bits when marking a transaction aborted. (Some - * code in pg_tdeam.c relies on that!) - * - * Also, if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then - * we can always set the hint bits, since pre-9.0 VACUUM FULL always used - * synchronous commits and didn't move tuples that weren't previously - * hinted. (This is not known by this subroutine, but is applied by its - * callers.) Note: old-style VACUUM FULL is gone, but we have to keep this - * module's support for MOVED_OFF/MOVED_IN flag bits for as long as we - * support in-place update from pre-9.0 databases. - * - * Normal commits may be asynchronous, so for those we need to get the LSN - * of the transaction and then check whether this is flushed. - * - * The caller should pass xid as the XID of the transaction to check, or - * InvalidTransactionId if no check is needed. - */ -static inline void -SetHintBits(HeapTupleHeader tuple, Buffer buffer, - uint16 infomask, TransactionId xid) -{ - if (TransactionIdIsValid(xid)) - { - /* NB: xid must be known committed here! */ - XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); - - if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) && - BufferGetLSNAtomic(buffer) < commitLSN) - { - /* not flushed and no LSN interlock, so don't set hint */ - return; - } - } - - tuple->t_infomask |= infomask; - MarkBufferDirtyHint(buffer, true); -} - -/* - * HeapTupleSetHintBits --- exported version of SetHintBits() - * - * This must be separate because of C99's brain-dead notions about how to - * implement inline functions. - */ -void -HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, - uint16 infomask, TransactionId xid) -{ - SetHintBits(tuple, buffer, infomask, xid); -} - - -/* - * HeapTupleSatisfiesSelf - * True iff heap tuple is valid "for itself". - * - * See SNAPSHOT_MVCC's definition for the intended behaviour. - * - * Note: - * Assumes heap tuple is valid. 
- * - * The satisfaction of "itself" requires the following: - * - * ((Xmin == my-transaction && the row was updated by the current transaction, and - * (Xmax is null it was not deleted - * [|| Xmax != my-transaction)]) [or it was deleted by another transaction] - * || - * - * (Xmin is committed && the row was modified by a committed transaction, and - * (Xmax is null || the row has not been deleted, or - * (Xmax != my-transaction && the row was deleted by another transaction - * Xmax is not committed))) that has not been committed - */ -static bool -HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) -{ - HeapTupleHeader tuple = htup->t_data; - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - return false; - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - return true; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ - return true; - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* updating subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - return true; - else - return false; - } - - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - return false; - } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) - return false; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - return true; - - if (tuple->t_infomask & HEAP_XMAX_COMMITTED) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - return false; /* updated by other */ - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - - xmax = HeapTupleGetUpdateXid(tuple); - - 
/* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsCurrentTransactionId(xmax)) - return false; - if (TransactionIdIsInProgress(xmax)) - return true; - if (TransactionIdDidCommit(xmax)) - return false; - /* it must have aborted or crashed */ - return true; - } - - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - return false; - } - - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) - return true; - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - /* xmax transaction committed */ - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - { - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - return false; -} - -/* - * HeapTupleSatisfiesAny - * Dummy "satisfies" routine: any tuple satisfies SnapshotAny. - */ -static bool -HeapTupleSatisfiesAny(HeapTuple htup, Snapshot snapshot, Buffer buffer) -{ - return true; -} - -/* - * HeapTupleSatisfiesToast - * True iff heap tuple is valid as a TOAST row. - * - * See SNAPSHOT_TOAST's definition for the intended behaviour. - * - * This is a simplified version that only checks for VACUUM moving conditions. - * It's appropriate for TOAST usage because TOAST really doesn't want to do - * its own time qual checks; if you can see the main table row that contains - * a TOAST reference, you should be able to see the TOASTed value. However, - * vacuuming a TOAST table is independent of the main table, and in case such - * a vacuum fails partway through, we'd better do this much checking. - * - * Among other things, this means you can't do UPDATEs of rows in a TOAST - * table. - */ -static bool -HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, - Buffer buffer) -{ - HeapTupleHeader tuple = htup->t_data; - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - return false; - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } - - /* - * An invalid Xmin can be left behind by a speculative insertion that - * is canceled by super-deleting the tuple. This also applies to - * TOAST tuples created during speculative insertion. 
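A side note on the hint-bit pattern used throughout the routines above: the transaction status is looked up only when the infomask does not already record a verdict, and SetHintBits() caches the answer for later readers. The sketch below is a minimal, self-contained illustration of that caching idea only; the MiniTuple struct, the XMIN_* flag values, and the clog_did_commit() stub are invented stand-ins, not pg_tde or PostgreSQL definitions, and the real SetHintBits() additionally worries about WAL flush order before dirtying the page.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define XMIN_COMMITTED 0x0001   /* hypothetical stand-ins for HEAP_XMIN_* */
    #define XMIN_INVALID   0x0002

    typedef struct
    {
        uint32_t xmin;       /* inserting transaction id */
        uint16_t infomask;   /* hint bits live here */
    } MiniTuple;

    /* stand-in for the commit-log lookup (TransactionIdDidCommit in the real code) */
    static bool
    clog_did_commit(uint32_t xid)
    {
        return (xid % 2) == 0;
    }

    static bool
    xmin_committed(MiniTuple *tup)
    {
        if (tup->infomask & XMIN_INVALID)
            return false;               /* cached "aborted" verdict */
        if (tup->infomask & XMIN_COMMITTED)
            return true;                /* cached "committed" verdict */

        if (clog_did_commit(tup->xmin))
        {
            tup->infomask |= XMIN_COMMITTED;   /* roughly what SetHintBits() does */
            return true;
        }
        tup->infomask |= XMIN_INVALID;
        return false;
    }

    int
    main(void)
    {
        MiniTuple t = { .xmin = 42, .infomask = 0 };

        printf("first check:  %d (consults the stub clog)\n", xmin_committed(&t));
        printf("second check: %d (answered from the hint bit)\n", xmin_committed(&t));
        return 0;
    }

Compiled on its own (e.g. cc -std=c99), the second call prints the same answer without repeating the lookup, which is the whole point of the hint bits.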
- */ - else if (!TransactionIdIsValid(HeapTupleHeaderGetXmin(tuple))) - return false; - } - - /* otherwise assume the tuple is valid for TOAST. */ - return true; -} - -/* - * HeapTupleSatisfiesUpdate - * - * This function returns a more detailed result code than most of the - * functions in this file, since UPDATE needs to know more than "is it - * visible?". It also allows for user-supplied CommandId rather than - * relying on CurrentCommandId. - * - * The possible return codes are: - * - * TM_Invisible: the tuple didn't exist at all when the scan started, e.g. it - * was created by a later CommandId. - * - * TM_Ok: The tuple is valid and visible, so it may be updated. - * - * TM_SelfModified: The tuple was updated by the current transaction, after - * the current scan started. - * - * TM_Updated: The tuple was updated by a committed transaction (including - * the case where the tuple was moved into a different partition). - * - * TM_Deleted: The tuple was deleted by a committed transaction. - * - * TM_BeingModified: The tuple is being updated by an in-progress transaction - * other than the current transaction. (Note: this includes the case where - * the tuple is share-locked by a MultiXact, even if the MultiXact includes - * the current transaction. Callers that want to distinguish that case must - * test for it themselves.) - */ -TM_Result -HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, - Buffer buffer) -{ - HeapTupleHeader tuple = htup->t_data; - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - return TM_Invisible; - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return TM_Invisible; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return TM_Invisible; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return TM_Invisible; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return TM_Invisible; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - if (HeapTupleHeaderGetCmin(tuple) >= curcid) - return TM_Invisible; /* inserted after scan started */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - return TM_Ok; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - { - TransactionId xmax; - - xmax = HeapTupleHeaderGetRawXmax(tuple); - - /* - * Careful here: even though this tuple was created by our own - * transaction, it might be locked by other transactions, if - * the original version was key-share locked when we updated - * it. - */ - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - if (MultiXactIdIsRunning(xmax, true)) - return TM_BeingModified; - else - return TM_Ok; - } - - /* - * If the locker is gone, then there is nothing of interest - * left in this Xmax; otherwise, report the tuple as - * locked/updated. 
- */ - if (!TransactionIdIsInProgress(xmax)) - return TM_Ok; - return TM_BeingModified; - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* deleting subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - { - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), - false)) - return TM_BeingModified; - return TM_Ok; - } - else - { - if (HeapTupleHeaderGetCmax(tuple) >= curcid) - return TM_SelfModified; /* updated after scan started */ - else - return TM_Invisible; /* updated before scan started */ - } - } - - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return TM_Ok; - } - - if (HeapTupleHeaderGetCmax(tuple) >= curcid) - return TM_SelfModified; /* updated after scan started */ - else - return TM_Invisible; /* updated before scan started */ - } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) - return TM_Invisible; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return TM_Invisible; - } - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - return TM_Ok; - - if (tuple->t_infomask & HEAP_XMAX_COMMITTED) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return TM_Ok; - if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) - return TM_Updated; /* updated by other */ - else - return TM_Deleted; /* deleted by other */ - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - if (HEAP_LOCKED_UPGRADED(tuple->t_infomask)) - return TM_Ok; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - { - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), true)) - return TM_BeingModified; - - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); - return TM_Ok; - } - - xmax = HeapTupleGetUpdateXid(tuple); - if (!TransactionIdIsValid(xmax)) - { - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) - return TM_BeingModified; - } - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsCurrentTransactionId(xmax)) - { - if (HeapTupleHeaderGetCmax(tuple) >= curcid) - return TM_SelfModified; /* updated after scan started */ - else - return TM_Invisible; /* updated before scan started */ - } - - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) - return TM_BeingModified; - - if (TransactionIdDidCommit(xmax)) - { - if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) - return TM_Updated; - else - return TM_Deleted; - } - - /* - * By here, the update in the Xmax is either aborted or crashed, but - * what about the other members? - */ - - if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) - { - /* - * There's no member, even just a locker, alive anymore, so we can - * mark the Xmax as invalid. 
- */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return TM_Ok; - } - else - { - /* There are lockers running */ - return TM_BeingModified; - } - } - - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return TM_BeingModified; - if (HeapTupleHeaderGetCmax(tuple) >= curcid) - return TM_SelfModified; /* updated after scan started */ - else - return TM_Invisible; /* updated before scan started */ - } - - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) - return TM_BeingModified; - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return TM_Ok; - } - - /* xmax transaction committed */ - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - { - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return TM_Ok; - } - - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) - return TM_Updated; /* updated by other */ - else - return TM_Deleted; /* deleted by other */ -} - -/* - * HeapTupleSatisfiesDirty - * True iff heap tuple is valid including effects of open transactions. - * - * See SNAPSHOT_DIRTY's definition for the intended behaviour. - * - * This is essentially like HeapTupleSatisfiesSelf as far as effects of - * the current transaction and committed/aborted xacts are concerned. - * However, we also include the effects of other xacts still in progress. - * - * A special hack is that the passed-in snapshot struct is used as an - * output argument to return the xids of concurrent xacts that affected the - * tuple. snapshot->xmin is set to the tuple's xmin if that is another - * transaction that's still in progress; or to InvalidTransactionId if the - * tuple's xmin is committed good, committed dead, or my own xact. - * Similarly for snapshot->xmax and the tuple's xmax. If the tuple was - * inserted speculatively, meaning that the inserter might still back down - * on the insertion without aborting the whole transaction, the associated - * token is also returned in snapshot->speculativeToken. 
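For orientation, here is a hypothetical, self-contained driver suggesting how a caller might branch on the result codes documented for HeapTupleSatisfiesUpdate() above. The MiniTMResult enum and the reactions in react() are illustrative only; they mirror the code names but are not the real TM_Result from the table AM headers, and actual callers handle several of these cases in more nuanced, caller-specific ways.

    #include <stdio.h>

    typedef enum
    {
        MINI_TM_OK,
        MINI_TM_INVISIBLE,
        MINI_TM_SELF_MODIFIED,
        MINI_TM_UPDATED,
        MINI_TM_DELETED,
        MINI_TM_BEING_MODIFIED
    } MiniTMResult;

    /* roughly the kind of decision an update-style caller makes for each verdict */
    static const char *
    react(MiniTMResult r)
    {
        switch (r)
        {
            case MINI_TM_OK:             return "apply the update";
            case MINI_TM_INVISIBLE:      return "error: tuple was never visible to this command";
            case MINI_TM_SELF_MODIFIED:  return "already changed by this command; caller-specific handling";
            case MINI_TM_UPDATED:        return "chase t_ctid to the newer version, or report a conflict";
            case MINI_TM_DELETED:        return "row was deleted by a committed xact; nothing to do";
            case MINI_TM_BEING_MODIFIED: return "wait for the concurrent xact, then retry the check";
        }
        return "unreachable";
    }

    int
    main(void)
    {
        for (int r = MINI_TM_OK; r <= MINI_TM_BEING_MODIFIED; r++)
            printf("%d -> %s\n", r, react((MiniTMResult) r));
        return 0;
    }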
- */ -static bool -HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, - Buffer buffer) -{ - HeapTupleHeader tuple = htup->t_data; - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - snapshot->xmin = snapshot->xmax = InvalidTransactionId; - snapshot->speculativeToken = 0; - - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - return false; - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - return true; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ - return true; - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* updating subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - return true; - else - return false; - } - - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - return false; - } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) - { - /* - * Return the speculative token to caller. Caller can worry about - * xmax, since it requires a conclusively locked row version, and - * a concurrent update to this tuple is a conflict of its - * purposes. - */ - if (HeapTupleHeaderIsSpeculative(tuple)) - { - snapshot->speculativeToken = - HeapTupleHeaderGetSpeculativeToken(tuple); - - Assert(snapshot->speculativeToken != 0); - } - - snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple); - /* XXX shouldn't we fall through to look at xmax? 
*/ - return true; /* in insertion by other */ - } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - return true; - - if (tuple->t_infomask & HEAP_XMAX_COMMITTED) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - return false; /* updated by other */ - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsCurrentTransactionId(xmax)) - return false; - if (TransactionIdIsInProgress(xmax)) - { - snapshot->xmax = xmax; - return true; - } - if (TransactionIdDidCommit(xmax)) - return false; - /* it must have aborted or crashed */ - return true; - } - - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - return false; - } - - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) - { - if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple); - return true; - } - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - /* xmax transaction committed */ - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - { - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - return false; /* updated by other */ -} - -/* - * HeapTupleSatisfiesMVCC - * True iff heap tuple is valid for the given MVCC snapshot. - * - * See SNAPSHOT_MVCC's definition for the intended behaviour. - * - * Notice that here, we will not update the tuple status hint bits if the - * inserting/deleting transaction is still running according to our snapshot, - * even if in reality it's committed or aborted by now. This is intentional. - * Checking the true transaction state would require access to high-traffic - * shared data structures, creating contention we'd rather do without, and it - * would not change the result of our visibility check anyway. The hint bits - * will be updated by the first visitor that has a snapshot new enough to see - * the inserting/deleting transaction as done. In the meantime, the cost of - * leaving the hint bits unset is basically that each HeapTupleSatisfiesMVCC - * call will need to run TransactionIdIsCurrentTransactionId in addition to - * XidInMVCCSnapshot (but it would have to do the latter anyway). In the old - * coding where we tried to set the hint bits as soon as possible, we instead - * did TransactionIdIsInProgress in each call --- to no avail, as long as the - * inserting/deleting transaction was still running --- which was more cycles - * and more contention on ProcArrayLock. 
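The MVCC logic above and below hinges on XidInMVCCSnapshot() as the test for "still running as far as my snapshot is concerned". The standalone sketch below shows only that core idea with invented, simplified types (MiniSnapshot, plain uint32_t xids); it deliberately ignores the commit log, subtransactions, hint bits, command IDs, and xid wraparound, all of which the real code has to handle.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct
    {
        uint32_t xmin;        /* all xids < xmin had finished when the snapshot was taken */
        uint32_t xmax;        /* all xids >= xmax had not yet started */
        const uint32_t *xip;  /* xids in [xmin, xmax) that were still running */
        int xcnt;
    } MiniSnapshot;

    static bool
    xid_in_snapshot(uint32_t xid, const MiniSnapshot *snap)
    {
        if (xid >= snap->xmax)
            return true;          /* started after the snapshot */
        if (xid < snap->xmin)
            return false;         /* finished before the snapshot */
        for (int i = 0; i < snap->xcnt; i++)
            if (snap->xip[i] == xid)
                return true;      /* was still in progress */
        return false;
    }

    /* simplification: any xid not "in" the snapshot is assumed to have committed */
    static bool
    mini_mvcc_visible(uint32_t t_xmin, uint32_t t_xmax, const MiniSnapshot *snap)
    {
        if (xid_in_snapshot(t_xmin, snap))
            return false;                      /* inserter not yet visible to us */
        if (t_xmax == 0)
            return true;                       /* never deleted */
        return xid_in_snapshot(t_xmax, snap);  /* deletion not visible yet => row still is */
    }

    int
    main(void)
    {
        const uint32_t running[] = { 105 };
        MiniSnapshot snap = { .xmin = 100, .xmax = 110, .xip = running, .xcnt = 1 };

        printf("%d\n", mini_mvcc_visible(103, 0, &snap));   /* 1: inserted before snapshot, never deleted */
        printf("%d\n", mini_mvcc_visible(103, 105, &snap)); /* 1: deleter 105 was still running */
        printf("%d\n", mini_mvcc_visible(105, 0, &snap));   /* 0: inserter was still running */
        return 0;
    }

The real routine additionally distinguishes the current transaction's own changes by command ID, which is why curcid appears in the code that follows.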
- */ -static bool -HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, - Buffer buffer) -{ - HeapTupleHeader tuple = htup->t_data; - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - return false; - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!XidInMVCCSnapshot(xvac, snapshot)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshot(xvac, snapshot)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) - return false; /* inserted after scan started */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - return true; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ - return true; - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* updating subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - return true; - else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - return true; /* updated after scan started */ - else - return false; /* updated before scan started */ - } - - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - return true; /* deleted after scan started */ - else - return false; /* deleted before scan started */ - } - else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) - return false; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - else - { - /* xmin is committed, but maybe not according to our snapshot */ - if (!HeapTupleHeaderXminFrozen(tuple) && - XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) - return false; /* treat as still in progress */ - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - return true; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - /* already checked above */ - Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); - - xmax = 
HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsCurrentTransactionId(xmax)) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - return true; /* deleted after scan started */ - else - return false; /* deleted before scan started */ - } - if (XidInMVCCSnapshot(xmax, snapshot)) - return true; - if (TransactionIdDidCommit(xmax)) - return false; /* updating transaction committed */ - /* it must have aborted or crashed */ - return true; - } - - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - { - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - return true; /* deleted after scan started */ - else - return false; /* deleted before scan started */ - } - - if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) - return true; - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; - } - - /* xmax transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - } - else - { - /* xmax is committed, but maybe not according to our snapshot */ - if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) - return true; /* treat as still in progress */ - } - - /* xmax transaction committed */ - - return false; -} - - -/* - * HeapTupleSatisfiesVacuum - * - * Determine the status of tuples for VACUUM purposes. Here, what - * we mainly want to know is if a tuple is potentially visible to *any* - * running transaction. If so, it can't be removed yet by VACUUM. - * - * OldestXmin is a cutoff XID (obtained from - * GetOldestNonRemovableTransactionId()). Tuples deleted by XIDs >= - * OldestXmin are deemed "recently dead"; they might still be visible to some - * open transaction, so we can't remove them, even if we see that the deleting - * transaction has committed. - */ -HTSV_Result -HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, - Buffer buffer) -{ - TransactionId dead_after = InvalidTransactionId; - HTSV_Result res; - - res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); - - if (res == HEAPTUPLE_RECENTLY_DEAD) - { - Assert(TransactionIdIsValid(dead_after)); - - if (TransactionIdPrecedes(dead_after, OldestXmin)) - res = HEAPTUPLE_DEAD; - } - else - Assert(!TransactionIdIsValid(dead_after)); - - return res; -} - -/* - * Work horse for HeapTupleSatisfiesVacuum and similar routines. - * - * In contrast to HeapTupleSatisfiesVacuum this routine, when encountering a - * tuple that could still be visible to some backend, stores the xid that - * needs to be compared with the horizon in *dead_after, and returns - * HEAPTUPLE_RECENTLY_DEAD. The caller then can perform the comparison with - * the horizon. This is e.g. useful when comparing with different horizons. - * - * Note: HEAPTUPLE_DEAD can still be returned here, e.g. if the inserting - * transaction aborted. - */ -HTSV_Result -HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after) -{ - HeapTupleHeader tuple = htup->t_data; - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - Assert(dead_after != NULL); - - *dead_after = InvalidTransactionId; - - /* - * Has inserting transaction committed? 
- * - * If the inserting transaction aborted, then the tuple was never visible - * to any other transaction, so we can delete it immediately. - */ - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - return HEAPTUPLE_DEAD; - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - if (TransactionIdIsInProgress(xvac)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HEAPTUPLE_DEAD; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - if (TransactionIdIsInProgress(xvac)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HEAPTUPLE_DEAD; - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - return HEAPTUPLE_INSERT_IN_PROGRESS; - /* only locked? run infomask-only check first, for performance */ - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tuple)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - /* inserted and then deleted by same xact */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple))) - return HEAPTUPLE_DELETE_IN_PROGRESS; - /* deleting subtransaction must have aborted */ - return HEAPTUPLE_INSERT_IN_PROGRESS; - } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) - { - /* - * It'd be possible to discern between INSERT/DELETE in progress - * here by looking at xmax - but that doesn't seem beneficial for - * the majority of callers and even detrimental for some. We'd - * rather have callers look at/wait for xmin than xmax. It's - * always correct to return INSERT_IN_PROGRESS because that's - * what's happening from the view of other backends. - */ - return HEAPTUPLE_INSERT_IN_PROGRESS; - } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* - * Not in Progress, Not Committed, so either Aborted or crashed - */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HEAPTUPLE_DEAD; - } - - /* - * At this point the xmin is known committed, but we might not have - * been able to set the hint bit yet; so we can no longer Assert that - * it's set. - */ - } - - /* - * Okay, the inserter committed, so it was good at some point. Now what - * about the deleting transaction? - */ - if (tuple->t_infomask & HEAP_XMAX_INVALID) - return HEAPTUPLE_LIVE; - - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - { - /* - * "Deleting" xact really only locked it, so the tuple is live in any - * case. However, we should make sure that either XMAX_COMMITTED or - * XMAX_INVALID gets set once the xact is gone, to reduce the costs of - * examining the tuple for future xacts. 
- */ - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - { - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - /* - * If it's a pre-pg_upgrade tuple, the multixact cannot - * possibly be running; otherwise have to check. - */ - if (!HEAP_LOCKED_UPGRADED(tuple->t_infomask) && - MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), - true)) - return HEAPTUPLE_LIVE; - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); - } - else - { - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) - return HEAPTUPLE_LIVE; - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - } - } - - /* - * We don't really care whether xmax did commit, abort or crash. We - * know that xmax did lock the tuple, but it did not and will never - * actually update it. - */ - - return HEAPTUPLE_LIVE; - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax = HeapTupleGetUpdateXid(tuple); - - /* already checked above */ - Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsInProgress(xmax)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - else if (TransactionIdDidCommit(xmax)) - { - /* - * The multixact might still be running due to lockers. Need to - * allow for pruning if below the xid horizon regardless -- - * otherwise we could end up with a tuple where the updater has to - * be removed due to the horizon, but is not pruned away. It's - * not a problem to prune that tuple, because any remaining - * lockers will also be present in newer tuple versions. - */ - *dead_after = xmax; - return HEAPTUPLE_RECENTLY_DEAD; - } - else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) - { - /* - * Not in Progress, Not Committed, so either Aborted or crashed. - * Mark the Xmax as invalid. - */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); - } - - return HEAPTUPLE_LIVE; - } - - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - { - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) - return HEAPTUPLE_DELETE_IN_PROGRESS; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - else - { - /* - * Not in Progress, Not Committed, so either Aborted or crashed - */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return HEAPTUPLE_LIVE; - } - - /* - * At this point the xmax is known committed, but we might not have - * been able to set the hint bit yet; so we can no longer Assert that - * it's set. - */ - } - - /* - * Deleter committed, allow caller to check if it was recent enough that - * some open transactions could still see the tuple. - */ - *dead_after = HeapTupleHeaderGetRawXmax(tuple); - return HEAPTUPLE_RECENTLY_DEAD; -} - - -/* - * HeapTupleSatisfiesNonVacuumable - * - * True if tuple might be visible to some transaction; false if it's - * surely dead to everyone, ie, vacuumable. - * - * See SNAPSHOT_NON_VACUUMABLE's definition for the intended behaviour. - * - * This is an interface to HeapTupleSatisfiesVacuum that's callable via - * HeapTupleSatisfiesSnapshot, so it can be used through a Snapshot. - * snapshot->vistest must have been set up with the horizon to use. 
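Both consumers of HeapTupleSatisfiesVacuumHorizon() -- the OldestXmin-based HeapTupleSatisfiesVacuum() earlier and the GlobalVisTest-based HeapTupleSatisfiesNonVacuumable() described above -- reduce the RECENTLY_DEAD case to one comparison: is the committed deleter older than every xid a live snapshot could still care about? A minimal numeric sketch, with plain integers standing in for TransactionIds and no wraparound handling (the real code uses TransactionIdPrecedes()):

    #include <stdint.h>
    #include <stdio.h>

    typedef enum { MINI_LIVE, MINI_RECENTLY_DEAD, MINI_DEAD } MiniHTSV;

    /* dead_after: xid of the committed deleter (0 = never deleted);
     * oldest_xmin: removal horizon, e.g. from GetOldestNonRemovableTransactionId() */
    static MiniHTSV
    classify(uint32_t dead_after, uint32_t oldest_xmin)
    {
        if (dead_after == 0)
            return MINI_LIVE;            /* never deleted */
        if (dead_after < oldest_xmin)
            return MINI_DEAD;            /* no running xact can still see it: prunable */
        return MINI_RECENTLY_DEAD;       /* some snapshot may still need it: keep */
    }

    int
    main(void)
    {
        uint32_t oldest_xmin = 500;

        printf("%d\n", classify(0,   oldest_xmin));  /* 0: MINI_LIVE */
        printf("%d\n", classify(420, oldest_xmin));  /* 2: MINI_DEAD, safe to prune */
        printf("%d\n", classify(510, oldest_xmin));  /* 1: MINI_RECENTLY_DEAD, must be kept */
        return 0;
    }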
- */ -static bool -HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, - Buffer buffer) -{ - TransactionId dead_after = InvalidTransactionId; - HTSV_Result res; - - res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); - - if (res == HEAPTUPLE_RECENTLY_DEAD) - { - Assert(TransactionIdIsValid(dead_after)); - - if (GlobalVisTestIsRemovableXid(snapshot->vistest, dead_after)) - res = HEAPTUPLE_DEAD; - } - else - Assert(!TransactionIdIsValid(dead_after)); - - return res != HEAPTUPLE_DEAD; -} - - -/* - * HeapTupleIsSurelyDead - * - * Cheaply determine whether a tuple is surely dead to all onlookers. - * We sometimes use this in lieu of HeapTupleSatisfiesVacuum when the - * tuple has just been tested by another visibility routine (usually - * HeapTupleSatisfiesMVCC) and, therefore, any hint bits that can be set - * should already be set. We assume that if no hint bits are set, the xmin - * or xmax transaction is still running. This is therefore faster than - * HeapTupleSatisfiesVacuum, because we consult neither procarray nor CLOG. - * It's okay to return false when in doubt, but we must return true only - * if the tuple is removable. - */ -bool -HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) -{ - HeapTupleHeader tuple = htup->t_data; - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - /* - * If the inserting transaction is marked invalid, then it aborted, and - * the tuple is definitely dead. If it's marked neither committed nor - * invalid, then we assume it's still alive (since the presumption is that - * all relevant hint bits were just set moments ago). - */ - if (!HeapTupleHeaderXminCommitted(tuple)) - return HeapTupleHeaderXminInvalid(tuple); - - /* - * If the inserting transaction committed, but any deleting transaction - * aborted, the tuple is still alive. - */ - if (tuple->t_infomask & HEAP_XMAX_INVALID) - return false; - - /* - * If the XMAX is just a lock, the tuple is still alive. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return false; - - /* - * If the Xmax is a MultiXact, it might be dead or alive, but we cannot - * know without checking pg_multixact. - */ - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - return false; - - /* If deleter isn't known to have committed, assume it's still running. */ - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - return false; - - /* Deleter committed, so tuple is dead if the XID is old enough. */ - return GlobalVisTestIsRemovableXid(vistest, - HeapTupleHeaderGetRawXmax(tuple)); -} - -/* - * Is the tuple really only locked? That is, is it not updated? - * - * It's easy to check just infomask bits if the locker is not a multi; but - * otherwise we need to verify that the updating transaction has not aborted. - * - * This function is here because it follows the same visibility rules laid out - * at the top of this file. - */ -bool -HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) -{ - TransactionId xmax; - - /* if there's no valid Xmax, then there's obviously no update either */ - if (tuple->t_infomask & HEAP_XMAX_INVALID) - return true; - - if (tuple->t_infomask & HEAP_XMAX_LOCK_ONLY) - return true; - - /* invalid xmax means no update */ - if (!TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple))) - return true; - - /* - * if HEAP_XMAX_LOCK_ONLY is not set and not a multi, then this must - * necessarily have been updated - */ - if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) - return false; - - /* ... 
but if it's a multi, then perhaps the updating Xid aborted. */ - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsCurrentTransactionId(xmax)) - return false; - if (TransactionIdIsInProgress(xmax)) - return false; - if (TransactionIdDidCommit(xmax)) - return false; - - /* - * not current, not in progress, not committed -- must have aborted or - * crashed - */ - return true; -} - -/* - * check whether the transaction id 'xid' is in the pre-sorted array 'xip'. - */ -static bool -TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) -{ - return num > 0 && - bsearch(&xid, xip, num, sizeof(TransactionId), xidComparator) != NULL; -} - -/* - * See the comments for HeapTupleSatisfiesMVCC for the semantics this function - * obeys. - * - * Only usable on tuples from catalog tables! - * - * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support - * reading catalog pages which couldn't have been created in an older version. - * - * We don't set any hint bits in here as it seems unlikely to be beneficial as - * those should already be set by normal access and it seems to be too - * dangerous to do so as the semantics of doing so during timetravel are more - * complicated than when dealing "only" with the present. - */ -static bool -HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, - Buffer buffer) -{ - HeapTupleHeader tuple = htup->t_data; - TransactionId xmin = HeapTupleHeaderGetXmin(tuple); - TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple); - - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - /* inserting transaction aborted */ - if (HeapTupleHeaderXminInvalid(tuple)) - { - Assert(!TransactionIdDidCommit(xmin)); - return false; - } - /* check if it's one of our txids, toplevel is also in there */ - else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt)) - { - bool resolved; - CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple); - CommandId cmax = InvalidCommandId; - - /* - * another transaction might have (tried to) delete this tuple or - * cmin/cmax was stored in a combo CID. So we need to lookup the - * actual values externally. - */ - resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, - htup, buffer, - &cmin, &cmax); - - /* - * If we haven't resolved the combo CID to cmin/cmax, that means we - * have not decoded the combo CID yet. That means the cmin is - * definitely in the future, and we're not supposed to see the tuple - * yet. - * - * XXX This only applies to decoding of in-progress transactions. In - * regular logical decoding we only execute this code at commit time, - * at which point we should have seen all relevant combo CIDs. So - * ideally, we should error out in this case but in practice, this - * won't happen. If we are too worried about this then we can add an - * elog inside ResolveCminCmaxDuringDecoding. - * - * XXX For the streaming case, we can track the largest combo CID - * assigned, and error out based on this (when unable to resolve combo - * CID below that observed maximum value). - */ - if (!resolved) - return false; - - Assert(cmin != InvalidCommandId); - - if (cmin >= snapshot->curcid) - return false; /* inserted after scan started */ - /* fall through */ - } - /* committed before our xmin horizon. Do a normal visibility check. 
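TransactionIdInArray() above is essentially bsearch(3) over the snapshot's pre-sorted xid arrays. A standalone equivalent, with uint32_t standing in for TransactionId and a plain numeric comparator in the spirit of xidComparator:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* plain numeric comparison, as xidComparator does for sorted arrays */
    static int
    cmp_xid(const void *a, const void *b)
    {
        uint32_t xa = *(const uint32_t *) a;
        uint32_t xb = *(const uint32_t *) b;

        if (xa < xb) return -1;
        if (xa > xb) return 1;
        return 0;
    }

    static bool
    xid_in_array(uint32_t xid, const uint32_t *xip, size_t num)
    {
        return num > 0 &&
            bsearch(&xid, xip, num, sizeof(uint32_t), cmp_xid) != NULL;
    }

    int
    main(void)
    {
        const uint32_t subxip[] = { 11, 27, 42, 90 };   /* must already be sorted */

        printf("%d\n", xid_in_array(42, subxip, 4));    /* 1 */
        printf("%d\n", xid_in_array(43, subxip, 4));    /* 0 */
        return 0;
    }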
*/ - else if (TransactionIdPrecedes(xmin, snapshot->xmin)) - { - Assert(!(HeapTupleHeaderXminCommitted(tuple) && - !TransactionIdDidCommit(xmin))); - - /* check for hint bit first, consult clog afterwards */ - if (!HeapTupleHeaderXminCommitted(tuple) && - !TransactionIdDidCommit(xmin)) - return false; - /* fall through */ - } - /* beyond our xmax horizon, i.e. invisible */ - else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax)) - { - return false; - } - /* check if it's a committed transaction in [xmin, xmax) */ - else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt)) - { - /* fall through */ - } - - /* - * none of the above, i.e. between [xmin, xmax) but hasn't committed. I.e. - * invisible. - */ - else - { - return false; - } - - /* at this point we know xmin is visible, go on to check xmax */ - - /* xid invalid or aborted */ - if (tuple->t_infomask & HEAP_XMAX_INVALID) - return true; - /* locked tuples are always visible */ - else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - return true; - - /* - * We can see multis here if we're looking at user tables or if somebody - * SELECT ... FOR SHARE/UPDATE a system table. - */ - else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - xmax = HeapTupleGetUpdateXid(tuple); - } - - /* check if it's one of our txids, toplevel is also in there */ - if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt)) - { - bool resolved; - CommandId cmin; - CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple); - - /* Lookup actual cmin/cmax values */ - resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, - htup, buffer, - &cmin, &cmax); - - /* - * If we haven't resolved the combo CID to cmin/cmax, that means we - * have not decoded the combo CID yet. That means the cmax is - * definitely in the future, and we're still supposed to see the - * tuple. - * - * XXX This only applies to decoding of in-progress transactions. In - * regular logical decoding we only execute this code at commit time, - * at which point we should have seen all relevant combo CIDs. So - * ideally, we should error out in this case but in practice, this - * won't happen. If we are too worried about this then we can add an - * elog inside ResolveCminCmaxDuringDecoding. - * - * XXX For the streaming case, we can track the largest combo CID - * assigned, and error out based on this (when unable to resolve combo - * CID below that observed maximum value). - */ - if (!resolved || cmax == InvalidCommandId) - return true; - - if (cmax >= snapshot->curcid) - return true; /* deleted after scan started */ - else - return false; /* deleted before scan started */ - } - /* below xmin horizon, normal transaction state is valid */ - else if (TransactionIdPrecedes(xmax, snapshot->xmin)) - { - Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED && - !TransactionIdDidCommit(xmax))); - - /* check hint bit first */ - if (tuple->t_infomask & HEAP_XMAX_COMMITTED) - return false; - - /* check clog */ - return !TransactionIdDidCommit(xmax); - } - /* above xmax horizon, we cannot possibly see the deleting transaction */ - else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax)) - return true; - /* xmax is between [xmin, xmax), check known committed array */ - else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt)) - return false; - /* xmax is between [xmin, xmax), but known not to have committed yet */ - else - return true; -} - -/* - * HeapTupleSatisfiesVisibility - * True iff heap tuple satisfies a time qual. 
- * - * Notes: - * Assumes heap tuple is valid, and buffer at least share locked. - * - * Hint bits in the HeapTuple's t_infomask may be updated as a side effect; - * if so, the indicated buffer is marked dirty. - */ -bool -HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) -{ - switch (snapshot->snapshot_type) - { - case SNAPSHOT_MVCC: - return HeapTupleSatisfiesMVCC(htup, snapshot, buffer); - case SNAPSHOT_SELF: - return HeapTupleSatisfiesSelf(htup, snapshot, buffer); - case SNAPSHOT_ANY: - return HeapTupleSatisfiesAny(htup, snapshot, buffer); - case SNAPSHOT_TOAST: - return HeapTupleSatisfiesToast(htup, snapshot, buffer); - case SNAPSHOT_DIRTY: - return HeapTupleSatisfiesDirty(htup, snapshot, buffer); - case SNAPSHOT_HISTORIC_MVCC: - return HeapTupleSatisfiesHistoricMVCC(htup, snapshot, buffer); - case SNAPSHOT_NON_VACUUMABLE: - return HeapTupleSatisfiesNonVacuumable(htup, snapshot, buffer); - } - - return false; /* keep compiler quiet */ -} diff --git a/src/access/pg_tdetoast.c b/src/access/pg_tdetoast.c deleted file mode 100644 index 6b4d45d5..00000000 --- a/src/access/pg_tdetoast.c +++ /dev/null @@ -1,1262 +0,0 @@ -/*------------------------------------------------------------------------- - * - * heaptoast.c - * Heap-specific definitions for external and compressed storage - * of variable size attributes. - * - * Copyright (c) 2000-2023, PostgreSQL Global Development Group - * - * - * IDENTIFICATION - * src/backend/access/heap/heaptoast.c - * - * - * INTERFACE ROUTINES - * tdeheap_toast_insert_or_update - - * Try to make a given tuple fit into one page by compressing - * or moving off attributes - * - * tdeheap_toast_delete - - * Reclaim toast storage when a tuple is deleted - * - *------------------------------------------------------------------------- - */ -#include "pg_tde_defines.h" - -#include "postgres.h" - -#include "access/pg_tdeam.h" -#include "access/pg_tdetoast.h" - -#include "access/detoast.h" -#include "access/genam.h" -#include "access/toast_helper.h" -#include "access/toast_internals.h" -#include "miscadmin.h" -#include "utils/fmgroids.h" -#include "utils/snapmgr.h" -#include "encryption/enc_tde.h" - -#define TDE_TOAST_COMPRESS_HEADER_SIZE (VARHDRSZ_COMPRESSED - VARHDRSZ) - -static void tdeheap_toast_tuple_externalize(ToastTupleContext *ttc, - int attribute, int options); -static Datum tdeheap_toast_save_datum(Relation rel, Datum value, - struct varlena *oldexternal, - int options); -static void tdeheap_toast_encrypt(Pointer dval, Oid valueid, RelKeyData *keys); -static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); -static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); - - -/* ---------- - * tdeheap_toast_delete - - * - * Cascaded delete toast-entries on DELETE - * ---------- - */ -void -tdeheap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative) -{ - TupleDesc tupleDesc; - Datum toast_values[MaxHeapAttributeNumber]; - bool toast_isnull[MaxHeapAttributeNumber]; - - /* - * We should only ever be called for tuples of plain relations or - * materialized views --- recursing on a toast rel is bad news. - */ - Assert(rel->rd_rel->relkind == RELKIND_RELATION || - rel->rd_rel->relkind == RELKIND_MATVIEW); - - /* - * Get the tuple descriptor and break down the tuple into fields. - * - * NOTE: it's debatable whether to use tdeheap_deform_tuple() here or just - * tdeheap_getattr() only the varlena columns. The latter could win if there - * are few varlena columns and many non-varlena ones. 
However, - * tdeheap_deform_tuple costs only O(N) while the tdeheap_getattr way would cost - * O(N^2) if there are many varlena columns, so it seems better to err on - * the side of linear cost. (We won't even be here unless there's at - * least one varlena column, by the way.) - */ - tupleDesc = rel->rd_att; - - Assert(tupleDesc->natts <= MaxHeapAttributeNumber); - tdeheap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull); - - /* Do the real work. */ - toast_delete_external(rel, toast_values, toast_isnull, is_speculative); -} - - -/* ---------- - * tdeheap_toast_insert_or_update - - * - * Delete no-longer-used toast-entries and create new ones to - * make the new tuple fit on INSERT or UPDATE - * - * Inputs: - * newtup: the candidate new tuple to be inserted - * oldtup: the old row version for UPDATE, or NULL for INSERT - * options: options to be passed to tdeheap_insert() for toast rows - * Result: - * either newtup if no toasting is needed, or a palloc'd modified tuple - * that is what should actually get stored - * - * NOTE: neither newtup nor oldtup will be modified. This is a change - * from the pre-8.1 API of this routine. - * ---------- - */ -HeapTuple -tdeheap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, - int options) -{ - HeapTuple result_tuple; - TupleDesc tupleDesc; - int numAttrs; - - Size maxDataLen; - Size hoff; - - bool toast_isnull[MaxHeapAttributeNumber]; - bool toast_oldisnull[MaxHeapAttributeNumber]; - Datum toast_values[MaxHeapAttributeNumber]; - Datum toast_oldvalues[MaxHeapAttributeNumber]; - ToastAttrInfo toast_attr[MaxHeapAttributeNumber]; - ToastTupleContext ttc; - - /* - * Ignore the INSERT_SPECULATIVE option. Speculative insertions/super - * deletions just normally insert/delete the toast values. It seems - * easiest to deal with that here, instead on, potentially, multiple - * callers. - */ - options &= ~HEAP_INSERT_SPECULATIVE; - - /* - * We should only ever be called for tuples of plain relations or - * materialized views --- recursing on a toast rel is bad news. - */ - Assert(rel->rd_rel->relkind == RELKIND_RELATION || - rel->rd_rel->relkind == RELKIND_MATVIEW); - - /* - * Get the tuple descriptor and break down the tuple(s) into fields. 
- */ - tupleDesc = rel->rd_att; - numAttrs = tupleDesc->natts; - - Assert(numAttrs <= MaxHeapAttributeNumber); - tdeheap_deform_tuple(newtup, tupleDesc, toast_values, toast_isnull); - if (oldtup != NULL) - tdeheap_deform_tuple(oldtup, tupleDesc, toast_oldvalues, toast_oldisnull); - - /* ---------- - * Prepare for toasting - * ---------- - */ - ttc.ttc_rel = rel; - ttc.ttc_values = toast_values; - ttc.ttc_isnull = toast_isnull; - if (oldtup == NULL) - { - ttc.ttc_oldvalues = NULL; - ttc.ttc_oldisnull = NULL; - } - else - { - ttc.ttc_oldvalues = toast_oldvalues; - ttc.ttc_oldisnull = toast_oldisnull; - } - ttc.ttc_attr = toast_attr; - toast_tuple_init(&ttc); - - /* ---------- - * Compress and/or save external until data fits into target length - * - * 1: Inline compress attributes with attstorage EXTENDED, and store very - * large attributes with attstorage EXTENDED or EXTERNAL external - * immediately - * 2: Store attributes with attstorage EXTENDED or EXTERNAL external - * 3: Inline compress attributes with attstorage MAIN - * 4: Store attributes with attstorage MAIN external - * ---------- - */ - - /* compute header overhead --- this should match tdeheap_form_tuple() */ - hoff = SizeofHeapTupleHeader; - if ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) - hoff += BITMAPLEN(numAttrs); - hoff = MAXALIGN(hoff); - /* now convert to a limit on the tuple data size */ - maxDataLen = RelationGetToastTupleTarget(rel, TOAST_TUPLE_TARGET) - hoff; - - /* - * Look for attributes with attstorage EXTENDED to compress. Also find - * large attributes with attstorage EXTENDED or EXTERNAL, and store them - * external. - */ - while (tdeheap_compute_data_size(tupleDesc, - toast_values, toast_isnull) > maxDataLen) - { - int biggest_attno; - - biggest_attno = toast_tuple_find_biggest_attribute(&ttc, true, false); - if (biggest_attno < 0) - break; - - /* - * Attempt to compress it inline, if it has attstorage EXTENDED - */ - if (TupleDescAttr(tupleDesc, biggest_attno)->attstorage == TYPSTORAGE_EXTENDED) - toast_tuple_try_compression(&ttc, biggest_attno); - else - { - /* - * has attstorage EXTERNAL, ignore on subsequent compression - * passes - */ - toast_attr[biggest_attno].tai_colflags |= TOASTCOL_INCOMPRESSIBLE; - } - - /* - * If this value is by itself more than maxDataLen (after compression - * if any), push it out to the toast table immediately, if possible. - * This avoids uselessly compressing other fields in the common case - * where we have one long field and several short ones. - * - * XXX maybe the threshold should be less than maxDataLen? - */ - if (toast_attr[biggest_attno].tai_size > maxDataLen && - rel->rd_rel->reltoastrelid != InvalidOid) - tdeheap_toast_tuple_externalize(&ttc, biggest_attno, options); - } - - /* - * Second we look for attributes of attstorage EXTENDED or EXTERNAL that - * are still inline, and make them external. But skip this if there's no - * toast table to push them to. 
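The limit that drives all of the compression/externalization passes above is plain arithmetic: the toast tuple target minus the MAXALIGN'd header (plus a null bitmap when any attribute is null). A self-contained sketch of that computation; the constants below are illustrative values for a default 8 kB build, not authoritative -- in the real code they come from the server headers and from RelationGetToastTupleTarget().

    #include <stdio.h>
    #include <stddef.h>

    /* illustrative constants, not authoritative */
    #define MINI_MAXIMUM_ALIGNOF     8
    #define MINI_SIZEOF_HEAP_HEADER  23      /* like SizeofHeapTupleHeader */
    #define MINI_TOAST_TUPLE_TARGET  2032    /* roughly the default toast tuple target */

    #define MINI_MAXALIGN(x)  (((x) + MINI_MAXIMUM_ALIGNOF - 1) & ~(MINI_MAXIMUM_ALIGNOF - 1))
    #define MINI_BITMAPLEN(n) (((n) + 7) / 8)

    /* mirrors the hoff / maxDataLen computation in the code above */
    static size_t
    max_inline_data(int natts, int has_nulls)
    {
        size_t hoff = MINI_SIZEOF_HEAP_HEADER;

        if (has_nulls)
            hoff += MINI_BITMAPLEN(natts);
        hoff = MINI_MAXALIGN(hoff);

        return MINI_TOAST_TUPLE_TARGET - hoff;
    }

    int
    main(void)
    {
        printf("3 columns, no nulls:    %zu bytes of inline data allowed\n", max_inline_data(3, 0));
        printf("40 columns, with nulls: %zu bytes of inline data allowed\n", max_inline_data(40, 1));
        return 0;
    }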
- */ - while (tdeheap_compute_data_size(tupleDesc, - toast_values, toast_isnull) > maxDataLen && - rel->rd_rel->reltoastrelid != InvalidOid) - { - int biggest_attno; - - biggest_attno = toast_tuple_find_biggest_attribute(&ttc, false, false); - if (biggest_attno < 0) - break; - tdeheap_toast_tuple_externalize(&ttc, biggest_attno, options); - } - - /* - * Round 3 - this time we take attributes with storage MAIN into - * compression - */ - while (tdeheap_compute_data_size(tupleDesc, - toast_values, toast_isnull) > maxDataLen) - { - int biggest_attno; - - biggest_attno = toast_tuple_find_biggest_attribute(&ttc, true, true); - if (biggest_attno < 0) - break; - - toast_tuple_try_compression(&ttc, biggest_attno); - } - - /* - * Finally we store attributes of type MAIN externally. At this point we - * increase the target tuple size, so that MAIN attributes aren't stored - * externally unless really necessary. - */ - maxDataLen = TOAST_TUPLE_TARGET_MAIN - hoff; - - while (tdeheap_compute_data_size(tupleDesc, - toast_values, toast_isnull) > maxDataLen && - rel->rd_rel->reltoastrelid != InvalidOid) - { - int biggest_attno; - - biggest_attno = toast_tuple_find_biggest_attribute(&ttc, false, true); - if (biggest_attno < 0) - break; - - tdeheap_toast_tuple_externalize(&ttc, biggest_attno, options); - } - - /* - * In the case we toasted any values, we need to build a new heap tuple - * with the changed values. - */ - if ((ttc.ttc_flags & TOAST_NEEDS_CHANGE) != 0) - { - HeapTupleHeader olddata = newtup->t_data; - HeapTupleHeader new_data; - int32 new_header_len; - int32 new_data_len; - int32 new_tuple_len; - - /* - * Calculate the new size of the tuple. - * - * Note: we used to assume here that the old tuple's t_hoff must equal - * the new_header_len value, but that was incorrect. The old tuple - * might have a smaller-than-current natts, if there's been an ALTER - * TABLE ADD COLUMN since it was stored; and that would lead to a - * different conclusion about the size of the null bitmap, or even - * whether there needs to be one at all. - */ - new_header_len = SizeofHeapTupleHeader; - if ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) - new_header_len += BITMAPLEN(numAttrs); - new_header_len = MAXALIGN(new_header_len); - new_data_len = tdeheap_compute_data_size(tupleDesc, - toast_values, toast_isnull); - new_tuple_len = new_header_len + new_data_len; - - /* - * Allocate and zero the space needed, and fill HeapTupleData fields. - */ - result_tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + new_tuple_len); - result_tuple->t_len = new_tuple_len; - result_tuple->t_self = newtup->t_self; - result_tuple->t_tableOid = newtup->t_tableOid; - new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE); - result_tuple->t_data = new_data; - - /* - * Copy the existing tuple header, but adjust natts and t_hoff. - */ - memcpy(new_data, olddata, SizeofHeapTupleHeader); - HeapTupleHeaderSetNatts(new_data, numAttrs); - new_data->t_hoff = new_header_len; - - /* Copy over the data, and fill the null bitmap if needed */ - tdeheap_fill_tuple(tupleDesc, - toast_values, - toast_isnull, - (char *) new_data + new_header_len, - new_data_len, - &(new_data->t_infomask), - ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) ? - new_data->t_bits : NULL); - } - else - result_tuple = newtup; - - toast_tuple_cleanup(&ttc); - - return result_tuple; -} - - -/* ---------- - * toast_flatten_tuple - - * - * "Flatten" a tuple to contain no out-of-line toasted fields. - * (This does not eliminate compressed or short-header datums.) 
- * - * Note: we expect the caller already checked HeapTupleHasExternal(tup), - * so there is no need for a short-circuit path. - * ---------- - */ -HeapTuple -toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc) -{ - HeapTuple new_tuple; - int numAttrs = tupleDesc->natts; - int i; - Datum toast_values[MaxTupleAttributeNumber]; - bool toast_isnull[MaxTupleAttributeNumber]; - bool toast_free[MaxTupleAttributeNumber]; - - /* - * Break down the tuple into fields. - */ - Assert(numAttrs <= MaxTupleAttributeNumber); - tdeheap_deform_tuple(tup, tupleDesc, toast_values, toast_isnull); - - memset(toast_free, 0, numAttrs * sizeof(bool)); - - for (i = 0; i < numAttrs; i++) - { - /* - * Look at non-null varlena attributes - */ - if (!toast_isnull[i] && TupleDescAttr(tupleDesc, i)->attlen == -1) - { - struct varlena *new_value; - - new_value = (struct varlena *) DatumGetPointer(toast_values[i]); - if (VARATT_IS_EXTERNAL(new_value)) - { - new_value = detoast_external_attr(new_value); - toast_values[i] = PointerGetDatum(new_value); - toast_free[i] = true; - } - } - } - - /* - * Form the reconfigured tuple. - */ - new_tuple = tdeheap_form_tuple(tupleDesc, toast_values, toast_isnull); - - /* - * Be sure to copy the tuple's identity fields. We also make a point of - * copying visibility info, just in case anybody looks at those fields in - * a syscache entry. - */ - new_tuple->t_self = tup->t_self; - new_tuple->t_tableOid = tup->t_tableOid; - - new_tuple->t_data->t_choice = tup->t_data->t_choice; - new_tuple->t_data->t_ctid = tup->t_data->t_ctid; - new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; - new_tuple->t_data->t_infomask |= - tup->t_data->t_infomask & HEAP_XACT_MASK; - new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; - new_tuple->t_data->t_infomask2 |= - tup->t_data->t_infomask2 & HEAP2_XACT_MASK; - - /* - * Free allocated temp values - */ - for (i = 0; i < numAttrs; i++) - if (toast_free[i]) - pfree(DatumGetPointer(toast_values[i])); - - return new_tuple; -} - - -/* ---------- - * toast_flatten_tuple_to_datum - - * - * "Flatten" a tuple containing out-of-line toasted fields into a Datum. - * The result is always palloc'd in the current memory context. - * - * We have a general rule that Datums of container types (rows, arrays, - * ranges, etc) must not contain any external TOAST pointers. Without - * this rule, we'd have to look inside each Datum when preparing a tuple - * for storage, which would be expensive and would fail to extend cleanly - * to new sorts of container types. - * - * However, we don't want to say that tuples represented as HeapTuples - * can't contain toasted fields, so instead this routine should be called - * when such a HeapTuple is being converted into a Datum. - * - * While we're at it, we decompress any compressed fields too. This is not - * necessary for correctness, but reflects an expectation that compression - * will be more effective if applied to the whole tuple not individual - * fields. We are not so concerned about that that we want to deconstruct - * and reconstruct tuples just to get rid of compressed fields, however. - * So callers typically won't call this unless they see that the tuple has - * at least one external field. - * - * On the other hand, in-line short-header varlena fields are left alone. - * If we "untoasted" them here, they'd just get changed back to short-header - * format anyway within tdeheap_fill_tuple. 
- * ---------- - */ -Datum -toast_flatten_tuple_to_datum(HeapTupleHeader tup, - uint32 tup_len, - TupleDesc tupleDesc) -{ - HeapTupleHeader new_data; - int32 new_header_len; - int32 new_data_len; - int32 new_tuple_len; - HeapTupleData tmptup; - int numAttrs = tupleDesc->natts; - int i; - bool has_nulls = false; - Datum toast_values[MaxTupleAttributeNumber]; - bool toast_isnull[MaxTupleAttributeNumber]; - bool toast_free[MaxTupleAttributeNumber]; - - /* Build a temporary HeapTuple control structure */ - tmptup.t_len = tup_len; - ItemPointerSetInvalid(&(tmptup.t_self)); - tmptup.t_tableOid = InvalidOid; - tmptup.t_data = tup; - - /* - * Break down the tuple into fields. - */ - Assert(numAttrs <= MaxTupleAttributeNumber); - tdeheap_deform_tuple(&tmptup, tupleDesc, toast_values, toast_isnull); - - memset(toast_free, 0, numAttrs * sizeof(bool)); - - for (i = 0; i < numAttrs; i++) - { - /* - * Look at non-null varlena attributes - */ - if (toast_isnull[i]) - has_nulls = true; - else if (TupleDescAttr(tupleDesc, i)->attlen == -1) - { - struct varlena *new_value; - - new_value = (struct varlena *) DatumGetPointer(toast_values[i]); - if (VARATT_IS_EXTERNAL(new_value) || - VARATT_IS_COMPRESSED(new_value)) - { - new_value = detoast_attr(new_value); - toast_values[i] = PointerGetDatum(new_value); - toast_free[i] = true; - } - } - } - - /* - * Calculate the new size of the tuple. - * - * This should match the reconstruction code in - * tdeheap_toast_insert_or_update. - */ - new_header_len = SizeofHeapTupleHeader; - if (has_nulls) - new_header_len += BITMAPLEN(numAttrs); - new_header_len = MAXALIGN(new_header_len); - new_data_len = tdeheap_compute_data_size(tupleDesc, - toast_values, toast_isnull); - new_tuple_len = new_header_len + new_data_len; - - new_data = (HeapTupleHeader) palloc0(new_tuple_len); - - /* - * Copy the existing tuple header, but adjust natts and t_hoff. - */ - memcpy(new_data, tup, SizeofHeapTupleHeader); - HeapTupleHeaderSetNatts(new_data, numAttrs); - new_data->t_hoff = new_header_len; - - /* Set the composite-Datum header fields correctly */ - HeapTupleHeaderSetDatumLength(new_data, new_tuple_len); - HeapTupleHeaderSetTypeId(new_data, tupleDesc->tdtypeid); - HeapTupleHeaderSetTypMod(new_data, tupleDesc->tdtypmod); - - /* Copy over the data, and fill the null bitmap if needed */ - tdeheap_fill_tuple(tupleDesc, - toast_values, - toast_isnull, - (char *) new_data + new_header_len, - new_data_len, - &(new_data->t_infomask), - has_nulls ? new_data->t_bits : NULL); - - /* - * Free allocated temp values - */ - for (i = 0; i < numAttrs; i++) - if (toast_free[i]) - pfree(DatumGetPointer(toast_values[i])); - - return PointerGetDatum(new_data); -} - - -/* ---------- - * toast_build_flattened_tuple - - * - * Build a tuple containing no out-of-line toasted fields. - * (This does not eliminate compressed or short-header datums.) - * - * This is essentially just like tdeheap_form_tuple, except that it will - * expand any external-data pointers beforehand. - * - * It's not very clear whether it would be preferable to decompress - * in-line compressed datums while at it. For now, we don't. 
- * ---------- - */ -HeapTuple -toast_build_flattened_tuple(TupleDesc tupleDesc, - Datum *values, - bool *isnull) -{ - HeapTuple new_tuple; - int numAttrs = tupleDesc->natts; - int num_to_free; - int i; - Datum new_values[MaxTupleAttributeNumber]; - Pointer freeable_values[MaxTupleAttributeNumber]; - - /* - * We can pass the caller's isnull array directly to tdeheap_form_tuple, but - * we potentially need to modify the values array. - */ - Assert(numAttrs <= MaxTupleAttributeNumber); - memcpy(new_values, values, numAttrs * sizeof(Datum)); - - num_to_free = 0; - for (i = 0; i < numAttrs; i++) - { - /* - * Look at non-null varlena attributes - */ - if (!isnull[i] && TupleDescAttr(tupleDesc, i)->attlen == -1) - { - struct varlena *new_value; - - new_value = (struct varlena *) DatumGetPointer(new_values[i]); - if (VARATT_IS_EXTERNAL(new_value)) - { - new_value = detoast_external_attr(new_value); - new_values[i] = PointerGetDatum(new_value); - freeable_values[num_to_free++] = (Pointer) new_value; - } - } - } - - /* - * Form the reconfigured tuple. - */ - new_tuple = tdeheap_form_tuple(tupleDesc, new_values, isnull); - - /* - * Free allocated temp values - */ - for (i = 0; i < num_to_free; i++) - pfree(freeable_values[i]); - - return new_tuple; -} - -/* - * Fetch a TOAST slice from a heap table. - * - * toastrel is the relation from which chunks are to be fetched. - * valueid identifies the TOAST value from which chunks are being fetched. - * attrsize is the total size of the TOAST value. - * sliceoffset is the byte offset within the TOAST value from which to fetch. - * slicelength is the number of bytes to be fetched from the TOAST value. - * result is the varlena into which the results should be written. - */ -void -tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, - int32 sliceoffset, int32 slicelength, - struct varlena *result) -{ - Relation *toastidxs; - ScanKeyData toastkey[3]; - TupleDesc toasttupDesc = toastrel->rd_att; - int nscankeys; - SysScanDesc toastscan; - HeapTuple ttup; - int32 expectedchunk; - int32 totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; - int startchunk; - int endchunk; - int num_indexes; - int validIndex; - SnapshotData SnapshotToast; - char decrypted_data[TOAST_MAX_CHUNK_SIZE]; - RelKeyData *key = GetRelationKey(toastrel->rd_locator); - char iv_prefix[16] = {0,}; - - - /* Look for the valid index of toast relation */ - validIndex = toast_open_indexes(toastrel, - AccessShareLock, - &toastidxs, - &num_indexes); - - startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE; - endchunk = (sliceoffset + slicelength - 1) / TOAST_MAX_CHUNK_SIZE; - Assert(endchunk <= totalchunks); - - /* Set up a scan key to fetch from the index. */ - ScanKeyInit(&toastkey[0], - (AttrNumber) 1, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(valueid)); - - /* - * No additional condition if fetching all chunks. Otherwise, use an - * equality condition for one chunk, and a range condition otherwise. 
- */ - if (startchunk == 0 && endchunk == totalchunks - 1) - nscankeys = 1; - else if (startchunk == endchunk) - { - ScanKeyInit(&toastkey[1], - (AttrNumber) 2, - BTEqualStrategyNumber, F_INT4EQ, - Int32GetDatum(startchunk)); - nscankeys = 2; - } - else - { - ScanKeyInit(&toastkey[1], - (AttrNumber) 2, - BTGreaterEqualStrategyNumber, F_INT4GE, - Int32GetDatum(startchunk)); - ScanKeyInit(&toastkey[2], - (AttrNumber) 2, - BTLessEqualStrategyNumber, F_INT4LE, - Int32GetDatum(endchunk)); - nscankeys = 3; - } - - /* Prepare for scan */ - init_toast_snapshot(&SnapshotToast); - toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], - &SnapshotToast, nscankeys, toastkey); - - memcpy(iv_prefix, &valueid, sizeof(Oid)); - - /* - * Read the chunks by index - * - * The index is on (valueid, chunkidx) so they will come in order - */ - expectedchunk = startchunk; - while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) - { - int32 curchunk; - Pointer chunk; - bool isnull; - char *chunkdata; - int32 chunksize; - int32 expected_size; - int32 chcpystrt; - int32 chcpyend; - int32 encrypt_offset; - - /* - * Have a chunk, extract the sequence number and the data - */ - curchunk = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); - Assert(!isnull); - chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); - Assert(!isnull); - if (!VARATT_IS_EXTENDED(chunk)) - { - chunksize = VARSIZE(chunk) - VARHDRSZ; - chunkdata = VARDATA(chunk); - } - else if (VARATT_IS_SHORT(chunk)) - { - /* could happen due to tdeheap_form_tuple doing its thing */ - chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; - chunkdata = VARDATA_SHORT(chunk); - } - else - { - /* should never happen */ - elog(ERROR, "found toasted toast chunk for toast value %u in %s", - valueid, RelationGetRelationName(toastrel)); - chunksize = 0; /* keep compiler quiet */ - chunkdata = NULL; - } - - /* - * Some checks on the data we've found - */ - if (curchunk != expectedchunk) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("unexpected chunk number %d (expected %d) for toast value %u in %s", - curchunk, expectedchunk, valueid, - RelationGetRelationName(toastrel)))); - if (curchunk > endchunk) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("unexpected chunk number %d (out of range %d..%d) for toast value %u in %s", - curchunk, - startchunk, endchunk, valueid, - RelationGetRelationName(toastrel)))); - expected_size = curchunk < totalchunks - 1 ? TOAST_MAX_CHUNK_SIZE - : attrsize - ((totalchunks - 1) * TOAST_MAX_CHUNK_SIZE); - if (chunksize != expected_size) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s", - chunksize, expected_size, - curchunk, totalchunks, valueid, - RelationGetRelationName(toastrel)))); - - /* - * Copy the data into proper place in our result - */ - chcpystrt = 0; - chcpyend = chunksize - 1; - if (curchunk == startchunk) - chcpystrt = sliceoffset % TOAST_MAX_CHUNK_SIZE; - if (curchunk == endchunk) - chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE; - - /* - * If TOAST is compressed, the first TDE_TOAST_COMPRESS_HEADER_SIZE (4 bytes) is - * not encrypted and contains compression info. It should be added to the - * result as it is and the rest should be decrypted. 
Encryption offset in
- * that case will be 0 for the first chunk (despite the encrypted data
- * starting with the offset TDE_TOAST_COMPRESS_HEADER_SIZE, we've encrypted it
- * without compression headers) and `chunk start offset - 4` for the next
- * chunks.
- */
- encrypt_offset = chcpystrt;
- if (VARATT_IS_COMPRESSED(result)) {
- if (curchunk == 0) {
- memcpy(VARDATA(result), chunkdata + chcpystrt, TDE_TOAST_COMPRESS_HEADER_SIZE);
- chcpystrt += TDE_TOAST_COMPRESS_HEADER_SIZE;
- } else {
- encrypt_offset -= TDE_TOAST_COMPRESS_HEADER_SIZE;
- }
- }
- /* Decrypt the data chunk by chunk here */
-
- PG_TDE_DECRYPT_DATA(iv_prefix, (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + encrypt_offset,
- chunkdata + chcpystrt,
- (chcpyend - chcpystrt) + 1,
- decrypted_data, key);
-
- memcpy(VARDATA(result) +
- (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
- decrypted_data,
- (chcpyend - chcpystrt) + 1);
-
- expectedchunk++;
- }
-
- /*
- * Final checks that we successfully fetched the datum
- */
- if (expectedchunk != (endchunk + 1))
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg_internal("missing chunk number %d for toast value %u in %s",
- expectedchunk, valueid,
- RelationGetRelationName(toastrel))));
-
- /* End scan and close indexes. */
- systable_endscan_ordered(toastscan);
- toast_close_indexes(toastidxs, num_indexes, AccessShareLock);
-}
-// TODO: these should be in their own file so we can properly autoupdate them
-/* pg_tde extension */
-static void
-tdeheap_toast_encrypt(Pointer dval, Oid valueid, RelKeyData *key)
-{
- int32 data_size =0;
- char* data_p;
- char* encrypted_data;
- char iv_prefix[16] = {0,};
-
- /*
- * Encryption specific data_p and data_size as we have to avoid
- * encryption of the compression info.
- * See https://github.com/Percona-Lab/pg_tde/commit/dee6e357ef05d217a4c4df131249a80e5e909163
- */
- if (VARATT_IS_SHORT(dval))
- {
- data_p = VARDATA_SHORT(dval);
- data_size = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT;
- }
- else if (VARATT_IS_COMPRESSED(dval))
- {
- data_p = VARDATA_4B_C(dval);
- data_size = VARSIZE(dval) - VARHDRSZ_COMPRESSED;
- }
- else
- {
- data_p = VARDATA(dval);
- data_size = VARSIZE(dval) - VARHDRSZ;
- }
- /* Now encrypt the data and replace it in ttc */
- encrypted_data = (char *)palloc(data_size);
-
- memcpy(iv_prefix, &valueid, sizeof(Oid));
- PG_TDE_ENCRYPT_DATA(iv_prefix, 0, data_p, data_size, encrypted_data, key);
-
- memcpy(data_p, encrypted_data, data_size);
- pfree(encrypted_data);
-}
-
-/*
- * Move an attribute to external storage.
- *
- * copy from PG src/backend/access/table/toast_helper.c
- */
-static void
-tdeheap_toast_tuple_externalize(ToastTupleContext *ttc, int attribute, int options)
-{
- Datum *value = &ttc->ttc_values[attribute];
- Datum old_value = *value;
- ToastAttrInfo *attr = &ttc->ttc_attr[attribute];
-
- attr->tai_colflags |= TOASTCOL_IGNORE;
- *value = tdeheap_toast_save_datum(ttc->ttc_rel, old_value, attr->tai_oldexternal,
- options);
- if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0)
- pfree(DatumGetPointer(old_value));
- attr->tai_colflags |= TOASTCOL_NEEDS_FREE;
- ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE);
-}
-
-/* ----------
- * tdeheap_toast_save_datum -
- *
- * Save one single datum into the secondary relation and return
- * a Datum reference for it.
- * It also encrypts toasted data.
- *
- * rel: the main relation we're working with (not the toast rel!)
- * value: datum to be pushed to toast storage - * oldexternal: if not NULL, toast pointer previously representing the datum - * options: options to be passed to tdeheap_insert() for toast rows - * - * based on toast_save_datum from PG src/backend/access/common/toast_internals.c - * ---------- - */ -static Datum -tdeheap_toast_save_datum(Relation rel, Datum value, - struct varlena *oldexternal, int options) -{ - Relation toastrel; - Relation *toastidxs; - HeapTuple toasttup; - TupleDesc toasttupDesc; - Datum t_values[3]; - bool t_isnull[3]; - CommandId mycid = GetCurrentCommandId(true); - struct varlena *result; - struct varatt_external toast_pointer; - union - { - struct varlena hdr; - /* this is to make the union big enough for a chunk: */ - char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; - /* ensure union is aligned well enough: */ - int32 align_it; - } chunk_data; - int32 chunk_size; - int32 chunk_seq = 0; - char *data_p; - int32 data_todo; - Pointer dval = DatumGetPointer(value); - int num_indexes; - int validIndex; - - - Assert(!VARATT_IS_EXTERNAL(value)); - - /* - * Open the toast relation and its indexes. We can use the index to check - * uniqueness of the OID we assign to the toasted item, even though it has - * additional columns besides OID. - */ - toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); - toasttupDesc = toastrel->rd_att; - - /* Open all the toast indexes and look for the valid one */ - validIndex = toast_open_indexes(toastrel, - RowExclusiveLock, - &toastidxs, - &num_indexes); - - /* - * Get the data pointer and length, and compute va_rawsize and va_extinfo. - * - * va_rawsize is the size of the equivalent fully uncompressed datum, so - * we have to adjust for short headers. - * - * va_extinfo stored the actual size of the data payload in the toast - * records and the compression method in first 2 bits if data is - * compressed. - */ - if (VARATT_IS_SHORT(dval)) - { - data_p = VARDATA_SHORT(dval); - data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; - toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ - toast_pointer.va_extinfo = data_todo; - } - else if (VARATT_IS_COMPRESSED(dval)) - { - data_p = VARDATA(dval); - data_todo = VARSIZE(dval) - VARHDRSZ; - /* rawsize in a compressed datum is just the size of the payload */ - toast_pointer.va_rawsize = VARDATA_COMPRESSED_GET_EXTSIZE(dval) + VARHDRSZ; - - /* set external size and compression method */ - VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, data_todo, - VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval)); - /* Assert that the numbers look like it's compressed */ - Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); - } - else - { - data_p = VARDATA(dval); - data_todo = VARSIZE(dval) - VARHDRSZ; - toast_pointer.va_rawsize = VARSIZE(dval); - toast_pointer.va_extinfo = data_todo; - } - - /* - * Insert the correct table OID into the result TOAST pointer. - * - * Normally this is the actual OID of the target toast table, but during - * table-rewriting operations such as CLUSTER, we have to insert the OID - * of the table's real permanent toast table instead. rd_toastoid is set - * if we have to substitute such an OID. - */ - if (OidIsValid(rel->rd_toastoid)) - toast_pointer.va_toastrelid = rel->rd_toastoid; - else - toast_pointer.va_toastrelid = RelationGetRelid(toastrel); - - /* - * Choose an OID to use as the value ID for this toast value. - * - * Normally we just choose an unused OID within the toast table. 
But
- * during table-rewriting operations where we are preserving an existing
- * toast table OID, we want to preserve toast value OIDs too. So, if
- * rd_toastoid is set and we had a prior external value from that same
- * toast table, re-use its value ID. If we didn't have a prior external
- * value (which is a corner case, but possible if the table's attstorage
- * options have been changed), we have to pick a value ID that doesn't
- * conflict with either new or existing toast value OIDs.
- */
- if (!OidIsValid(rel->rd_toastoid))
- {
- /* normal case: just choose an unused OID */
- toast_pointer.va_valueid =
- GetNewOidWithIndex(toastrel,
- RelationGetRelid(toastidxs[validIndex]),
- (AttrNumber) 1);
- }
- else
- {
- /* rewrite case: check to see if value was in old toast table */
- toast_pointer.va_valueid = InvalidOid;
- if (oldexternal != NULL)
- {
- struct varatt_external old_toast_pointer;
-
- Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal));
- /* Must copy to access aligned fields */
- VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal);
- if (old_toast_pointer.va_toastrelid == rel->rd_toastoid)
- {
- /* This value came from the old toast table; reuse its OID */
- toast_pointer.va_valueid = old_toast_pointer.va_valueid;
-
- /*
- * There is a corner case here: the table rewrite might have
- * to copy both live and recently-dead versions of a row, and
- * those versions could easily reference the same toast value.
- * When we copy the second or later version of such a row,
- * reusing the OID will mean we select an OID that's already
- * in the new toast table. Check for that, and if so, just
- * fall through without writing the data again.
- *
- * While annoying and ugly-looking, this is a good thing
- * because it ensures that we wind up with only one copy of
- * the toast value when there is only one copy in the old
- * toast table. Before we detected this case, we'd have made
- * multiple copies, wasting space; and what's worse, the
- * copies belonging to already-deleted heap tuples would not
- * be reclaimed by VACUUM.
- */
- if (toastrel_valueid_exists(toastrel,
- toast_pointer.va_valueid))
- {
- /* Match, so short-circuit the data storage loop below */
- data_todo = 0;
- }
- }
- }
- if (toast_pointer.va_valueid == InvalidOid)
- {
- /*
- * new value; must choose an OID that doesn't conflict in either
- * old or new toast table
- */
- do
- {
- toast_pointer.va_valueid =
- GetNewOidWithIndex(toastrel,
- RelationGetRelid(toastidxs[validIndex]),
- (AttrNumber) 1);
- } while (toastid_valueid_exists(rel->rd_toastoid,
- toast_pointer.va_valueid));
- }
- }
-
- /*
- * Encrypt toast data.
- */
- tdeheap_toast_encrypt(dval, toast_pointer.va_valueid, GetRelationKey(toastrel->rd_locator));
-
- /*
- * Initialize constant parts of the tuple data
- */
- t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid);
- t_values[2] = PointerGetDatum(&chunk_data);
- t_isnull[0] = false;
- t_isnull[1] = false;
- t_isnull[2] = false;
-
- /*
- * Split up the item into chunks
- */
- while (data_todo > 0)
- {
- int i;
-
- CHECK_FOR_INTERRUPTS();
-
- /*
- * Calculate the size of this chunk
- */
- chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo);
-
- /*
- * Build a tuple and store it
- */
- t_values[1] = Int32GetDatum(chunk_seq++);
- SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ);
- memcpy(VARDATA(&chunk_data), data_p, chunk_size);
- toasttup = tdeheap_form_tuple(toasttupDesc, t_values, t_isnull);
-
- /*
- * The tuple should be inserted unencrypted, as the
- * TOAST data is already encrypted.
- */ - options |= HEAP_INSERT_TDE_NO_ENCRYPT; - tdeheap_insert(toastrel, toasttup, mycid, options, NULL); - - /* - * Create the index entry. We cheat a little here by not using - * FormIndexDatum: this relies on the knowledge that the index columns - * are the same as the initial columns of the table for all the - * indexes. We also cheat by not providing an IndexInfo: this is okay - * for now because btree doesn't need one, but we might have to be - * more honest someday. - * - * Note also that there had better not be any user-created index on - * the TOAST table, since we don't bother to update anything else. - */ - for (i = 0; i < num_indexes; i++) - { - /* Only index relations marked as ready can be updated */ - if (toastidxs[i]->rd_index->indisready) - index_insert(toastidxs[i], t_values, t_isnull, - &(toasttup->t_self), - toastrel, - toastidxs[i]->rd_index->indisunique ? - UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - false, NULL); - } - - /* - * Free memory - */ - tdeheap_freetuple(toasttup); - - /* - * Move on to next chunk - */ - data_todo -= chunk_size; - data_p += chunk_size; - } - - /* - * Done - close toast relation and its indexes but keep the lock until - * commit, so as a concurrent reindex done directly on the toast relation - * would be able to wait for this transaction. - */ - toast_close_indexes(toastidxs, num_indexes, NoLock); - table_close(toastrel, NoLock); - - /* - * Create the TOAST pointer value that we'll return - */ - result = (struct varlena *) palloc(TOAST_POINTER_SIZE); - SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); - memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); - - return PointerGetDatum(result); -} - -/* ---------- - * toastrel_valueid_exists - - * - * Test whether a toast value with the given ID exists in the toast relation. - * For safety, we consider a value to exist if there are either live or dead - * toast rows with that ID; see notes for GetNewOidWithIndex(). - * - * copy from PG src/backend/access/common/toast_internals.c - * ---------- - */ -static bool -toastrel_valueid_exists(Relation toastrel, Oid valueid) -{ - bool result = false; - ScanKeyData toastkey; - SysScanDesc toastscan; - int num_indexes; - int validIndex; - Relation *toastidxs; - - /* Fetch a valid index relation */ - validIndex = toast_open_indexes(toastrel, - RowExclusiveLock, - &toastidxs, - &num_indexes); - - /* - * Setup a scan key to find chunks with matching va_valueid - */ - ScanKeyInit(&toastkey, - (AttrNumber) 1, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(valueid)); - - /* - * Is there any such chunk? 
- */ - toastscan = systable_beginscan(toastrel, - RelationGetRelid(toastidxs[validIndex]), - true, SnapshotAny, 1, &toastkey); - - if (systable_getnext(toastscan) != NULL) - result = true; - - systable_endscan(toastscan); - - /* Clean up */ - toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); - - return result; -} - -/* ---------- - * toastid_valueid_exists - - * - * As above, but work from toast rel's OID not an open relation - * - * copy from PG src/backend/access/common/toast_internals.c - * ---------- - */ -static bool -toastid_valueid_exists(Oid toastrelid, Oid valueid) -{ - bool result; - Relation toastrel; - - toastrel = table_open(toastrelid, AccessShareLock); - - result = toastrel_valueid_exists(toastrel, valueid); - - table_close(toastrel, AccessShareLock); - - return result; -} diff --git a/src/include/access/pg_tde_io.h b/src/include/access/pg_tde_io.h deleted file mode 100644 index 4d0a64bc..00000000 --- a/src/include/access/pg_tde_io.h +++ /dev/null @@ -1,62 +0,0 @@ -/*------------------------------------------------------------------------- - * - * tdeheap_io.h - * POSTGRES heap access method input/output definitions. - * - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/include/access/hio.h - * - *------------------------------------------------------------------------- - */ -#ifndef PG_TDE_IO_H -#define PG_TDE_IO_H - -#include "access/htup.h" -#include "storage/buf.h" -#include "utils/relcache.h" - -/* - * state for bulk inserts --- private to heapam.c and hio.c - * - * If current_buf isn't InvalidBuffer, then we are holding an extra pin - * on that buffer. - * - * "typedef struct BulkInsertStateData *BulkInsertState" is in heapam.h - */ -typedef struct BulkInsertStateData -{ - BufferAccessStrategy strategy; /* our BULKWRITE strategy object */ - Buffer current_buf; /* current insertion target page */ - - /* - * State for bulk extensions. - * - * last_free..next_free are further pages that were unused at the time of - * the last extension. They might be in use by the time we use them - * though, so rechecks are needed. - * - * XXX: Eventually these should probably live in RelationData instead, - * alongside targetblock. - * - * already_extended_by is the number of pages that this bulk inserted - * extended by. If we already extended by a significant number of pages, - * we can be more aggressive about extending going forward. 
- */ - BlockNumber next_free; - BlockNumber last_free; - uint32 already_extended_by; -} BulkInsertStateData; - - -extern void tdeheap_RelationPutHeapTuple(Relation relation, Buffer buffer, - HeapTuple tuple, bool encrypt, bool token); -extern Buffer tdeheap_RelationGetBufferForTuple(Relation relation, Size len, - Buffer otherBuffer, int options, - BulkInsertStateData *bistate, - Buffer *vmbuffer, Buffer *vmbuffer_other, - int num_pages); - -#endif /* PG_TDE_IO_H */ diff --git a/src/include/access/pg_tde_rewrite.h b/src/include/access/pg_tde_rewrite.h deleted file mode 100644 index 5285f39c..00000000 --- a/src/include/access/pg_tde_rewrite.h +++ /dev/null @@ -1,57 +0,0 @@ -/*------------------------------------------------------------------------- - * - * tdeheap_rewrite.h - * Declarations for heap rewrite support functions - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994-5, Regents of the University of California - * - * src/include/access/rewriteheap.h - * - *------------------------------------------------------------------------- - */ -#ifndef PG_TDE_REWRITE_H -#define PG_TDE_REWRITE_H - -#include "access/htup.h" -#include "storage/itemptr.h" -#include "storage/relfilelocator.h" -#include "utils/relcache.h" - -/* struct definition is private to rewriteheap.c */ -typedef struct RewriteStateData *RewriteState; - -extern RewriteState begin_tdeheap_rewrite(Relation old_heap, Relation new_heap, - TransactionId oldest_xmin, TransactionId freeze_xid, - MultiXactId cutoff_multi); -extern void end_tdeheap_rewrite(RewriteState state); -extern void rewrite_tdeheap_tuple(RewriteState state, HeapTuple old_tuple, - HeapTuple new_tuple); -extern bool rewrite_tdeheap_dead_tuple(RewriteState state, HeapTuple old_tuple); - -/* - * On-Disk data format for an individual logical rewrite mapping. 
- */ -typedef struct LogicalRewriteMappingData -{ - RelFileLocator old_locator; - RelFileLocator new_locator; - ItemPointerData old_tid; - ItemPointerData new_tid; -} LogicalRewriteMappingData; - -/* --- - * The filename consists of the following, dash separated, - * components: - * 1) database oid or InvalidOid for shared relations - * 2) the oid of the relation - * 3) upper 32bit of the LSN at which a rewrite started - * 4) lower 32bit of the LSN at which a rewrite started - * 5) xid we are mapping for - * 6) xid of the xact performing the mapping - * --- - */ -#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x" -extern void CheckPointLogicalRewriteHeap(void); - -#endif /* PG_TDE_REWRITE_H */ diff --git a/src/include/access/pg_tde_visibilitymap.h b/src/include/access/pg_tde_visibilitymap.h deleted file mode 100644 index 0b8213f0..00000000 --- a/src/include/access/pg_tde_visibilitymap.h +++ /dev/null @@ -1,42 +0,0 @@ -/*------------------------------------------------------------------------- - * - * tdeheap_visibilitymap.h - * visibility map interface - * - * - * Portions Copyright (c) 2007-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/include/access/pg_tde_visibilitymap.h - * - *------------------------------------------------------------------------- - */ -#ifndef PG_TDE_VISIBILITYMAP_H -#define PG_TDE_VISIBILITYMAP_H - -#include "access/visibilitymapdefs.h" -#include "access/xlogdefs.h" -#include "storage/block.h" -#include "storage/buf.h" -#include "utils/relcache.h" - -/* Macros for visibilitymap test */ -#define VM_ALL_VISIBLE(r, b, v) \ - ((tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_VISIBLE) != 0) -#define VM_ALL_FROZEN(r, b, v) \ - ((tdeheap_visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0) - -extern bool tdeheap_visibilitymap_clear(Relation rel, BlockNumber heapBlk, - Buffer vmbuf, uint8 flags); -extern void tdeheap_visibilitymap_pin(Relation rel, BlockNumber heapBlk, - Buffer *vmbuf); -extern bool tdeheap_visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf); -extern void tdeheap_visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, - XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, - uint8 flags); -extern uint8 tdeheap_visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); -extern void tdeheap_visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); -extern BlockNumber tdeheap_visibilitymap_prepare_truncate(Relation rel, - BlockNumber nheapblocks); - -#endif /* PG_TDE_VISIBILITYMAP_H */ diff --git a/src/include/access/pg_tdeam.h b/src/include/access/pg_tdeam.h deleted file mode 100644 index b982c8ff..00000000 --- a/src/include/access/pg_tdeam.h +++ /dev/null @@ -1,339 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_tdeam.h - * POSTGRES heap access method definitions. 
- * - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/include/access/heapam.h - * - *------------------------------------------------------------------------- - */ -#ifndef PG_TDEAM_H -#define PG_TDEAM_H - -#include "access/relation.h" /* for backward compatibility */ -#include "access/relscan.h" -#include "access/sdir.h" -#include "access/skey.h" -#include "access/table.h" /* for backward compatibility */ -#include "access/tableam.h" -#include "nodes/lockoptions.h" -#include "nodes/primnodes.h" -#include "storage/bufpage.h" -#include "storage/dsm.h" -#include "storage/lockdefs.h" -#include "storage/shm_toc.h" -#include "utils/relcache.h" -#include "utils/snapshot.h" - - -/* "options" flag bits for tdeheap_insert */ -#define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM -#define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN -#define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL -#define HEAP_INSERT_SPECULATIVE 0x0010 -#define HEAP_INSERT_TDE_NO_ENCRYPT 0x2000 /* to specify rare cases when NO TDE enc */ - -typedef struct BulkInsertStateData *BulkInsertState; -struct TupleTableSlot; -struct VacuumCutoffs; - -#define MaxLockTupleMode LockTupleExclusive - -/* - * Descriptor for heap table scans. - */ -typedef struct HeapScanDescData -{ - TableScanDescData rs_base; /* AM independent part of the descriptor */ - - /* state set up at initscan time */ - BlockNumber rs_nblocks; /* total number of blocks in rel */ - BlockNumber rs_startblock; /* block # to start at */ - BlockNumber rs_numblocks; /* max number of blocks to scan */ - /* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */ - - /* scan current state */ - bool rs_inited; /* false = scan not init'd yet */ - OffsetNumber rs_coffset; /* current offset # in non-page-at-a-time mode */ - BlockNumber rs_cblock; /* current block # in scan, if any */ - Buffer rs_cbuf; /* current buffer in scan, if any */ - /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ - - BufferAccessStrategy rs_strategy; /* access strategy for reads */ - - HeapTupleData rs_ctup; /* current tuple in scan, if any */ - - /* - * For parallel scans to store page allocation data. NULL when not - * performing a parallel scan. - */ - ParallelBlockTableScanWorkerData *rs_parallelworkerdata; - - /* these fields only used in page-at-a-time mode and for bitmap scans */ - int rs_cindex; /* current tuple's index in vistuples */ - int rs_ntuples; /* number of visible tuples on page */ - OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ -} HeapScanDescData; -typedef struct HeapScanDescData *HeapScanDesc; - -/* - * Descriptor for fetches from heap via an index. 
- */ -typedef struct IndexFetchHeapData -{ - IndexFetchTableData xs_base; /* AM independent part of the descriptor */ - - Buffer xs_cbuf; /* current heap buffer in scan, if any */ - /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ -} IndexFetchHeapData; - -/* Result codes for HeapTupleSatisfiesVacuum */ -typedef enum -{ - HEAPTUPLE_DEAD, /* tuple is dead and deletable */ - HEAPTUPLE_LIVE, /* tuple is live (committed, no deleter) */ - HEAPTUPLE_RECENTLY_DEAD, /* tuple is dead, but not deletable yet */ - HEAPTUPLE_INSERT_IN_PROGRESS, /* inserting xact is still in progress */ - HEAPTUPLE_DELETE_IN_PROGRESS /* deleting xact is still in progress */ -} HTSV_Result; - -/* - * tdeheap_prepare_freeze_tuple may request that tdeheap_freeze_execute_prepared - * check any tuple's to-be-frozen xmin and/or xmax status using pg_xact - */ -#define HEAP_FREEZE_CHECK_XMIN_COMMITTED 0x01 -#define HEAP_FREEZE_CHECK_XMAX_ABORTED 0x02 - -/* tdeheap_prepare_freeze_tuple state describing how to freeze a tuple */ -typedef struct HeapTupleFreeze -{ - /* Fields describing how to process tuple */ - TransactionId xmax; - uint16 t_infomask2; - uint16 t_infomask; - uint8 frzflags; - - /* xmin/xmax check flags */ - uint8 checkflags; - /* Page offset number for tuple */ - OffsetNumber offset; -} HeapTupleFreeze; - -/* - * State used by VACUUM to track the details of freezing all eligible tuples - * on a given heap page. - * - * VACUUM prepares freeze plans for each page via tdeheap_prepare_freeze_tuple - * calls (every tuple with storage gets its own call). This page-level freeze - * state is updated across each call, which ultimately determines whether or - * not freezing the page is required. - * - * Aside from the basic question of whether or not freezing will go ahead, the - * state also tracks the oldest extant XID/MXID in the table as a whole, for - * the purposes of advancing relfrozenxid/relminmxid values in pg_class later - * on. Each tdeheap_prepare_freeze_tuple call pushes NewRelfrozenXid and/or - * NewRelminMxid back as required to avoid unsafe final pg_class values. Any - * and all unfrozen XIDs or MXIDs that remain after VACUUM finishes _must_ - * have values >= the final relfrozenxid/relminmxid values in pg_class. This - * includes XIDs that remain as MultiXact members from any tuple's xmax. - * - * When 'freeze_required' flag isn't set after all tuples are examined, the - * final choice on freezing is made by vacuumlazy.c. It can decide to trigger - * freezing based on whatever criteria it deems appropriate. However, it is - * recommended that vacuumlazy.c avoid early freezing when freezing does not - * enable setting the target page all-frozen in the visibility map afterwards. - */ -typedef struct HeapPageFreeze -{ - /* Is tdeheap_prepare_freeze_tuple caller required to freeze page? */ - bool freeze_required; - - /* - * "Freeze" NewRelfrozenXid/NewRelminMxid trackers. - * - * Trackers used when tdeheap_freeze_execute_prepared freezes, or when there - * are zero freeze plans for a page. It is always valid for vacuumlazy.c - * to freeze any page, by definition. This even includes pages that have - * no tuples with storage to consider in the first place. That way the - * 'totally_frozen' results from tdeheap_prepare_freeze_tuple can always be - * used in the same way, even when no freeze plans need to be executed to - * "freeze the page". Only the "freeze" path needs to consider the need - * to set pages all-frozen in the visibility map under this scheme. 
- * - * When we freeze a page, we generally freeze all XIDs < OldestXmin, only - * leaving behind XIDs that are ineligible for freezing, if any. And so - * you might wonder why these trackers are necessary at all; why should - * _any_ page that VACUUM freezes _ever_ be left with XIDs/MXIDs that - * ratchet back the top-level NewRelfrozenXid/NewRelminMxid trackers? - * - * It is useful to use a definition of "freeze the page" that does not - * overspecify how MultiXacts are affected. tdeheap_prepare_freeze_tuple - * generally prefers to remove Multis eagerly, but lazy processing is used - * in cases where laziness allows VACUUM to avoid allocating a new Multi. - * The "freeze the page" trackers enable this flexibility. - */ - TransactionId FreezePageRelfrozenXid; - MultiXactId FreezePageRelminMxid; - - /* - * "No freeze" NewRelfrozenXid/NewRelminMxid trackers. - * - * These trackers are maintained in the same way as the trackers used when - * VACUUM scans a page that isn't cleanup locked. Both code paths are - * based on the same general idea (do less work for this page during the - * ongoing VACUUM, at the cost of having to accept older final values). - */ - TransactionId NoFreezePageRelfrozenXid; - MultiXactId NoFreezePageRelminMxid; - -} HeapPageFreeze; - -/* ---------------- - * function prototypes for heap access method - * - * tdeheap_create, tdeheap_create_with_catalog, and tdeheap_drop_with_catalog - * are declared in catalog/heap.h - * ---------------- - */ - - -/* - * HeapScanIsValid - * True iff the heap scan is valid. - */ -#define HeapScanIsValid(scan) PointerIsValid(scan) - -extern TableScanDesc tdeheap_beginscan(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key, - ParallelTableScanDesc parallel_scan, - uint32 flags); -extern void tdeheap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, - BlockNumber numBlks); -extern void tdeheapgetpage(TableScanDesc sscan, BlockNumber block); -extern void tdeheap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, - bool allow_strat, bool allow_sync, bool allow_pagemode); -extern void tdeheap_endscan(TableScanDesc sscan); -extern HeapTuple tdeheap_getnext(TableScanDesc sscan, ScanDirection direction); -extern bool tdeheap_getnextslot(TableScanDesc sscan, - ScanDirection direction, struct TupleTableSlot *slot); -extern void tdeheap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, - ItemPointer maxtid); -extern bool tdeheap_getnextslot_tidrange(TableScanDesc sscan, - ScanDirection direction, - TupleTableSlot *slot); -extern bool tdeheap_fetch(Relation relation, Snapshot snapshot, - HeapTuple tuple, Buffer *userbuf, bool keep_buf); -extern bool tdeheap_hot_search_buffer(ItemPointer tid, Relation relation, - Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); - -extern void tdeheap_get_latest_tid(TableScanDesc sscan, ItemPointer tid); - -extern BulkInsertState GetBulkInsertState(void); -extern void FreeBulkInsertState(BulkInsertState); -extern void ReleaseBulkInsertStatePin(BulkInsertState bistate); - -extern void tdeheap_insert(Relation relation, HeapTuple tup, CommandId cid, - int options, BulkInsertState bistate); -extern void tdeheap_multi_insert(Relation relation, struct TupleTableSlot **slots, - int ntuples, CommandId cid, int options, - BulkInsertState bistate); -extern TM_Result tdeheap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - struct TM_FailureData *tmfd, bool changingPart); -extern void 
tdeheap_finish_speculative(Relation relation, ItemPointer tid); -extern void tdeheap_abort_speculative(Relation relation, ItemPointer tid); -extern TM_Result tdeheap_update(Relation relation, ItemPointer otid, - HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, - struct TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); -extern TM_Result tdeheap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, struct TM_FailureData *tmfd); - -extern void tdeheap_inplace_update(Relation relation, HeapTuple tuple); -extern bool tdeheap_prepare_freeze_tuple(HeapTupleHeader tuple, - const struct VacuumCutoffs *cutoffs, - HeapPageFreeze *pagefrz, - HeapTupleFreeze *frz, bool *totally_frozen); -extern void tdeheap_freeze_execute_prepared(Relation rel, Buffer buffer, - TransactionId snapshotConflictHorizon, - HeapTupleFreeze *tuples, int ntuples); -extern bool tdeheap_freeze_tuple(HeapTupleHeader tuple, - TransactionId relfrozenxid, TransactionId relminmxid, - TransactionId FreezeLimit, TransactionId MultiXactCutoff); -extern bool tdeheap_tuple_should_freeze(HeapTupleHeader tuple, - const struct VacuumCutoffs *cutoffs, - TransactionId *NoFreezePageRelfrozenXid, - MultiXactId *NoFreezePageRelminMxid); -extern bool tdeheap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); - -extern void simple_tdeheap_insert(Relation relation, HeapTuple tup); -extern void simple_tdeheap_delete(Relation relation, ItemPointer tid); -extern void simple_tdeheap_update(Relation relation, ItemPointer otid, - HeapTuple tup, TU_UpdateIndexes *update_indexes); - -extern TransactionId tdeheap_index_delete_tuples(Relation rel, - TM_IndexDeleteOp *delstate); - -/* in heap/pruneheap.c */ -struct GlobalVisState; -extern void tdeheap_page_prune_opt(Relation relation, Buffer buffer); -extern int tdeheap_page_prune(Relation relation, Buffer buffer, - struct GlobalVisState *vistest, - TransactionId old_snap_xmin, - TimestampTz old_snap_ts, - int *nnewlpdead, - OffsetNumber *off_loc); -extern void tdeheap_page_prune_execute(Relation rel, Buffer buffer, - OffsetNumber *redirected, int nredirected, - OffsetNumber *nowdead, int ndead, - OffsetNumber *nowunused, int nunused); -extern void tdeheap_get_root_tuples(Page page, OffsetNumber *root_offsets); - -/* in heap/vacuumlazy.c */ -struct VacuumParams; -extern void tdeheap_vacuum_rel(Relation rel, - struct VacuumParams *params, BufferAccessStrategy bstrategy); - -/* in heap/pg_tdeam_visibility.c */ -extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, - Buffer buffer); -extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, - Buffer buffer); -extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, - Buffer buffer); -extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, - TransactionId *dead_after); -extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, - uint16 infomask, TransactionId xid); -extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); -extern bool HeapTupleIsSurelyDead(HeapTuple htup, - struct GlobalVisState *vistest); - -/* - * To avoid leaking too much knowledge about reorderbuffer implementation - * details this is implemented in reorderbuffer.c not pg_tdeam_visibility.c - */ -struct HTAB; -extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, - Snapshot snapshot, - HeapTuple htup, - Buffer 
buffer, - CommandId *cmin, CommandId *cmax); -extern void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, - Buffer buffer, Snapshot snapshot); - -/* Defined in pg_tdeam_handler.c */ -extern bool is_tdeheap_rel(Relation rel); - -const TableAmRoutine * -GetPGTdeamTableAmRoutine(void); - -#endif /* PG_TDEAM_H */ diff --git a/src/include/access/pg_tdeam_xlog.h b/src/include/access/pg_tdeam_xlog.h deleted file mode 100644 index 9f07212c..00000000 --- a/src/include/access/pg_tdeam_xlog.h +++ /dev/null @@ -1,421 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_tdeam_xlog.h - * POSTGRES pg_tde access XLOG definitions. - * - * - * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/include/access/heapam_xlog.h - * - *------------------------------------------------------------------------- - */ -#ifndef PG_TDEAM_XLOG_H -#define PG_TDEAM_XLOG_H - -#include "access/htup.h" -#include "access/xlogreader.h" -#include "lib/stringinfo.h" -#include "storage/buf.h" -#include "storage/bufpage.h" -#include "storage/relfilelocator.h" -#include "utils/relcache.h" - - -/* - * WAL record definitions for pg_tdeam.c's WAL operations - * - * XLOG allows to store some information in high 4 bits of log - * record xl_info field. We use 3 for opcode and one for init bit. - */ -#define XLOG_HEAP_INSERT 0x00 -#define XLOG_HEAP_DELETE 0x10 -#define XLOG_HEAP_UPDATE 0x20 -#define XLOG_HEAP_TRUNCATE 0x30 -#define XLOG_HEAP_HOT_UPDATE 0x40 -#define XLOG_HEAP_CONFIRM 0x50 -#define XLOG_HEAP_LOCK 0x60 -#define XLOG_HEAP_INPLACE 0x70 - -#define XLOG_HEAP_OPMASK 0x70 -/* - * When we insert 1st item on new page in INSERT, UPDATE, HOT_UPDATE, - * or MULTI_INSERT, we can (and we do) restore entire page in redo - */ -#define XLOG_HEAP_INIT_PAGE 0x80 -/* - * We ran out of opcodes, so pg_tdeam.c now has a second RmgrId. These opcodes - * are associated with RM_HEAP2_ID, but are not logically different from - * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to - * these, too. - */ -#define XLOG_HEAP2_REWRITE 0x00 -#define XLOG_HEAP2_PRUNE 0x10 -#define XLOG_HEAP2_VACUUM 0x20 -#define XLOG_HEAP2_FREEZE_PAGE 0x30 -#define XLOG_HEAP2_VISIBLE 0x40 -#define XLOG_HEAP2_MULTI_INSERT 0x50 -#define XLOG_HEAP2_LOCK_UPDATED 0x60 -#define XLOG_HEAP2_NEW_CID 0x70 - -/* - * xl_tdeheap_insert/xl_tdeheap_multi_insert flag values, 8 bits are available. - */ -/* PD_ALL_VISIBLE was cleared */ -#define XLH_INSERT_ALL_VISIBLE_CLEARED (1<<0) -#define XLH_INSERT_LAST_IN_MULTI (1<<1) -#define XLH_INSERT_IS_SPECULATIVE (1<<2) -#define XLH_INSERT_CONTAINS_NEW_TUPLE (1<<3) -#define XLH_INSERT_ON_TOAST_RELATION (1<<4) - -/* all_frozen_set always implies all_visible_set */ -#define XLH_INSERT_ALL_FROZEN_SET (1<<5) - -/* - * xl_tdeheap_update flag values, 8 bits are available. 
- */ -/* PD_ALL_VISIBLE was cleared */ -#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED (1<<0) -/* PD_ALL_VISIBLE was cleared in the 2nd page */ -#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED (1<<1) -#define XLH_UPDATE_CONTAINS_OLD_TUPLE (1<<2) -#define XLH_UPDATE_CONTAINS_OLD_KEY (1<<3) -#define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) -#define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) -#define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) - -/* convenience macro for checking whether any form of old tuple was logged */ -#define XLH_UPDATE_CONTAINS_OLD \ - (XLH_UPDATE_CONTAINS_OLD_TUPLE | XLH_UPDATE_CONTAINS_OLD_KEY) - -/* - * xl_tdeheap_delete flag values, 8 bits are available. - */ -/* PD_ALL_VISIBLE was cleared */ -#define XLH_DELETE_ALL_VISIBLE_CLEARED (1<<0) -#define XLH_DELETE_CONTAINS_OLD_TUPLE (1<<1) -#define XLH_DELETE_CONTAINS_OLD_KEY (1<<2) -#define XLH_DELETE_IS_SUPER (1<<3) -#define XLH_DELETE_IS_PARTITION_MOVE (1<<4) - -/* convenience macro for checking whether any form of old tuple was logged */ -#define XLH_DELETE_CONTAINS_OLD \ - (XLH_DELETE_CONTAINS_OLD_TUPLE | XLH_DELETE_CONTAINS_OLD_KEY) - -/* This is what we need to know about delete */ -typedef struct xl_tdeheap_delete -{ - TransactionId xmax; /* xmax of the deleted tuple */ - OffsetNumber offnum; /* deleted tuple's offset */ - uint8 infobits_set; /* infomask bits */ - uint8 flags; -} xl_tdeheap_delete; - -#define SizeOfHeapDelete (offsetof(xl_tdeheap_delete, flags) + sizeof(uint8)) - -/* - * xl_tdeheap_truncate flag values, 8 bits are available. - */ -#define XLH_TRUNCATE_CASCADE (1<<0) -#define XLH_TRUNCATE_RESTART_SEQS (1<<1) - -/* - * For truncate we list all truncated relids in an array, followed by all - * sequence relids that need to be restarted, if any. - * All rels are always within the same database, so we just list dbid once. - */ -typedef struct xl_tdeheap_truncate -{ - Oid dbId; - uint32 nrelids; - uint8 flags; - Oid relids[FLEXIBLE_ARRAY_MEMBER]; -} xl_tdeheap_truncate; - -#define SizeOfHeapTruncate (offsetof(xl_tdeheap_truncate, relids)) - -/* - * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted - * or updated tuple in WAL; we can save a few bytes by reconstructing the - * fields that are available elsewhere in the WAL record, or perhaps just - * plain needn't be reconstructed. These are the fields we must store. - */ -typedef struct xl_tdeheap_header -{ - uint16 t_infomask2; - uint16 t_infomask; - uint8 t_hoff; -} xl_tdeheap_header; - -#define SizeOfHeapHeader (offsetof(xl_tdeheap_header, t_hoff) + sizeof(uint8)) - -/* This is what we need to know about insert */ -typedef struct xl_tdeheap_insert -{ - OffsetNumber offnum; /* inserted tuple's offset */ - uint8 flags; - - /* xl_tdeheap_header & TUPLE DATA in backup block 0 */ -} xl_tdeheap_insert; - -#define SizeOfHeapInsert (offsetof(xl_tdeheap_insert, flags) + sizeof(uint8)) - -/* - * This is what we need to know about a multi-insert. - * - * The main data of the record consists of this xl_tdeheap_multi_insert header. - * 'offsets' array is omitted if the whole page is reinitialized - * (XLOG_HEAP_INIT_PAGE). - * - * In block 0's data portion, there is an xl_multi_insert_tuple struct, - * followed by the tuple data for each tuple. There is padding to align - * each xl_multi_insert_tuple struct. 
- */ -typedef struct xl_tdeheap_multi_insert -{ - uint8 flags; - uint16 ntuples; - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; -} xl_tdeheap_multi_insert; - -#define SizeOfHeapMultiInsert offsetof(xl_tdeheap_multi_insert, offsets) - -typedef struct xl_multi_insert_tuple -{ - uint16 datalen; /* size of tuple data that follows */ - uint16 t_infomask2; - uint16 t_infomask; - uint8 t_hoff; - /* TUPLE DATA FOLLOWS AT END OF STRUCT */ -} xl_multi_insert_tuple; - -#define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8)) - -/* - * This is what we need to know about update|hot_update - * - * Backup blk 0: new page - * - * If XLH_UPDATE_PREFIX_FROM_OLD or XLH_UPDATE_SUFFIX_FROM_OLD flags are set, - * the prefix and/or suffix come first, as one or two uint16s. - * - * After that, xl_tdeheap_header and new tuple data follow. The new tuple - * data doesn't include the prefix and suffix, which are copied from the - * old tuple on replay. - * - * If XLH_UPDATE_CONTAINS_NEW_TUPLE flag is given, the tuple data is - * included even if a full-page image was taken. - * - * Backup blk 1: old page, if different. (no data, just a reference to the blk) - */ -typedef struct xl_tdeheap_update -{ - TransactionId old_xmax; /* xmax of the old tuple */ - OffsetNumber old_offnum; /* old tuple's offset */ - uint8 old_infobits_set; /* infomask bits to set on old tuple */ - uint8 flags; - TransactionId new_xmax; /* xmax of the new tuple */ - OffsetNumber new_offnum; /* new tuple's offset */ - - /* - * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags - * are set, xl_tdeheap_header and tuple data for the old tuple follow. - */ -} xl_tdeheap_update; - -#define SizeOfHeapUpdate (offsetof(xl_tdeheap_update, new_offnum) + sizeof(OffsetNumber)) - -/* - * This is what we need to know about page pruning (both during VACUUM and - * during opportunistic pruning) - * - * The array of OffsetNumbers following the fixed part of the record contains: - * * for each redirected item: the item offset, then the offset redirected to - * * for each now-dead item: the item offset - * * for each now-unused item: the item offset - * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused. - * Note that nunused is not explicitly stored, but may be found by reference - * to the total record length. - * - * Acquires a full cleanup lock. - */ -typedef struct xl_tdeheap_prune -{ - TransactionId snapshotConflictHorizon; - uint16 nredirected; - uint16 ndead; - bool isCatalogRel; /* to handle recovery conflict during logical - * decoding on standby */ - /* OFFSET NUMBERS are in the block reference 0 */ -} xl_tdeheap_prune; - -#define SizeOfHeapPrune (offsetof(xl_tdeheap_prune, isCatalogRel) + sizeof(bool)) - -/* - * The vacuum page record is similar to the prune record, but can only mark - * already LP_DEAD items LP_UNUSED (during VACUUM's second heap pass) - * - * Acquires an ordinary exclusive lock only. 
- */ -typedef struct xl_tdeheap_vacuum -{ - uint16 nunused; - /* OFFSET NUMBERS are in the block reference 0 */ -} xl_tdeheap_vacuum; - -#define SizeOfHeapVacuum (offsetof(xl_tdeheap_vacuum, nunused) + sizeof(uint16)) - -/* flags for infobits_set */ -#define XLHL_XMAX_IS_MULTI 0x01 -#define XLHL_XMAX_LOCK_ONLY 0x02 -#define XLHL_XMAX_EXCL_LOCK 0x04 -#define XLHL_XMAX_KEYSHR_LOCK 0x08 -#define XLHL_KEYS_UPDATED 0x10 - -/* flag bits for xl_tdeheap_lock / xl_tdeheap_lock_updated's flag field */ -#define XLH_LOCK_ALL_FROZEN_CLEARED 0x01 - -/* This is what we need to know about lock */ -typedef struct xl_tdeheap_lock -{ - TransactionId xmax; /* might be a MultiXactId */ - OffsetNumber offnum; /* locked tuple's offset on page */ - uint8 infobits_set; /* infomask and infomask2 bits to set */ - uint8 flags; /* XLH_LOCK_* flag bits */ -} xl_tdeheap_lock; - -#define SizeOfHeapLock (offsetof(xl_tdeheap_lock, flags) + sizeof(uint8)) - -/* This is what we need to know about locking an updated version of a row */ -typedef struct xl_tdeheap_lock_updated -{ - TransactionId xmax; - OffsetNumber offnum; - uint8 infobits_set; - uint8 flags; -} xl_tdeheap_lock_updated; - -#define SizeOfHeapLockUpdated (offsetof(xl_tdeheap_lock_updated, flags) + sizeof(uint8)) - -/* This is what we need to know about confirmation of speculative insertion */ -typedef struct xl_tdeheap_confirm -{ - OffsetNumber offnum; /* confirmed tuple's offset on page */ -} xl_tdeheap_confirm; - -#define SizeOfHeapConfirm (offsetof(xl_tdeheap_confirm, offnum) + sizeof(OffsetNumber)) - -/* This is what we need to know about in-place update */ -typedef struct xl_tdeheap_inplace -{ - OffsetNumber offnum; /* updated tuple's offset on page */ - /* TUPLE DATA FOLLOWS AT END OF STRUCT */ -} xl_tdeheap_inplace; - -#define SizeOfHeapInplace (offsetof(xl_tdeheap_inplace, offnum) + sizeof(OffsetNumber)) - -/* - * This struct represents a 'freeze plan', which describes how to freeze a - * group of one or more heap tuples (appears in xl_tdeheap_freeze_page record) - */ -/* 0x01 was XLH_FREEZE_XMIN */ -#define XLH_FREEZE_XVAC 0x02 -#define XLH_INVALID_XVAC 0x04 - -typedef struct xl_tdeheap_freeze_plan -{ - TransactionId xmax; - uint16 t_infomask2; - uint16 t_infomask; - uint8 frzflags; - - /* Length of individual page offset numbers array for this plan */ - uint16 ntuples; -} xl_tdeheap_freeze_plan; - -/* - * This is what we need to know about a block being frozen during vacuum - * - * Backup block 0's data contains an array of xl_tdeheap_freeze_plan structs - * (with nplans elements), followed by one or more page offset number arrays. - * Each such page offset number array corresponds to a single freeze plan - * (REDO routine freezes corresponding heap tuples using freeze plan). 
- */ -typedef struct xl_tdeheap_freeze_page -{ - TransactionId snapshotConflictHorizon; - uint16 nplans; - bool isCatalogRel; /* to handle recovery conflict during logical - * decoding on standby */ - - /* - * In payload of blk 0 : FREEZE PLANS and OFFSET NUMBER ARRAY - */ -} xl_tdeheap_freeze_page; - -#define SizeOfHeapFreezePage (offsetof(xl_tdeheap_freeze_page, isCatalogRel) + sizeof(bool)) - -/* - * This is what we need to know about setting a visibility map bit - * - * Backup blk 0: visibility map buffer - * Backup blk 1: heap buffer - */ -typedef struct xl_tdeheap_visible -{ - TransactionId snapshotConflictHorizon; - uint8 flags; -} xl_tdeheap_visible; - -#define SizeOfHeapVisible (offsetof(xl_tdeheap_visible, flags) + sizeof(uint8)) - -typedef struct xl_tdeheap_new_cid -{ - /* - * store toplevel xid so we don't have to merge cids from different - * transactions - */ - TransactionId top_xid; - CommandId cmin; - CommandId cmax; - CommandId combocid; /* just for debugging */ - - /* - * Store the relfilelocator/ctid pair to facilitate lookups. - */ - RelFileLocator target_locator; - ItemPointerData target_tid; -} xl_tdeheap_new_cid; - -#define SizeOfHeapNewCid (offsetof(xl_tdeheap_new_cid, target_tid) + sizeof(ItemPointerData)) - -/* logical rewrite xlog record header */ -typedef struct xl_tdeheap_rewrite_mapping -{ - TransactionId mapped_xid; /* xid that might need to see the row */ - Oid mapped_db; /* DbOid or InvalidOid for shared rels */ - Oid mapped_rel; /* Oid of the mapped relation */ - off_t offset; /* How far have we written so far */ - uint32 num_mappings; /* Number of in-memory mappings */ - XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */ -} xl_tdeheap_rewrite_mapping; - -extern void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, - TransactionId *snapshotConflictHorizon); - -extern void tdeheap_redo(XLogReaderState *record); -extern void tdeheap_desc(StringInfo buf, XLogReaderState *record); -extern const char *tdeheap_identify(uint8 info); -extern void tdeheap_mask(char *pagedata, BlockNumber blkno); -extern void tdeheap2_redo(XLogReaderState *record); -extern void tdeheap2_desc(StringInfo buf, XLogReaderState *record); -extern const char *tdeheap2_identify(uint8 info); -extern void tdeheap_xlog_logical_rewrite(XLogReaderState *r); - -extern XLogRecPtr log_tdeheap_visible(Relation rel, Buffer tdeheap_buffer, - Buffer vm_buffer, - TransactionId snapshotConflictHorizon, - uint8 vmflags); - -#endif /* PG_TDEAM_XLOG_H */ diff --git a/src/include/access/pg_tdetoast.h b/src/include/access/pg_tdetoast.h deleted file mode 100644 index c17a7816..00000000 --- a/src/include/access/pg_tdetoast.h +++ /dev/null @@ -1,149 +0,0 @@ -/*------------------------------------------------------------------------- - * - * heaptoast.h - * Heap-specific definitions for external and compressed storage - * of variable size attributes. - * - * Copyright (c) 2000-2023, PostgreSQL Global Development Group - * - * src/include/access/heaptoast.h - * - *------------------------------------------------------------------------- - */ -#ifndef PG_TDE_TOAST_H -#define PG_TDE_TOAST_H - -#include "access/htup_details.h" -#include "storage/lockdefs.h" -#include "utils/relcache.h" - -/* - * Find the maximum size of a tuple if there are to be N tuples per page. - */ -#define MaximumBytesPerTuple(tuplesPerPage) \ - MAXALIGN_DOWN((BLCKSZ - \ - MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData))) \ - / (tuplesPerPage)) - -/* - * These symbols control toaster activation. 
If a tuple is larger than - * TOAST_TUPLE_THRESHOLD, we will try to toast it down to no more than - * TOAST_TUPLE_TARGET bytes through compressing compressible fields and - * moving EXTENDED and EXTERNAL data out-of-line. - * - * The numbers need not be the same, though they currently are. It doesn't - * make sense for TARGET to exceed THRESHOLD, but it could be useful to make - * it be smaller. - * - * Currently we choose both values to match the largest tuple size for which - * TOAST_TUPLES_PER_PAGE tuples can fit on a heap page. - * - * XXX while these can be modified without initdb, some thought needs to be - * given to needs_toast_table() in toasting.c before unleashing random - * changes. Also see LOBLKSIZE in large_object.h, which can *not* be - * changed without initdb. - */ -#define TOAST_TUPLES_PER_PAGE 4 - -#define TOAST_TUPLE_THRESHOLD MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE) - -#define TOAST_TUPLE_TARGET TOAST_TUPLE_THRESHOLD - -/* - * The code will also consider moving MAIN data out-of-line, but only as a - * last resort if the previous steps haven't reached the target tuple size. - * In this phase we use a different target size, currently equal to the - * largest tuple that will fit on a heap page. This is reasonable since - * the user has told us to keep the data in-line if at all possible. - */ -#define TOAST_TUPLES_PER_PAGE_MAIN 1 - -#define TOAST_TUPLE_TARGET_MAIN MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE_MAIN) - -/* - * If an index value is larger than TOAST_INDEX_TARGET, we will try to - * compress it (we can't move it out-of-line, however). Note that this - * number is per-datum, not per-tuple, for simplicity in index_form_tuple(). - */ -#define TOAST_INDEX_TARGET (MaxHeapTupleSize / 16) - -/* - * When we store an oversize datum externally, we divide it into chunks - * containing at most TOAST_MAX_CHUNK_SIZE data bytes. This number *must* - * be small enough that the completed toast-table tuple (including the - * ID and sequence fields and all overhead) will fit on a page. - * The coding here sets the size on the theory that we want to fit - * EXTERN_TUPLES_PER_PAGE tuples of maximum size onto a page. - * - * NB: Changing TOAST_MAX_CHUNK_SIZE requires an initdb. - */ -#define EXTERN_TUPLES_PER_PAGE 4 /* tweak only this */ - -#define EXTERN_TUPLE_MAX_SIZE MaximumBytesPerTuple(EXTERN_TUPLES_PER_PAGE) - -#define TOAST_MAX_CHUNK_SIZE \ - (EXTERN_TUPLE_MAX_SIZE - \ - MAXALIGN(SizeofHeapTupleHeader) - \ - sizeof(Oid) - \ - sizeof(int32) - \ - VARHDRSZ) - -/* ---------- - * tdeheap_toast_insert_or_update - - * - * Called by tdeheap_insert() and tdeheap_update(). - * ---------- - */ -extern HeapTuple tdeheap_toast_insert_or_update(Relation rel, HeapTuple newtup, - HeapTuple oldtup, int options); - -/* ---------- - * tdeheap_toast_delete - - * - * Called by tdeheap_delete(). - * ---------- - */ -extern void tdeheap_toast_delete(Relation rel, HeapTuple oldtup, - bool is_speculative); - -/* ---------- - * toast_flatten_tuple - - * - * "Flatten" a tuple to contain no out-of-line toasted fields. - * (This does not eliminate compressed or short-header datums.) - * ---------- - */ -extern HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc); - -/* ---------- - * toast_flatten_tuple_to_datum - - * - * "Flatten" a tuple containing out-of-line toasted fields into a Datum. 
- * ---------- - */ -extern Datum toast_flatten_tuple_to_datum(HeapTupleHeader tup, - uint32 tup_len, - TupleDesc tupleDesc); - -/* ---------- - * toast_build_flattened_tuple - - * - * Build a tuple containing no out-of-line toasted fields. - * (This does not eliminate compressed or short-header datums.) - * ---------- - */ -extern HeapTuple toast_build_flattened_tuple(TupleDesc tupleDesc, - Datum *values, - bool *isnull); - -/* ---------- - * tdeheap_fetch_toast_slice - * - * Fetch a slice from a toast value stored in a heap table. - * ---------- - */ -extern void tdeheap_fetch_toast_slice(Relation toastrel, Oid valueid, - int32 attrsize, int32 sliceoffset, - int32 slicelength, struct varlena *result); - -#endif /* PG_TDE_TOAST_H */ From 6cbd7c879af77d66d9012c829123ec8321f6ef8c Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Sun, 4 Aug 2024 20:17:22 +0100 Subject: [PATCH 5/6] Added new merging script and updated build scripts accordingly Recent commits in the PG17 code introduced additional API changes, making the "single src directory with ifdefs" approach impractical. This commit adds a new Python-based script (documented with comments in the file) to help with version-specific merges: the copied heap files now reside in srcXX directories, where XX is the PostgreSQL major version. --- .gitignore | 3 +- Makefile.in | 18 +- meson.build | 23 +-- tools/heap_merge.sh | 409 -------------------------------------------- tools/repl.sed | 25 +++ tools/tool.py | 198 +++++++++++++++++++++ 6 files changed, 247 insertions(+), 429 deletions(-) delete mode 100644 tools/heap_merge.sh create mode 100644 tools/repl.sed create mode 100644 tools/tool.py diff --git a/.gitignore b/.gitignore index 5f9bebe5..684ce242 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,10 @@ *.so *.o +__pycache__ /config.cache /config.log /config.status /Makefile /autom4te.cache -/configure~ \ No newline at end of file +/configure~ diff --git a/Makefile.in b/Makefile.in index 5b4d98bf..2ac378cb 100644 --- a/Makefile.in +++ b/Makefile.in @@ -24,16 +24,16 @@ TAP_TESTS = 1 OBJS = src/encryption/enc_tde.o \ src/encryption/enc_aes.o \ src/access/pg_tde_slot.o \ -src/access/pg_tde_io.o \ -src/access/pg_tdeam_visibility.o \ src/access/pg_tde_tdemap.o \ -src/access/pg_tdeam.o \ -src/access/pg_tdetoast.o \ -src/access/pg_tde_prune.o \ -src/access/pg_tde_vacuumlazy.o \ -src/access/pg_tde_visibilitymap.o \ -src/access/pg_tde_rewrite.o \ -src/access/pg_tdeam_handler.o \ +src$(MAJORVERSION)/access/pg_tde_io.o \ +src$(MAJORVERSION)/access/pg_tdeam_visibility.o \ +src$(MAJORVERSION)/access/pg_tdeam.o \ +src$(MAJORVERSION)/access/pg_tdetoast.o \ +src$(MAJORVERSION)/access/pg_tde_prune.o \ +src$(MAJORVERSION)/access/pg_tde_vacuumlazy.o \ +src$(MAJORVERSION)/access/pg_tde_visibilitymap.o \ +src$(MAJORVERSION)/access/pg_tde_rewrite.o \ +src$(MAJORVERSION)/access/pg_tdeam_handler.o \ src/access/pg_tde_ddl.o \ src/access/pg_tde_xlog.o \ src/transam/pg_tde_xact_handler.o \ diff --git a/meson.build b/meson.build index 6445442d..b217ebae 100644 --- a/meson.build +++ b/meson.build @@ -13,20 +13,23 @@ conf_data.set_quoted('PACKAGE_TARNAME', 'pg_tde') configure_file(output : 'config.h', configuration : conf_data) +pg_version = meson.project_version().substring(0,2) +src_version = 'src' + pg_version + pg_tde_sources = files( 'src/pg_tde.c', 'src/transam/pg_tde_xact_handler.c', 'src/access/pg_tde_tdemap.c', 'src/access/pg_tde_slot.c', - 'src/access/pg_tdeam.c', - 'src/access/pg_tdeam_handler.c', - 'src/access/pg_tdeam_visibility.c', - 'src/access/pg_tdetoast.c', - 
'src/access/pg_tde_io.c', - 'src/access/pg_tde_prune.c', - 'src/access/pg_tde_rewrite.c', - 'src/access/pg_tde_vacuumlazy.c', - 'src/access/pg_tde_visibilitymap.c', + src_version / 'access/pg_tdeam.c', + src_version / 'access/pg_tdeam_handler.c', + src_version / 'access/pg_tdeam_visibility.c', + src_version / 'access/pg_tdetoast.c', + src_version / 'access/pg_tde_io.c', + src_version / 'access/pg_tde_prune.c', + src_version / 'access/pg_tde_rewrite.c', + src_version / 'access/pg_tde_vacuumlazy.c', + src_version / 'access/pg_tde_visibilitymap.c', 'src/access/pg_tde_ddl.c', 'src/access/pg_tde_xlog.c', @@ -51,7 +54,7 @@ pg_tde_sources = files( 'src/pg_tde_event_capture.c', ) -incdir = include_directories('src/include', '.') +incdir = include_directories(src_version / 'include', 'src/include', '.') deps_update = {'dependencies': contrib_mod_args.get('dependencies') + [curldep]} diff --git a/tools/heap_merge.sh b/tools/heap_merge.sh deleted file mode 100644 index ad305e35..00000000 --- a/tools/heap_merge.sh +++ /dev/null @@ -1,409 +0,0 @@ -#!/bin/bash - -# SCRIPT: patch_generator.sh -#----------------------------- -# This script generates patch between two PG commits and applies it to -# the TDE extension source. - -set -o pipefail - -## GLOBAL VARIABLES -export TDE="tde" -export SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - -export WORKING_DIR="${WORKING_DIR:-$(mktemp -d -t $TDE)}" -export TDE_DIR="${WORKING_DIR}/tde" -export USER_TDE_DIR="" -export PG_COMMIT_BASE="${PG_COMMIT_BASE}" -export PG_COMMIT_LATEST="${PG_COMMIT_BASE}" -export TDE_COMMIT="${TDE_COMMIT}" - -export FILES_BASE_DIR="pg_base" -export FILES_LATEST_DIR="pg_latest" -export FILES_PATCH_DIR="pg_patches" -export TDE_DRY_RUN="--dry-run" -export APPLY_PATCHES_FORCE=0 - -# Script variables -total_patches=0 -total_patches_failed=0 - -declare -a patch_list_unclean=() - -declare -a pg_header_file_map=("visibilitymap.h" "rewriteheap.h" "heapam_xlog.h" "hio.h" "heapam.h" "heaptoast.h") -declare -a tde_header_file_map=("pg_tde_visibilitymap.h" "pg_tde_rewrite.h" "pg_tdeam_xlog.h" "pg_tde_io.h" "pg_tdeam.h" "pg_tdetoast.h") - -declare -a pg_c_file_map=("heapam.c" "heapam_handler.c" "heapam_visibility.c" "heaptoast.c" "hio.c" "pruneheap.c" "rewriteheap.c" "vacuumlazy.c" "visibilitymap.c") -declare -a tde_c_file_map=("pg_tdeam.c" "pg_tdeam_handler.c" "pg_tdeam_visibility.c" "pg_tdetoast.c" "pg_tde_io.c" "pg_tde_prune.c" "pg_tde_rewrite.c" "pg_tde_vacuumlazy.c" "pg_tde_visibilitymap.c") - - -## USAGE -usage() -{ - errorCode=${1:-0} - - cat << EOF - -usage: $0 OPTIONS - -This script generates file-wise patches between two PG commits and applies it to -the TDE extension source. - -By default, it only performs a dry run of the patch application. See the usage -options below for applying clean patches or forcefully applying all patches. - -It clones both PG and TDE repositories in the working directory. If TDE path is -specified either with its usage option or via the environment variable, then -the script will use the given TDE source code. - -* All working folders folders created will carry "$TDE" as part of the folder name. -* This simplies the manual cleanup process. - -OPTIONS can be: - - -h Show this message - - -a The patches are not applied by default. Specify this to - apply the generated patches. Otherwise, the script will - only perform a dryrun. - - -f Force apply patches. 
- - -b [PG_COMMIT_BASE] PG base commit hash/branch/tag for patch [REQUIRED] - -l [PG_COMMIT_LATEST] PG lastest commit hash/branch/tag for patch [REQUIRED] - -x [TDE_COMMIT] TDE commit hash/branch/tag to apply patch on [REQUIRED] - - -t [USER_TDE_DIR] Source directory for TDE [Default: Cloned under WORKING_DIR] - -w [WORKING_DIR] Script working folder [Default: $WORKING_DIR] - * a folder where patches and relevant log - files may be created. This folder will not be removed - by the script, so better to keep it in the temp folder. - -EOF - - if [[ $errorCode -ne 0 ]]; - then - exit_script $errorCode - fi -} - -# Perform any required cleanup and exit with the given error/success code -exit_script() -{ - # Reminder of manual cleanup - if [[ -d $WORKING_DIR ]]; - then - printf "\n%20s\n" | tr " " "-" - printf "The following folder was created by the script and may require manual removal.\n" - printf "* %s\n" $WORKING_DIR - printf "%20s\n" | tr " " "-" - fi - - # Exit with a given return code or 0 if none are provided. - exit ${1:-0} -} - -# Raise the error for a failure to checkout required source -checkout_validate() -{ - commit=$1 - retval=$2 - - if [[ $rteval -ne 0 ]]; - then - printf "%s is not a valid commit hash/branch/tag.\n" $commit - exit_script $retval - fi -} - -# Vaildate arguments to ensure that we can safely run the benchmark -validate_args() -{ - local USAGE_TEXT="See usage for details." - local PATH_ERROR_TEXT="path is not a valid directory." - - if [[ ! -z "$USER_TDE_DIR" ]]; - then - if [[ ! -d "$USER_TDE_DIR" ]]; - then - printf "TDE %s %s\n" $PATH_ERROR_TEXT $USAGE_TEXT >&2 - usage 1 - fi - elif [[ -z "$TDE_COMMIT" ]]; - then - printf "TDE_COMMIT is not specified. %s\n" $USAGE_TEXT >&2 - usage 1 - fi - - - if [[ ! -d "$WORKING_DIR" ]]; - then - printf "Working folder %s %s\n" $PATH_ERROR_TEXT $USAGE_TEXT >&2 - usage 1 - fi - - if [[ -z "$PG_COMMIT_BASE" ]]; - then - printf "PG_COMMIT_BASE is not specified. %s\n" $USAGE_TEXT >&2 - usage 1 - fi - - if [[ -z "$PG_COMMIT_LATEST" ]]; - then - printf "PG_COMMIT_LATEST is not specified. %s\n" $USAGE_TEXT >&2 - usage 1 - fi -} - -# Print the file mapping between PG and TDE -print_map() -{ - printf "\n" - printf "%50s\n" | tr " " "=" - printf "%s\n" "Heap Access to TDE File Map" - printf "%50s\n\n" | tr " " "=" - - printf "%s\n" "--- Header Files ---" - for (( i=0; i < ${#pg_header_file_map[@]}; i++ )); - do - printf "* %-20s --> %s\n" ${pg_header_file_map[$i]} ${tde_header_file_map[$i]} - done - - printf "\n" - printf "%s\n" "--- C Files ---" - for (( i=0; i < ${#pg_c_file_map[@]}; i++ )); - do - printf "* %-20s --> %s\n" ${pg_c_file_map[$i]} ${tde_c_file_map[$i]} - done - - printf "\n\n" -} - -# Copy files from the PG source to the a separate folder. -# This function expects that we don't have duplicate file names. -copy_files() -{ - local dest_folder=$1 - shift - local file_list=("$@") - retval=0 - - for f in "${file_list[@]}"; - do - find * -name $f -exec cp -rpv {} $dest_folder \; - retval=$? - - if [[ $retval -ne 0 ]]; - then - exit_script $retval - fi - done -} - -# Compare two files and generate a patch -generate_file_patch() -{ - f_base=$1 - f_latest=$2 - f_patch=$3 - - diff -u $f_base $f_latest > $f_patch - - if [[ ! 
-s $f_patch ]]; - then - rm -fv $f_patch - else - total_patches=$(expr $total_patches + 1) - fi -} - -# Apply a given patch on a given file -apply_file_patch() -{ - local file_to_patch=$1 - local patch_file=$2 - local apply_patch=${APPLY_PATCHES_FORCE} - - echo "===> $APPLY_PATCHES_FORCE ==> $apply_patch" - - if [[ -f $patch_file ]]; - then - find * -name $file_to_patch | xargs -I{} echo "patch -p1 -t --dry-run {} $patch_file" | sh - - if [[ $? -ne 0 ]]; - then - total_patches_failed=$(expr $total_patches_failed + 1) - patch_list_unclean+=($(basename $patch_file)) - patch_list_unclean+=($(basename $file_to_patch)) - elif [[ -z "$TDE_DRY_RUN" ]]; - then - apply_patch=1 - fi - - echo "ABOUT TO APPLY PATCH" - - if [[ $apply_patch -eq 1 ]]; - then - echo "APPLYING PACH" - find * -name $file_to_patch | xargs -I{} echo "patch -p1 -t {} $patch_file" | sh - fi - fi -} - -# Generate file-wise patches using the -generate_pg_patches() -{ - retval=0 - - mkdir $FILES_BASE_DIR - mkdir $FILES_LATEST_DIR - mkdir $FILES_PATCH_DIR - - git clone https://github.com/postgres/postgres.git - - # go into the postgres directory - pushd postgres - - # safety net to ensure that any changes introduced due to git configuration are cleaned up - git checkout . - - #checkout base source code - git checkout $PG_COMMIT_BASE - checkout_validate $PG_COMMIT_BASE $? - copy_files "$WORKING_DIR/$FILES_BASE_DIR" "${pg_header_file_map[@]}" - copy_files "$WORKING_DIR/$FILES_BASE_DIR" "${pg_c_file_map[@]}" - - # safety net to ensure that any changes introduced due to git configuration are cleaned up - git checkout . - - # do the latest checkout - git checkout $PG_COMMIT_LATEST - checkout_validate $PG_COMMIT_LATEST $? - copy_files "$WORKING_DIR/$FILES_LATEST_DIR" "${pg_header_file_map[@]}" - copy_files "$WORKING_DIR/$FILES_LATEST_DIR" "${pg_c_file_map[@]}" - - # go back to the old directory - popd - - # generate patches for the header files - for f in "${pg_header_file_map[@]}"; - do - generate_file_patch "$FILES_BASE_DIR/$f" "$FILES_LATEST_DIR/$f" "$FILES_PATCH_DIR/$f.patch" - done - - # generate patches for the c files - for f in "${pg_c_file_map[@]}"; - do - generate_file_patch "$FILES_BASE_DIR/$f" "$FILES_LATEST_DIR/$f" "$FILES_PATCH_DIR/$f.patch" - done -} - -# Apply patches to the TDE sources -tde_apply_patches() -{ - # check if the $TDE folder exists. If not, then we have to clone it - if [[ ! -d "$TDE_DIR" ]]; - then - t="$(basename $TDE_DIR)" - git clone https://github.com/Percona-Lab/pg_tde.git $t - fi - - pushd $TDE_DIR - - # do the required checkout - git checkout $TDE_COMMIT - checkout_validate $TDE_COMMIT $? - - # apply patches to the header files - for (( i=0; i < ${#pg_header_file_map[@]}; i++ )); - do - patch_file=$WORKING_DIR/$FILES_PATCH_DIR/${pg_header_file_map[$i]}.patch - apply_file_patch ${tde_header_file_map[$i]} $patch_file - done - - # apply patches to the header files - for (( i=0; i < ${#pg_c_file_map[@]}; i++ )); - do - patch_file=$WORKING_DIR/$FILES_PATCH_DIR/${pg_c_file_map[$i]}.patch - apply_file_patch ${tde_c_file_map[$i]} $patch_file - done -} - -# Check options passed in. -while getopts "haf t:b:l:w:x:" OPTION -do - case $OPTION in - h) - usage - exit_script 1 - ;; - - a) - TDE_DRY_RUN="" - ;; - - f) - APPLY_PATCHES_FORCE=1 - ;; - b) - PG_COMMIT_BASE=$OPTARG - ;; - l) - PG_COMMIT_LATEST=$OPTARG - ;; - t) - TDE_DIR=$OPTARG - ;; - w) - WORK_DIR=$OPTARG - ;; - x) - TDE_COMMIT=$OPTARG - ;; - - ?) 
- usage - exit_script - ;; - esac -done - -# Validate and update setup -validate_args - -# print the file map -print_map - -# Let's move to the working directory -pushd $WORKING_DIR - -# generate pg patches between the two commits -generate_pg_patches - -# apply patches -tde_apply_patches - -# We're done... -printf "\nJob completed!\n" - -printf "\n\n" -printf "%50s\n" | tr " " "=" -printf "RESULT SUMMARY\n" -printf "%50s\n" | tr " " "=" -printf "Patches Generated = %s\n" $total_patches -printf "Patches Applied = %s\n" $(expr $total_patches - $total_patches_failed) -printf "Patches Failed = %s\n" $total_patches_failed - -if [[ ${#patch_list_unclean[@]} -gt 0 ]]; -then - printf "=> Failed Patch List\n" -fi - -for (( i=0; i < ${#patch_list_unclean[@]}; i++ )); -do - printf "* %s --> %s\n" ${patch_list_unclean[$i]} ${patch_list_unclean[$(expr $i + 1)]} - i=$(expr $i + 1) -done - -# Perform clean up and exit. -exit_script 0 diff --git a/tools/repl.sed b/tools/repl.sed new file mode 100644 index 00000000..2cac8e82 --- /dev/null +++ b/tools/repl.sed @@ -0,0 +1,25 @@ +# These first few lines are only for the initial run, but should be harmless in later runs +s/\theap_/\ttdeheap_/g +s/\t\*heap_/\t*tdeheap_/g +s/ heap_/ tdeheap_/g +s/ \*heap_/ *tdeheap_/g +s/(heap_/ (tdeheap_/g +s/^heap_/tdeheap_/g +s/_heap_/_tdeheap_/g +s/-heap_/-tdeheap_/g +s/+heap_/+tdeheap_/g +s/!heap_/!tdeheap_/g +s/heapam_/pg_tdeam_/g +s/heap2_/tdeheap2_/g +s/heapgettup/tdeheapgettup/g +s/heapgetpage/tdeheapgetpage/g +s/visibilitymap_/tdeheap_visibilitymap_/g +s/RelationPutHeapTuple/tdeheap_RelationPutHeapTuple/g +s/RelationGetBufferForTuple/tdeheap_RelationGetBufferForTuple/g +s/TTSOpsBufferHeapTuple/TTSOpsTDEBufferHeapTuple/g +s/TTS_IS_BUFFERTUPLE/TTS_IS_TDE_BUFFERTUPLE/g +s/toast_tuple_externalize/tdeheap_toast_tuple_externalize/g +# Repairing errors caused by the earlier rules +s/num_tdeheap_tuples/num_heap_tuples/g +s/pgstat_update_tdeheap_dead_tuples/pgstat_update_heap_dead_tuples/g +s/tdeheap_xlog_deserialize_prune_and_freeze/heap_xlog_deserialize_prune_and_freeze/g \ No newline at end of file diff --git a/tools/tool.py b/tools/tool.py new file mode 100644 index 00000000..a66b0efb --- /dev/null +++ b/tools/tool.py @@ -0,0 +1,198 @@ +# Simple helper script for upstream merges to the copied heap code +# It implements a few simple steps which can be used to automate +# most operations +# +# Generally this script assumes that pg_tde is checked out as a +# submodule inside postgres, in the contrib/pg_tde directory. +# +# Most methods interact with the currently checked out version +# of postgres; this part is not automated at all. Select the +# correct commit before executing functions! +# +# == copy +# +# Copies the required heapam source files from the postgres repo +# to the specified <dstdir> inside the pg_tde repo. Also +# renames the files, places them in the correct directory, and +# runs the automatic sed replacement script. +# +# The sed replacements only cover the name changes, mainly changing "heap" +# to "tdeheap". It doesn't apply the actual encryption changes! +# +# It also creates a file named "COMMIT" in the directory, which contains the +# commit hash used. +# +# == diff +# +# Runs diff on the tdeheap files between <src> and <dst>, and places +# the results into <diffdir> +# +# The assumption is that <src> contains the copied, but not TDEfied, +# version of the files, while <dst> is the actual current TDEfied code, +# and that way this command creates the "tde patch" for the given commit. 
+# +# For example, assuming that we have the PG16 tde sources in the src16 +# directory, these steps create a diff for the current sources: +# 1. check out the src16/COMMIT commit +# 2. run `copy tmp_16dir` +# 3. run `diff tmp_16dir src16 diff16` +# 4. delete the tmp_16dir directory +# +# == apply +# +# Applies the diffs created by the diff command from the <diffdir> to the +# <dst> source directory. +# +# When the diff can't be applied cleanly, and there are conflicts, it still +# writes the file with conflicts, using the diff3 format (usual git conflict +# markers), which can be resolved manually. +# +# The recommended action in this case is to first create a commit with the +# conflicts as-is, and then create a separate commit with the conflicts +# resolved and the code working. +# +# This is mainly intended for version upgrades. +# For example, if the current version is 16, and the goal is creating the 17 +# version: +# 1. create the src16 diff using the steps described in the `diff` section +# 2. checkout the 17 version in the postgres repo +# 3. use the copy command to create a base directory for the 17 version +# 4. create a commit with the src17 basefiles +# 5. use the apply command to apply the patches +# 6. commit things with conflicts +# 7. resolve the conflicts as needed +# 8. commit resolved/working sources + + +import shutil +import os +import subprocess +import sys + +tools_directory = os.path.dirname(os.path.realpath(__file__)) + +pg_root = tools_directory + "/../../../" +heapam_src_dir = pg_root + "src/backend/access/heap/" +heapam_inc_dir = pg_root + "src/include/access/" + +tde_root = tools_directory + "/../" + +heapam_headers = { + "visibilitymap.h": "pg_tde_visibilitymap.h", + "rewriteheap.h": "pg_tde_rewrite.h", + "heapam_xlog.h": "pg_tdeam_xlog.h", + "hio.h": "pg_tde_io.h", + "heapam.h": "pg_tdeam.h", + "heaptoast.h": "pg_tdetoast.h" +} + +heapam_sources = { + "heapam.c": "pg_tdeam.c", + "heapam_handler.c": "pg_tdeam_handler.c", + "heapam_visibility.c": "pg_tdeam_visibility.c", + "heaptoast.c": "pg_tdetoast.c", + "hio.c": "pg_tde_io.c", + "pruneheap.c": "pg_tde_prune.c", + "rewriteheap.c": "pg_tde_rewrite.c", + "vacuumlazy.c": "pg_tde_vacuumlazy.c", + "visibilitymap.c": "pg_tde_visibilitymap.c", +} + +def copy_and_sed_things(files, src, dst): + os.makedirs(dst, exist_ok=True) + for original,copy in files.items(): + print(" - ", original, "=>", copy) + shutil.copyfile(src+original, dst+copy) + subprocess.call(["sed", "-i", "-f", tools_directory + "/repl.sed", dst+copy]) + +def copy_upstream_things(dstdir): + print("Processing headers") + copy_and_sed_things(heapam_headers, heapam_inc_dir, tde_root + dstdir + "/include/access/") + print("Processing sources") + copy_and_sed_things(heapam_sources, heapam_src_dir, tde_root + dstdir + "/access/") + # Also create a commit file + cwd = os.getcwd() + os.chdir(pg_root) + commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]) + os.chdir(cwd) + f = open(tde_root + dstdir + "/COMMIT", "w") + f.write(commit_hash.decode("utf-8")) + f.close() + + +def save_diffs(files, src, dst, diffdir): + os.makedirs(tde_root + "/" + diffdir, exist_ok=True) + for _,copy in files.items(): + print(" - ", copy + ".patch") + diff = subprocess.run(["diff", "-u", tde_root+src+"/"+copy, tde_root+dst+"/"+copy], stdout = subprocess.PIPE, stderr=subprocess.PIPE, check=False) + f = open(tde_root + "/" + diffdir + "/" + copy + ".patch", "w") + f.write(diff.stdout.decode("utf-8")) + f.close() + +def diff_things(src, dst, diffdir): + print("Processing headers") + 
save_diffs(heapam_headers, src + "/include/access/", dst + "/include/access/", diffdir) + print("Processing sources") + save_diffs(heapam_sources, src + "/access/", dst + "/access/", diffdir) + +def apply_diffs(files, dst, diffdir): + for _,copy in files.items(): + print(" - ", copy + ".patch") + patch = subprocess.run(["patch", "--merge=diff3", "-l", "--no-backup-if-mismatch", tde_root+dst+"/"+copy, tde_root+"/"+diffdir+"/"+copy+".patch"], stdout = subprocess.PIPE, stderr=subprocess.PIPE, check=False) + print(patch.stdout.decode("utf-8")) + print(patch.stderr.decode("utf-8")) + +def apply_things(dst, diffdir): + print("Processing headers") + apply_diffs(heapam_headers, dst + "/include/access/", diffdir) + print("Processing sources") + apply_diffs(heapam_sources, dst + "/access/", diffdir) + +def rm_files(files, src): + for _,copy in files.items(): + print(" - RM ", copy) + os.remove(tde_root+src+"/"+copy) + +def rm_things(srcdir): + print("Processing headers") + rm_files(heapam_headers, srcdir + "/include/access/") + print("Processing sources") + rm_files(heapam_sources, srcdir + "/access/") + +if len(sys.argv) < 2: + print("No command given! Commands:") + print(" - copy <dstdir>") + print(" - diff <src> <dst> <diffdir>") + print(" - apply <dst> <diffdir>") + print(" - rm <srcdir>") + exit() + +if sys.argv[1] == "copy": + if len(sys.argv) < 3: + print("No target directory given!") + print("Usage: tool.py copy <dstdir>") + exit() + copy_upstream_things(sys.argv[2]) + +if sys.argv[1] == "diff": + if len(sys.argv) < 5: + print("Not enough parameters!") + print("Usage: tool.py diff <src> <dst> <diffdir>") + exit() + diff_things(sys.argv[2], sys.argv[3], sys.argv[4]) + +if sys.argv[1] == "apply": + if len(sys.argv) < 4: + print("Not enough parameters!") + print("Usage: tool.py apply <dst> <diffdir>") + exit() + apply_things(sys.argv[2], sys.argv[3]) + + + +if sys.argv[1] == "rm": + if len(sys.argv) < 3: + print("No target directory given!") + print("Usage: tool.py rm <srcdir>") + exit() + rm_things(sys.argv[2]) \ No newline at end of file From a9b623fccb5f914247031b007836b9863f9f64c2 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Mon, 5 Aug 2024 19:51:11 +0100 Subject: [PATCH 6/6] Updated CI runners --- .github/workflows/postgresql-16-pgdg-package-pgxs.yml | 2 +- .github/workflows/postgresql-16-src-make-macos.yml | 2 +- .github/workflows/postgresql-16-src-make-ssl11.yml | 2 +- .github/workflows/postgresql-16-src-make.yml | 2 +- Makefile.in | 4 ++-- docker/Dockerfile | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/postgresql-16-pgdg-package-pgxs.yml b/.github/workflows/postgresql-16-pgdg-package-pgxs.yml index 2a9bc8f2..3439f1b2 100644 --- a/.github/workflows/postgresql-16-pgdg-package-pgxs.yml +++ b/.github/workflows/postgresql-16-pgdg-package-pgxs.yml @@ -56,7 +56,7 @@ jobs: run: | sudo -u postgres bash -c './configure' sudo -u postgres bash -c 'make USE_PGXS=1' - sudo make USE_PGXS=1 install + sudo make USE_PGXS=1 MAJORVERSION=16 install working-directory: src/pg_tde - name: Start pg_tde tests diff --git a/.github/workflows/postgresql-16-src-make-macos.yml b/.github/workflows/postgresql-16-src-make-macos.yml index d20ee7e5..bc6179de 100644 --- a/.github/workflows/postgresql-16-src-make-macos.yml +++ b/.github/workflows/postgresql-16-src-make-macos.yml @@ -39,7 +39,7 @@ jobs: - name: Build pg_tde run: | ./configure - make -j + make -j MAJORVERSION=16 sudo make install working-directory: src/contrib/pg_tde diff --git a/.github/workflows/postgresql-16-src-make-ssl11.yml b/.github/workflows/postgresql-16-src-make-ssl11.yml index 06ede237..2b7f4a5d 100644 --- 
a/.github/workflows/postgresql-16-src-make-ssl11.yml +++ b/.github/workflows/postgresql-16-src-make-ssl11.yml @@ -58,7 +58,7 @@ jobs: - name: Build pg_tde run: | ./configure - make -j + make -j MAJORVERSION=16 sudo make install working-directory: src/contrib/pg_tde diff --git a/.github/workflows/postgresql-16-src-make.yml b/.github/workflows/postgresql-16-src-make.yml index 0499caf9..668523d6 100644 --- a/.github/workflows/postgresql-16-src-make.yml +++ b/.github/workflows/postgresql-16-src-make.yml @@ -58,7 +58,7 @@ jobs: - name: Build pg_tde run: | ./configure - make -j + make -j MAJORVERSION=16 sudo make install working-directory: src/contrib/pg_tde diff --git a/Makefile.in b/Makefile.in index 2ac378cb..6c79023e 100644 --- a/Makefile.in +++ b/Makefile.in @@ -57,12 +57,12 @@ override PG_CPPFLAGS += @tde_CPPFLAGS@ ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) -override PG_CPPFLAGS += -I$(CURDIR)/src/include +override PG_CPPFLAGS += -I$(CURDIR)/src/include -I$(CURDIR)/src$(MAJORVERSION)/include include $(PGXS) else subdir = contrib/pg_tde top_builddir = ../.. -override PG_CPPFLAGS += -I$(top_srcdir)/$(subdir)/src/include +override PG_CPPFLAGS += -I$(top_srcdir)/$(subdir)/src/include -I$(top_srcdir)/$(subdir)/src$(MAJORVERSION)/include include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif diff --git a/docker/Dockerfile b/docker/Dockerfile index c8b46154..516f5987 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /opt/pg_tde COPY . . RUN ./configure && \ - make USE_PGXS=1 && \ + make USE_PGXS=1 MAJORVERSION=16 && \ make USE_PGXS=1 install RUN cp /usr/share/postgresql/postgresql.conf.sample /etc/postgresql/postgresql.conf; \ echo "shared_preload_libraries = 'pg_tde'" >> /etc/postgresql/postgresql.conf; \