diff --git a/.asf.yaml b/.asf.yaml index a2a3064783f0e98..3892aca2eddb77c 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -152,13 +152,13 @@ github: - LemonLiTree - Yukang-Lian - TangSiyang2001 - - Lchangliang - freemandealer - shuke987 - wm1581066 - KassieZ - yujun777 - doris-robot + - LiBinfeng-01 notifications: pullrequests_status: commits@doris.apache.org diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 98febd914c2724b..8c5fc5e3a98e427 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,63 @@ -## Proposed changes +### What problem does this PR solve? + + + Issue Number: close #xxx - + +Related PR: #xxx + +Problem Summary: + +### Check List (For Committer) + +- Test + + - [ ] Regression test + - [ ] Unit Test + - [ ] Manual test (add detailed scripts or steps below) + - [ ] No need to test or manual test. Explain why: + - [ ] This is a refactor/code format and no logic has been changed. + - [ ] Previous test can cover this change. + - [ ] No colde files have been changed. + - [ ] Other reason + +- Behavior changed: + + - [ ] No. + - [ ] Yes. + +- Does this need documentation? + + - [ ] No. + - [ ] Yes. + +- Release note + + + + None + +### Check List (For Reviewer who merge this PR) + +- [ ] Confirm the release note +- [ ] Confirm test cases +- [ ] Confirm document +- [ ] Add branch pick label diff --git a/.github/workflows/auto-cherry-pick.yml b/.github/workflows/auto-cherry-pick.yml index 7d97e498ba3263a..f76c88934fdc9ef 100644 --- a/.github/workflows/auto-cherry-pick.yml +++ b/.github/workflows/auto-cherry-pick.yml @@ -30,7 +30,7 @@ permissions: jobs: auto_cherry_pick: runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') && github.event.pull_request.merged == true }} + if: ${{ (contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) && github.event.pull_request.merged == true }} steps: - name: Checkout repository uses: actions/checkout@v3 @@ -45,7 +45,7 @@ jobs: pip install PyGithub - name: Check SHA run: | - expected_sha="80b7c6087f2a3e4f4c7f035a52e8e7b05ce00f27aa5c1bd52179df685c912447f94a96145fd3204a3958d8ed9777de5a5183b120e99e0e95bbca0366d69b0ac0" + expected_sha="4e4c0d7689b765c7f0677d75d23222555afa9286af46cf77ced66fa247a298d9f8a8c86830d0ce55f70e5f09532b54fbafee040c0343833077cbc7e214d486d2" calculated_sha=$(sha512sum tools/auto-pick-script.py | awk '{ print $1 }') if [ "$calculated_sha" != "$expected_sha" ]; then echo "SHA mismatch! 
Expected: $expected_sha, but got: $calculated_sha" @@ -53,10 +53,19 @@ jobs: else echo "SHA matches: $calculated_sha" fi - - name: Auto cherry-pick + - name: Auto cherry-pick to branch-3.0 + if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_NAME: ${{ github.repository }} CONFLICT_LABEL: cherry-pick-conflict-in-3.0 run: | python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-3.0 + - name: Auto cherry-pick to branch-2.1 + if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_NAME: ${{ github.repository }} + CONFLICT_LABEL: cherry-pick-conflict-in-2.1.x + run: | + python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-2.1 diff --git a/.github/workflows/build-extension.yml b/.github/workflows/build-extension.yml index 14998f24144b735..d12fe7d9d713872 100644 --- a/.github/workflows/build-extension.yml +++ b/.github/workflows/build-extension.yml @@ -20,7 +20,9 @@ name: Build Extensions on: pull_request: - + workflow_dispatch: + issue_comment: + types: [ created ] concurrency: group: ${{ github.ref }} (Build Extensions) cancel-in-progress: true @@ -29,6 +31,12 @@ jobs: changes: name: Detect Changes runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') outputs: broker_changes: ${{ steps.filter.outputs.broker_changes }} docs_changes: ${{ steps.filter.outputs.docs_changes }} diff --git a/.github/workflows/build-thirdparty.yml b/.github/workflows/build-thirdparty.yml index 991b5089035699f..7bc5d8a8182a719 100644 --- a/.github/workflows/build-thirdparty.yml +++ b/.github/workflows/build-thirdparty.yml @@ -19,6 +19,9 @@ name: Build Third Party Libraries on: pull_request: + workflow_dispatch: + issue_comment: + types: [ created ] concurrency: group: ${{ github.ref }} (Build Third Party Libraries) @@ -28,6 +31,12 @@ jobs: changes: name: Detect Changes runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') outputs: thirdparty_changes: ${{ steps.filter.outputs.thirdparty_changes }} steps: diff --git a/.github/workflows/checkstyle.yaml b/.github/workflows/checkstyle.yaml index 13ab46b2cd50b22..a53a19d82649b9b 100644 --- a/.github/workflows/checkstyle.yaml +++ b/.github/workflows/checkstyle.yaml @@ -20,11 +20,20 @@ name: FE Code Style Checker on: pull_request: + workflow_dispatch: + issue_comment: + types: [ created ] jobs: java-checkstyle: name: "CheckStyle" runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') steps: - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index adc77450d78c013..a81d64e4e2b1f18 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -19,12 +19,22 @@ --- name: Code Formatter -on: [push, pull_request_target] - +on: + pull_request: + pull_request_target: + 
workflow_dispatch: + issue_comment: + types: [ created ] jobs: clang-format: name: "Clang Formatter" runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || (github.event_name == 'pull_request_target') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" if: ${{ github.event_name != 'pull_request_target' }} diff --git a/.github/workflows/license-eyes.yml b/.github/workflows/license-eyes.yml index 890efb2d9d11962..c17081fc75b9e82 100644 --- a/.github/workflows/license-eyes.yml +++ b/.github/workflows/license-eyes.yml @@ -22,10 +22,21 @@ on: push: branches: - master + workflow_dispatch: + issue_comment: + types: [ created ] + jobs: license-check: name: "License Check" runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request_target') || + (github.event_name == 'push' && github.ref == 'refs/heads/master') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" if: ${{ github.event_name != 'pull_request_target' }} diff --git a/be/src/agent/agent_server.cpp b/be/src/agent/agent_server.cpp index 9d36148b64f3051..361a8ab93a90a69 100644 --- a/be/src/agent/agent_server.cpp +++ b/be/src/agent/agent_server.cpp @@ -33,6 +33,7 @@ #include "agent/utils.h" #include "agent/workload_group_listener.h" #include "agent/workload_sched_policy_listener.h" +#include "cloud/config.h" #include "common/config.h" #include "common/logging.h" #include "common/status.h" @@ -193,7 +194,7 @@ void AgentServer::start_workers(StorageEngine& engine, ExecEnv* exec_env) { "REPORT_DISK_STATE", _master_info, config::report_disk_state_interval_seconds, [&engine, &master_info = _master_info] { report_disk_callback(engine, master_info); })); _report_workers.push_back(std::make_unique( - "REPORT_OLAP_TABLE", _master_info, config::report_tablet_interval_seconds,[&engine, &master_info = _master_info] { report_tablet_callback(engine, master_info); })); + "REPORT_OLAP_TABLET", _master_info, config::report_tablet_interval_seconds,[&engine, &master_info = _master_info] { report_tablet_callback(engine, master_info); })); // clang-format on } @@ -211,6 +212,10 @@ void AgentServer::cloud_start_workers(CloudStorageEngine& engine, ExecEnv* exec_ "CALC_DBM_TASK", config::calc_delete_bitmap_worker_count, [&engine](auto&& task) { return calc_delete_bitmap_callback(engine, task); }); + // cloud, drop tablet just clean clear_cache, so just one thread do it + _workers[TTaskType::DROP] = std::make_unique( + "DROP_TABLE", 1, [&engine](auto&& task) { return drop_tablet_callback(engine, task); }); + _report_workers.push_back(std::make_unique( "REPORT_TASK", _master_info, config::report_task_interval_seconds, [&master_info = _master_info] { report_task_callback(master_info); })); @@ -218,6 +223,14 @@ void AgentServer::cloud_start_workers(CloudStorageEngine& engine, ExecEnv* exec_ _report_workers.push_back(std::make_unique( "REPORT_DISK_STATE", _master_info, config::report_disk_state_interval_seconds, [&engine, &master_info = _master_info] { report_disk_callback(engine, master_info); })); + + if (config::enable_cloud_tablet_report) { + _report_workers.push_back(std::make_unique( + "REPORT_OLAP_TABLET", _master_info, 
config::report_tablet_interval_seconds, + [&engine, &master_info = _master_info] { + report_tablet_callback(engine, master_info); + })); + } } // TODO(lingbin): each task in the batch may have it own status or FE must check and diff --git a/be/src/agent/heartbeat_server.cpp b/be/src/agent/heartbeat_server.cpp index 146604aaab20f44..78002ed08fe0df5 100644 --- a/be/src/agent/heartbeat_server.cpp +++ b/be/src/agent/heartbeat_server.cpp @@ -26,6 +26,7 @@ #include #include +#include "cloud/cloud_tablet_mgr.h" #include "cloud/config.h" #include "common/config.h" #include "common/status.h" @@ -275,6 +276,11 @@ Status HeartbeatServer::_heartbeat(const TMasterInfo& master_info) { LOG(INFO) << "set config cloud_unique_id " << master_info.cloud_unique_id << " " << st; } + if (master_info.__isset.tablet_report_inactive_duration_ms) { + doris::g_tablet_report_inactive_duration_ms = + master_info.tablet_report_inactive_duration_ms; + } + if (need_report) { LOG(INFO) << "Master FE is changed or restarted. report tablet and disk info immediately"; _engine.notify_listeners(); diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 5906511ce157949..d9efe6dbedde24d 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -48,6 +48,8 @@ #include "cloud/cloud_delete_task.h" #include "cloud/cloud_engine_calc_delete_bitmap_task.h" #include "cloud/cloud_schema_change_job.h" +#include "cloud/cloud_tablet_mgr.h" +#include "cloud/config.h" #include "common/config.h" #include "common/logging.h" #include "common/status.h" @@ -116,6 +118,10 @@ bool register_task_info(const TTaskType::type task_type, int64_t signature) { // no need to report task of these types return true; } + if (task_type == TTaskType::type::DROP && config::is_cloud_mode()) { + // cloud no need to report drop task status + return true; + } if (signature == -1) { // No need to report task with unintialized signature return true; @@ -1134,6 +1140,46 @@ void report_tablet_callback(StorageEngine& engine, const TMasterInfo& master_inf } } +void report_tablet_callback(CloudStorageEngine& engine, const TMasterInfo& master_info) { + // Random sleep 1~5 seconds before doing report. + // In order to avoid the problem that the FE receives many report requests at the same time + // and can not be processed. 
+ if (config::report_random_wait) { + random_sleep(5); + } + + TReportRequest request; + request.__set_backend(BackendOptions::get_local_backend()); + request.__isset.tablets = true; + + increase_report_version(); + uint64_t report_version; + uint64_t total_num_tablets = 0; + for (int i = 0; i < 5; i++) { + request.tablets.clear(); + report_version = s_report_version; + engine.tablet_mgr().build_all_report_tablets_info(&request.tablets, &total_num_tablets); + if (report_version == s_report_version) { + break; + } + } + + if (report_version < s_report_version) { + LOG(WARNING) << "report version " << report_version << " change to " << s_report_version; + DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1); + return; + } + + request.__set_report_version(report_version); + request.__set_num_tablets(total_num_tablets); + + bool succ = handle_report(request, master_info, "tablet"); + report_tablet_total << 1; + if (!succ) [[unlikely]] { + report_tablet_failed << 1; + } +} + void upload_callback(StorageEngine& engine, ExecEnv* env, const TAgentTaskRequest& req) { const auto& upload_request = req.upload_req; @@ -1610,6 +1656,21 @@ void drop_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) { remove_task_info(req.task_type, req.signature); } +void drop_tablet_callback(CloudStorageEngine& engine, const TAgentTaskRequest& req) { + const auto& drop_tablet_req = req.drop_tablet_req; + DBUG_EXECUTE_IF("WorkPoolCloudDropTablet.drop_tablet_callback.failed", { + LOG_WARNING("WorkPoolCloudDropTablet.drop_tablet_callback.failed") + .tag("tablet_id", drop_tablet_req.tablet_id); + return; + }); + // 1. erase lru from tablet mgr + // TODO(dx) clean tablet file cache + // get tablet's info(such as cachekey, tablet id, rsid) + engine.tablet_mgr().erase_tablet(drop_tablet_req.tablet_id); + // 2. 
gen clean file cache task + return; +} + void push_callback(StorageEngine& engine, const TAgentTaskRequest& req) { const auto& push_req = req.push_req; diff --git a/be/src/agent/task_worker_pool.h b/be/src/agent/task_worker_pool.h index f51d6c2a4c0dc04..c50ac57ffe9b743 100644 --- a/be/src/agent/task_worker_pool.h +++ b/be/src/agent/task_worker_pool.h @@ -155,6 +155,8 @@ void create_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) void drop_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req); +void drop_tablet_callback(CloudStorageEngine& engine, const TAgentTaskRequest& req); + void clear_transaction_task_callback(StorageEngine& engine, const TAgentTaskRequest& req); void push_callback(StorageEngine& engine, const TAgentTaskRequest& req); @@ -188,6 +190,8 @@ void report_disk_callback(CloudStorageEngine& engine, const TMasterInfo& master_ void report_tablet_callback(StorageEngine& engine, const TMasterInfo& master_info); +void report_tablet_callback(CloudStorageEngine& engine, const TMasterInfo& master_info); + void calc_delete_bitmap_callback(CloudStorageEngine& engine, const TAgentTaskRequest& req); } // namespace doris diff --git a/be/src/cloud/cloud_base_compaction.cpp b/be/src/cloud/cloud_base_compaction.cpp index f431eaf850bbd19..88d83000e95dfaa 100644 --- a/be/src/cloud/cloud_base_compaction.cpp +++ b/be/src/cloud/cloud_base_compaction.cpp @@ -124,7 +124,8 @@ Status CloudBaseCompaction::prepare_compact() { for (auto& rs : _input_rowsets) { _input_row_num += rs->num_rows(); _input_segments += rs->num_segments(); - _input_rowsets_size += rs->data_disk_size(); + _input_rowsets_data_size += rs->data_disk_size(); + _input_rowsets_total_size += rs->total_disk_size(); } LOG_INFO("start CloudBaseCompaction, tablet_id={}, range=[{}-{}]", _tablet->tablet_id(), _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) @@ -132,7 +133,9 @@ Status CloudBaseCompaction::prepare_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size); + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size); return st; } @@ -270,17 +273,21 @@ Status CloudBaseCompaction::execute_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total", _input_rowsets_total_size) .tag("output_rows", _output_rowset->num_rows()) .tag("output_segments", _output_rowset->num_segments()) - .tag("output_data_size", _output_rowset->data_disk_size()); + .tag("output_rowset_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_index_size", _output_rowset->index_disk_size()) + .tag("output_rowset_total_size", _output_rowset->total_disk_size()); //_compaction_succeed = true; _state = CompactionState::SUCCESS; DorisMetrics::instance()->base_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_size); - base_output_size << _output_rowset->data_disk_size(); + DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_total_size); + base_output_size << 
_output_rowset->total_disk_size(); return Status::OK(); } @@ -302,8 +309,8 @@ Status CloudBaseCompaction::modify_rowsets() { compaction_job->set_output_cumulative_point(cloud_tablet()->cumulative_layer_point()); compaction_job->set_num_input_rows(_input_row_num); compaction_job->set_num_output_rows(_output_rowset->num_rows()); - compaction_job->set_size_input_rowsets(_input_rowsets_size); - compaction_job->set_size_output_rowsets(_output_rowset->data_disk_size()); + compaction_job->set_size_input_rowsets(_input_rowsets_total_size); + compaction_job->set_size_output_rowsets(_output_rowset->total_disk_size()); compaction_job->set_num_input_segments(_input_segments); compaction_job->set_num_output_segments(_output_rowset->num_segments()); compaction_job->set_num_input_rowsets(_input_rowsets.size()); diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index 7910d94534e086b..8eb925776934874 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -164,7 +164,9 @@ Status CloudCumulativeCompaction::prepare_compact() { for (auto& rs : _input_rowsets) { _input_row_num += rs->num_rows(); _input_segments += rs->num_segments(); - _input_rowsets_size += rs->data_disk_size(); + _input_rowsets_data_size += rs->data_disk_size(); + _input_rowsets_index_size += rs->index_disk_size(); + _input_rowsets_total_size += rs->total_disk_size(); } LOG_INFO("start CloudCumulativeCompaction, tablet_id={}, range=[{}-{}]", _tablet->tablet_id(), _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) @@ -172,7 +174,9 @@ Status CloudCumulativeCompaction::prepare_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size) .tag("tablet_max_version", cloud_tablet()->max_version_unlocked()) .tag("cumulative_point", cloud_tablet()->cumulative_layer_point()) .tag("num_rowsets", cloud_tablet()->fetch_add_approximate_num_rowsets(0)) @@ -201,10 +205,14 @@ Status CloudCumulativeCompaction::execute_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size) .tag("output_rows", _output_rowset->num_rows()) .tag("output_segments", _output_rowset->num_segments()) - .tag("output_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_index_size", _output_rowset->index_disk_size()) + .tag("output_rowset_total_size", _output_rowset->total_disk_size()) .tag("tablet_max_version", _tablet->max_version_unlocked()) .tag("cumulative_point", cloud_tablet()->cumulative_layer_point()) .tag("num_rowsets", cloud_tablet()->fetch_add_approximate_num_rowsets(0)) @@ -213,8 +221,9 @@ Status CloudCumulativeCompaction::execute_compact() { _state = CompactionState::SUCCESS; DorisMetrics::instance()->cumulative_compaction_deltas_total->increment(_input_rowsets.size()); - 
DorisMetrics::instance()->cumulative_compaction_bytes_total->increment(_input_rowsets_size); - cumu_output_size << _output_rowset->data_disk_size(); + DorisMetrics::instance()->cumulative_compaction_bytes_total->increment( + _input_rowsets_total_size); + cumu_output_size << _output_rowset->total_disk_size(); return Status::OK(); } @@ -243,8 +252,8 @@ Status CloudCumulativeCompaction::modify_rowsets() { compaction_job->set_output_cumulative_point(new_cumulative_point); compaction_job->set_num_input_rows(_input_row_num); compaction_job->set_num_output_rows(_output_rowset->num_rows()); - compaction_job->set_size_input_rowsets(_input_rowsets_size); - compaction_job->set_size_output_rowsets(_output_rowset->data_disk_size()); + compaction_job->set_size_input_rowsets(_input_rowsets_total_size); + compaction_job->set_size_output_rowsets(_output_rowset->total_disk_size()); compaction_job->set_num_input_segments(_input_segments); compaction_job->set_num_output_segments(_output_rowset->num_segments()); compaction_job->set_num_input_rowsets(_input_rowsets.size()); diff --git a/be/src/cloud/cloud_cumulative_compaction_policy.cpp b/be/src/cloud/cloud_cumulative_compaction_policy.cpp index f9af469e56f60a1..5a9879387b23278 100644 --- a/be/src/cloud/cloud_cumulative_compaction_policy.cpp +++ b/be/src/cloud/cloud_cumulative_compaction_policy.cpp @@ -209,7 +209,7 @@ int64_t CloudSizeBasedCumulativeCompactionPolicy::new_cumulative_point( // if rowsets have no delete version, check output_rowset total disk size satisfies promotion size. return output_rowset->start_version() == last_cumulative_point && (last_delete_version.first != -1 || - output_rowset->data_disk_size() >= cloud_promotion_size(tablet) || + output_rowset->total_disk_size() >= cloud_promotion_size(tablet) || satisfy_promotion_version) ? 
output_rowset->end_version() + 1 : last_cumulative_point; diff --git a/be/src/cloud/cloud_full_compaction.cpp b/be/src/cloud/cloud_full_compaction.cpp index f22c449223c448b..c27b728c93d29b1 100644 --- a/be/src/cloud/cloud_full_compaction.cpp +++ b/be/src/cloud/cloud_full_compaction.cpp @@ -98,7 +98,9 @@ Status CloudFullCompaction::prepare_compact() { for (auto& rs : _input_rowsets) { _input_row_num += rs->num_rows(); _input_segments += rs->num_segments(); - _input_rowsets_size += rs->data_disk_size(); + _input_rowsets_data_size += rs->data_disk_size(); + _input_rowsets_index_size += rs->index_disk_size(); + _input_rowsets_total_size += rs->total_disk_size(); } LOG_INFO("start CloudFullCompaction, tablet_id={}, range=[{}-{}]", _tablet->tablet_id(), _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) @@ -106,7 +108,9 @@ Status CloudFullCompaction::prepare_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size); + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size); return st; } @@ -162,16 +166,20 @@ Status CloudFullCompaction::execute_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size) .tag("output_rows", _output_rowset->num_rows()) .tag("output_segments", _output_rowset->num_segments()) - .tag("output_data_size", _output_rowset->data_disk_size()); + .tag("output_rowset_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_index_size", _output_rowset->index_disk_size()) + .tag("output_rowset_total_size", _output_rowset->total_disk_size()); _state = CompactionState::SUCCESS; DorisMetrics::instance()->full_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->full_compaction_bytes_total->increment(_input_rowsets_size); - full_output_size << _output_rowset->data_disk_size(); + DorisMetrics::instance()->full_compaction_bytes_total->increment(_input_rowsets_total_size); + full_output_size << _output_rowset->total_disk_size(); return Status::OK(); } @@ -193,8 +201,8 @@ Status CloudFullCompaction::modify_rowsets() { compaction_job->set_output_cumulative_point(_output_rowset->end_version() + 1); compaction_job->set_num_input_rows(_input_row_num); compaction_job->set_num_output_rows(_output_rowset->num_rows()); - compaction_job->set_size_input_rowsets(_input_rowsets_size); - compaction_job->set_size_output_rowsets(_output_rowset->data_disk_size()); + compaction_job->set_size_input_rowsets(_input_rowsets_total_size); + compaction_job->set_size_output_rowsets(_output_rowset->total_disk_size()); DBUG_EXECUTE_IF("CloudFullCompaction::modify_rowsets.wrong_compaction_data_size", { compaction_job->set_size_input_rowsets(1); compaction_job->set_size_output_rowsets(10000001); @@ -345,7 +353,7 @@ Status CloudFullCompaction::_cloud_full_compaction_update_delete_bitmap(int64_t .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_total_size", 
_input_rowsets_total_size) .tag("update_bitmap_size", delete_bitmap->delete_bitmap.size()); _tablet->tablet_meta()->delete_bitmap().merge(*delete_bitmap); return Status::OK(); diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index 7dc9a4f11a157c3..74d14911f62b98b 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #include "cloud/cloud_tablet.h" #include "cloud/config.h" #include "cloud/pb_convert.h" +#include "common/config.h" #include "common/logging.h" #include "common/status.h" #include "cpp/sync_point.h" @@ -51,6 +53,7 @@ #include "olap/olap_common.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/storage_engine.h" #include "olap/tablet_meta.h" #include "runtime/client_cache.h" @@ -292,6 +295,9 @@ static std::string debug_info(const Request& req) { return fmt::format(" tablet_id={}", req.rowset_meta().tablet_id()); } else if constexpr (is_any_v) { return fmt::format(" tablet_id={}", req.tablet_id()); + } else if constexpr (is_any_v) { + return fmt::format(" table_id={}, tablet_id={}, lock_id={}", req.table_id(), + req.tablet_id(), req.lock_id()); } else { static_assert(!sizeof(Request)); } @@ -410,6 +416,10 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ req.set_cumulative_point(tablet->cumulative_layer_point()); } req.set_end_version(-1); + // backend side use schema dict + if (config::variant_use_cloud_schema_dict) { + req.set_schema_op(GetRowsetRequest::RETURN_DICT); + } VLOG_DEBUG << "send GetRowsetRequest: " << req.ShortDebugString(); stub->get_rowset(&cntl, &req, &resp, nullptr); @@ -524,7 +534,8 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ existed_rowset->rowset_id().to_string() == cloud_rs_meta_pb.rowset_id_v2()) { continue; // Same rowset, skip it } - RowsetMetaPB meta_pb = cloud_rowset_meta_to_doris(cloud_rs_meta_pb); + RowsetMetaPB meta_pb = cloud_rowset_meta_to_doris( + cloud_rs_meta_pb, resp.has_schema_dict() ? 
&resp.schema_dict() : nullptr); auto rs_meta = std::make_shared(); rs_meta->init_from_pb(meta_pb); RowsetSharedPtr rowset; @@ -750,6 +761,7 @@ Status CloudMetaMgr::commit_rowset(const RowsetMeta& rs_meta, Status ret_st; TEST_INJECTION_POINT_RETURN_WITH_VALUE("CloudMetaMgr::commit_rowset", ret_st); } + check_table_size_correctness(rs_meta); CreateRowsetRequest req; CreateRowsetResponse resp; req.set_cloud_unique_id(config::cloud_unique_id); @@ -1107,6 +1119,25 @@ Status CloudMetaMgr::get_delete_bitmap_update_lock(const CloudTablet& tablet, in return st; } +Status CloudMetaMgr::remove_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id, + int64_t initiator) { + VLOG_DEBUG << "remove_delete_bitmap_update_lock , tablet_id: " << tablet.tablet_id() + << ",lock_id:" << lock_id; + RemoveDeleteBitmapUpdateLockRequest req; + RemoveDeleteBitmapUpdateLockResponse res; + req.set_cloud_unique_id(config::cloud_unique_id); + req.set_tablet_id(tablet.tablet_id()); + req.set_lock_id(lock_id); + req.set_initiator(initiator); + auto st = retry_rpc("remove delete bitmap update lock", req, &res, + &MetaService_Stub::remove_delete_bitmap_update_lock); + if (!st.ok()) { + LOG(WARNING) << "remove delete bitmap update lock fail,tablet_id=" << tablet.tablet_id() + << " lock_id=" << lock_id << " st=" << st.to_string(); + } + return st; +} + Status CloudMetaMgr::remove_old_version_delete_bitmap( int64_t tablet_id, const std::vector>& to_delete) { @@ -1125,4 +1156,124 @@ Status CloudMetaMgr::remove_old_version_delete_bitmap( return st; } +void CloudMetaMgr::check_table_size_correctness(const RowsetMeta& rs_meta) { + if (!config::enable_table_size_correctness_check) { + return; + } + int64_t total_segment_size = get_segment_file_size(rs_meta); + int64_t total_inverted_index_size = get_inverted_index_file_szie(rs_meta); + if (rs_meta.data_disk_size() != total_segment_size || + rs_meta.index_disk_size() != total_inverted_index_size || + rs_meta.data_disk_size() + rs_meta.index_disk_size() != rs_meta.total_disk_size()) { + LOG(WARNING) << "[Cloud table table size check failed]:" + << " tablet id: " << rs_meta.tablet_id() + << ", rowset id:" << rs_meta.rowset_id() + << ", rowset data disk size:" << rs_meta.data_disk_size() + << ", rowset real data disk size:" << total_segment_size + << ", rowset index disk size:" << rs_meta.index_disk_size() + << ", rowset real index disk size:" << total_inverted_index_size + << ", rowset total disk size:" << rs_meta.total_disk_size() + << ", rowset segment path:" + << StorageResource().remote_segment_path(rs_meta.tablet_id(), + rs_meta.rowset_id().to_string(), 0); + DCHECK(false); + } +} + +int64_t CloudMetaMgr::get_segment_file_size(const RowsetMeta& rs_meta) { + int64_t total_segment_size = 0; + const auto fs = const_cast(rs_meta).fs(); + if (!fs) { + LOG(WARNING) << "get fs failed, resource_id={}" << rs_meta.resource_id(); + } + for (int64_t seg_id = 0; seg_id < rs_meta.num_segments(); seg_id++) { + std::string segment_path = StorageResource().remote_segment_path( + rs_meta.tablet_id(), rs_meta.rowset_id().to_string(), seg_id); + int64_t segment_file_size = 0; + auto st = fs->file_size(segment_path, &segment_file_size); + if (!st.ok()) { + segment_file_size = 0; + if (st.is()) { + LOG(INFO) << "cloud table size correctness check get segment size 0 because " + "file not exist! msg:" + << st.msg() << ", segment path:" << segment_path; + } else { + LOG(WARNING) << "cloud table size correctness check get segment size failed! 
msg:" + << st.msg() << ", segment path:" << segment_path; + } + } + total_segment_size += segment_file_size; + } + return total_segment_size; +} + +int64_t CloudMetaMgr::get_inverted_index_file_szie(const RowsetMeta& rs_meta) { + int64_t total_inverted_index_size = 0; + const auto fs = const_cast(rs_meta).fs(); + if (!fs) { + LOG(WARNING) << "get fs failed, resource_id={}" << rs_meta.resource_id(); + } + if (rs_meta.tablet_schema()->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { + auto indices = rs_meta.tablet_schema()->indexes(); + for (auto& index : indices) { + // only get file_size for inverted index + if (index.index_type() != IndexType::INVERTED) { + continue; + } + for (int seg_id = 0; seg_id < rs_meta.num_segments(); ++seg_id) { + std::string segment_path = StorageResource().remote_segment_path( + rs_meta.tablet_id(), rs_meta.rowset_id().to_string(), seg_id); + int64_t file_size = 0; + + std::string inverted_index_file_path = + InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_path), + index.index_id(), index.get_index_suffix()); + auto st = fs->file_size(inverted_index_file_path, &file_size); + if (!st.ok()) { + file_size = 0; + if (st.is()) { + LOG(INFO) << "cloud table size correctness check get inverted index v1 " + "0 because file not exist! msg:" + << st.msg() + << ", inverted index path:" << inverted_index_file_path; + } else { + LOG(WARNING) + << "cloud table size correctness check get inverted index v1 " + "size failed! msg:" + << st.msg() << ", inverted index path:" << inverted_index_file_path; + } + } + total_inverted_index_size += file_size; + } + } + } else { + for (int seg_id = 0; seg_id < rs_meta.num_segments(); ++seg_id) { + int64_t file_size = 0; + std::string segment_path = StorageResource().remote_segment_path( + rs_meta.tablet_id(), rs_meta.rowset_id().to_string(), seg_id); + + std::string inverted_index_file_path = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_path)); + auto st = fs->file_size(inverted_index_file_path, &file_size); + if (!st.ok()) { + file_size = 0; + if (st.is()) { + LOG(INFO) << "cloud table size correctness check get inverted index v2 " + "0 because file not exist! msg:" + << st.msg() << ", inverted index path:" << inverted_index_file_path; + } else { + LOG(WARNING) << "cloud table size correctness check get inverted index v2 " + "size failed! 
msg:" + << st.msg() + << ", inverted index path:" << inverted_index_file_path; + } + } + total_inverted_index_size += file_size; + } + } + return total_inverted_index_size; +} + } // namespace doris::cloud diff --git a/be/src/cloud/cloud_meta_mgr.h b/be/src/cloud/cloud_meta_mgr.h index 79cdb3fd3d1f8c0..a6d7ccd201f6088 100644 --- a/be/src/cloud/cloud_meta_mgr.h +++ b/be/src/cloud/cloud_meta_mgr.h @@ -101,6 +101,9 @@ class CloudMetaMgr { Status get_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id, int64_t initiator); + Status remove_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id, + int64_t initiator); + Status remove_old_version_delete_bitmap( int64_t tablet_id, const std::vector>& to_delete); @@ -113,6 +116,9 @@ class CloudMetaMgr { Status sync_tablet_delete_bitmap(CloudTablet* tablet, int64_t old_max_version, std::ranges::range auto&& rs_metas, const TabletStatsPB& stats, const TabletIndexPB& idx, DeleteBitmap* delete_bitmap); + void check_table_size_correctness(const RowsetMeta& rs_meta); + int64_t get_segment_file_size(const RowsetMeta& rs_meta); + int64_t get_inverted_index_file_szie(const RowsetMeta& rs_meta); }; } // namespace cloud diff --git a/be/src/cloud/cloud_rowset_builder.cpp b/be/src/cloud/cloud_rowset_builder.cpp index 192da0f17efa825..2e6764b33aa79cb 100644 --- a/be/src/cloud/cloud_rowset_builder.cpp +++ b/be/src/cloud/cloud_rowset_builder.cpp @@ -106,7 +106,7 @@ void CloudRowsetBuilder::update_tablet_stats() { tablet->fetch_add_approximate_num_rowsets(1); tablet->fetch_add_approximate_num_segments(_rowset->num_segments()); tablet->fetch_add_approximate_num_rows(_rowset->num_rows()); - tablet->fetch_add_approximate_data_size(_rowset->data_disk_size()); + tablet->fetch_add_approximate_data_size(_rowset->total_disk_size()); tablet->fetch_add_approximate_cumu_num_rowsets(1); tablet->fetch_add_approximate_cumu_num_deltas(_rowset->num_segments()); tablet->write_count.fetch_add(1, std::memory_order_relaxed); diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index b7e3be93e853bb9..896804578d7db9c 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -344,7 +344,7 @@ Status CloudSchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParam sc_job->add_txn_ids(rs->txn_id()); sc_job->add_output_versions(rs->end_version()); num_output_rows += rs->num_rows(); - size_output_rowsets += rs->data_disk_size(); + size_output_rowsets += rs->total_disk_size(); num_output_segments += rs->num_segments(); } sc_job->set_num_output_rows(num_output_rows); diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index b944db87030c29b..b467703637c9618 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -108,6 +108,36 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version, return capture_rs_readers_unlocked(version_path, rs_splits); } +Status CloudTablet::merge_rowsets_schema() { + // Find the rowset with the max version + auto max_version_rowset = + std::max_element( + _rs_version_map.begin(), _rs_version_map.end(), + [](const auto& a, const auto& b) { + return !a.second->tablet_schema() + ? true + : (!b.second->tablet_schema() + ? 
false + : a.second->tablet_schema()->schema_version() < + b.second->tablet_schema() + ->schema_version()); + }) + ->second; + TabletSchemaSPtr max_version_schema = max_version_rowset->tablet_schema(); + // If the schema has variant columns, perform a merge to create a wide tablet schema + if (max_version_schema->num_variant_columns() > 0) { + std::vector schemas; + std::transform(_rs_version_map.begin(), _rs_version_map.end(), std::back_inserter(schemas), + [](const auto& rs_meta) { return rs_meta.second->tablet_schema(); }); + // Merge the collected schemas to obtain the least common schema + RETURN_IF_ERROR(vectorized::schema_util::get_least_common_schema(schemas, nullptr, + max_version_schema)); + VLOG_DEBUG << "dump schema: " << max_version_schema->dump_full_schema(); + _merged_tablet_schema = max_version_schema; + } + return Status::OK(); +} + // There are only two tablet_states RUNNING and NOT_READY in cloud mode // This function will erase the tablet from `CloudTabletMgr` when it can't find this tablet in MS. Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data) { @@ -133,6 +163,10 @@ Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data) if (st.is()) { clear_cache(); } + + // Merge all rowset schemas within a CloudTablet + RETURN_IF_ERROR(merge_rowsets_schema()); + return st; } @@ -188,16 +222,7 @@ Status CloudTablet::sync_if_not_running() { } TabletSchemaSPtr CloudTablet::merged_tablet_schema() const { - std::shared_lock rdlock(_meta_lock); - TabletSchemaSPtr target_schema; - std::vector schemas; - for (const auto& [_, rowset] : _rs_version_map) { - schemas.push_back(rowset->tablet_schema()); - } - // get the max version schema and merge all schema - static_cast( - vectorized::schema_util::get_least_common_schema(schemas, nullptr, target_schema)); - return target_schema; + return _merged_tablet_schema; } void CloudTablet::add_rowsets(std::vector to_add, bool version_overlap, @@ -412,7 +437,7 @@ int CloudTablet::delete_expired_stale_rowsets() { void CloudTablet::update_base_size(const Rowset& rs) { // Define base rowset as the rowset of version [2-x] if (rs.start_version() == 2) { - _base_size = rs.data_disk_size(); + _base_size = rs.total_disk_size(); } } @@ -872,4 +897,12 @@ Status CloudTablet::sync_meta() { return Status::OK(); } +void CloudTablet::build_tablet_report_info(TTabletInfo* tablet_info) { + std::shared_lock rdlock(_meta_lock); + tablet_info->__set_total_version_count(_tablet_meta->version_count()); + tablet_info->__set_tablet_id(_tablet_meta->tablet_id()); + // Currently, this information will not be used by the cloud report, + // but it may be used in the future. 
+} + } // namespace doris diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 53747dc19e27dea..5f4785b62d23746 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -196,10 +196,13 @@ class CloudTablet final : public BaseTablet { int64_t last_base_compaction_success_time_ms = 0; int64_t last_cumu_compaction_success_time_ms = 0; int64_t last_cumu_no_suitable_version_ms = 0; + int64_t last_access_time_ms = 0; // Return merged extended schema TabletSchemaSPtr merged_tablet_schema() const override; + void build_tablet_report_info(TTabletInfo* tablet_info); + private: // FIXME(plat1ko): No need to record base size if rowsets are ordered by version void update_base_size(const Rowset& rs); @@ -208,6 +211,9 @@ class CloudTablet final : public BaseTablet { Status sync_if_not_running(); + // Merge all rowset schemas within a CloudTablet + Status merge_rowsets_schema(); + CloudStorageEngine& _engine; // this mutex MUST ONLY be used when sync meta @@ -246,6 +252,9 @@ class CloudTablet final : public BaseTablet { std::mutex _base_compaction_lock; std::mutex _cumulative_compaction_lock; mutable std::mutex _rowset_update_lock; + + // Schema will be merged from all rowsets when sync_rowsets + TabletSchemaSPtr _merged_tablet_schema; }; using CloudTabletSPtr = std::shared_ptr; diff --git a/be/src/cloud/cloud_tablet_mgr.cpp b/be/src/cloud/cloud_tablet_mgr.cpp index e5c31785c1eb1c0..7ecb72e62fd5deb 100644 --- a/be/src/cloud/cloud_tablet_mgr.cpp +++ b/be/src/cloud/cloud_tablet_mgr.cpp @@ -28,6 +28,7 @@ #include "runtime/memory/cache_policy.h" namespace doris { +uint64_t g_tablet_report_inactive_duration_ms = 0; namespace { // port from @@ -142,6 +143,12 @@ CloudTabletMgr::CloudTabletMgr(CloudStorageEngine& engine) CloudTabletMgr::~CloudTabletMgr() = default; +void set_tablet_access_time_ms(CloudTablet* tablet) { + using namespace std::chrono; + int64_t now = duration_cast(system_clock::now().time_since_epoch()).count(); + tablet->last_access_time_ms = now; +} + Result> CloudTabletMgr::get_tablet(int64_t tablet_id, bool warmup_data) { // LRU value type. `Value`'s lifetime MUST NOT be longer than `CloudTabletMgr` @@ -181,8 +188,11 @@ Result> CloudTabletMgr::get_tablet(int64_t tablet_i auto* handle = _cache->insert(key, value.release(), 1, sizeof(CloudTablet), CachePriority::NORMAL); - auto ret = std::shared_ptr( - tablet.get(), [this, handle](...) { _cache->release(handle); }); + auto ret = + std::shared_ptr(tablet.get(), [this, handle](CloudTablet* tablet) { + set_tablet_access_time_ms(tablet); + _cache->release(handle); + }); _tablet_map->put(std::move(tablet)); return ret; }; @@ -191,12 +201,16 @@ Result> CloudTabletMgr::get_tablet(int64_t tablet_i if (tablet == nullptr) { return ResultError(Status::InternalError("failed to get tablet {}", tablet_id)); } + set_tablet_access_time_ms(tablet.get()); return tablet; } CloudTablet* tablet_raw_ptr = reinterpret_cast(_cache->value(handle))->tablet.get(); - auto tablet = std::shared_ptr(tablet_raw_ptr, - [this, handle](...) 
{ _cache->release(handle); }); + set_tablet_access_time_ms(tablet_raw_ptr); + auto tablet = std::shared_ptr(tablet_raw_ptr, [this, handle](CloudTablet* tablet) { + set_tablet_access_time_ms(tablet); + _cache->release(handle); + }); return tablet; } @@ -357,4 +371,54 @@ Status CloudTabletMgr::get_topn_tablets_to_compact( return Status::OK(); } +void CloudTabletMgr::build_all_report_tablets_info(std::map* tablets_info, + uint64_t* tablet_num) { + DCHECK(tablets_info != nullptr); + VLOG_NOTICE << "begin to build all report cloud tablets info"; + + HistogramStat tablet_version_num_hist; + + auto handler = [&](const std::weak_ptr& tablet_wk) { + auto tablet = tablet_wk.lock(); + if (!tablet) return; + (*tablet_num)++; + TTabletInfo tablet_info; + tablet->build_tablet_report_info(&tablet_info); + using namespace std::chrono; + int64_t now = duration_cast(system_clock::now().time_since_epoch()).count(); + if (now - g_tablet_report_inactive_duration_ms * 1000 < tablet->last_access_time_ms) { + // the tablet is still being accessed and used in recently, so not report it + return; + } + auto& t_tablet = (*tablets_info)[tablet->tablet_id()]; + // On the cloud, a specific BE has only one tablet replica; + // there are no multiple replicas for a specific BE. + // This is only to reuse the non-cloud report protocol. + tablet_version_num_hist.add(tablet_info.total_version_count); + t_tablet.tablet_infos.emplace_back(std::move(tablet_info)); + }; + + auto weak_tablets = get_weak_tablets(); + std::for_each(weak_tablets.begin(), weak_tablets.end(), handler); + + DorisMetrics::instance()->tablet_version_num_distribution->set_histogram( + tablet_version_num_hist); + LOG(INFO) << "success to build all cloud report tablets info. all_tablet_count=" << *tablet_num + << " exceed drop time limit count=" << tablets_info->size(); +} + +void CloudTabletMgr::get_tablet_info(int64_t num_tablets, std::vector* tablets_info) { + auto weak_tablets = get_weak_tablets(); + for (auto& weak_tablet : weak_tablets) { + auto tablet = weak_tablet.lock(); + if (tablet == nullptr) { + continue; + } + if (tablets_info->size() >= num_tablets) { + return; + } + tablets_info->push_back(tablet->get_tablet_info()); + } +} + } // namespace doris diff --git a/be/src/cloud/cloud_tablet_mgr.h b/be/src/cloud/cloud_tablet_mgr.h index 976d483b36c143c..903f372cbdec5fb 100644 --- a/be/src/cloud/cloud_tablet_mgr.h +++ b/be/src/cloud/cloud_tablet_mgr.h @@ -17,6 +17,9 @@ #pragma once +#include +#include + #include #include #include @@ -31,6 +34,8 @@ class CloudStorageEngine; class LRUCachePolicy; class CountDownLatch; +extern uint64_t g_tablet_report_inactive_duration_ms; + class CloudTabletMgr { public: CloudTabletMgr(CloudStorageEngine& engine); @@ -65,6 +70,17 @@ class CloudTabletMgr { std::vector>* tablets, int64_t* max_score); + /** + * Gets tablets info and total tablet num that are reported + * + * @param tablets_info used by report + * @param tablet_num tablets in be tabletMgr, total num + */ + void build_all_report_tablets_info(std::map* tablets_info, + uint64_t* tablet_num); + + void get_tablet_info(int64_t num_tablets, std::vector* tablets_info); + private: CloudStorageEngine& _engine; diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp index e724dbea84e10ce..32e3250f87c2586 100644 --- a/be/src/cloud/config.cpp +++ b/be/src/cloud/config.cpp @@ -75,4 +75,5 @@ DEFINE_mInt32(tablet_txn_info_min_expired_seconds, "120"); DEFINE_mBool(enable_use_cloud_unique_id_from_fe, "true"); +DEFINE_mBool(enable_cloud_tablet_report, "true"); } // 
namespace doris::config diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h index 86197f924d0cad0..8af967afb8c67b0 100644 --- a/be/src/cloud/config.h +++ b/be/src/cloud/config.h @@ -108,4 +108,6 @@ DECLARE_mInt32(tablet_txn_info_min_expired_seconds); DECLARE_mBool(enable_use_cloud_unique_id_from_fe); +DECLARE_Bool(enable_cloud_tablet_report); + } // namespace doris::config diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index 466e932fb2fd9a7..1f780824e32c3db 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -17,6 +17,7 @@ #include "cloud/pb_convert.h" +#include #include #include @@ -138,19 +139,54 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { out->mutable_inverted_index_file_info()->Swap(in.mutable_inverted_index_file_info()); } -RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in) { +static void fill_schema_with_dict(const RowsetMetaCloudPB& in, RowsetMetaPB* out, + const SchemaCloudDictionary& dict) { + std::unordered_map unique_id_map; + //init map + for (ColumnPB& column : *out->mutable_tablet_schema()->mutable_column()) { + unique_id_map[column.unique_id()] = &column; + } + // column info + for (size_t i = 0; i < in.schema_dict_key_list().column_dict_key_list_size(); ++i) { + int dict_key = in.schema_dict_key_list().column_dict_key_list(i); + const ColumnPB& dict_val = dict.column_dict().at(dict_key); + ColumnPB& to_add = *out->mutable_tablet_schema()->add_column(); + to_add = dict_val; + VLOG_DEBUG << "fill dict column " << dict_val.ShortDebugString(); + } + + // index info + for (size_t i = 0; i < in.schema_dict_key_list().index_info_dict_key_list_size(); ++i) { + int dict_key = in.schema_dict_key_list().index_info_dict_key_list(i); + const TabletIndexPB& dict_val = dict.index_dict().at(dict_key); + *out->mutable_tablet_schema()->add_index() = dict_val; + VLOG_DEBUG << "fill dict index " << dict_val.ShortDebugString(); + } + + // sparse column info + for (size_t i = 0; i < in.schema_dict_key_list().sparse_column_dict_key_list_size(); ++i) { + int dict_key = in.schema_dict_key_list().sparse_column_dict_key_list(i); + const ColumnPB& dict_val = dict.column_dict().at(dict_key); + *unique_id_map.at(dict_val.parent_unique_id())->add_sparse_columns() = dict_val; + VLOG_DEBUG << "fill dict sparse column" << dict_val.ShortDebugString(); + } +} + +RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict) { RowsetMetaPB out; - cloud_rowset_meta_to_doris(&out, in); + cloud_rowset_meta_to_doris(&out, in, dict); return out; } -RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&& in) { +RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&& in, const SchemaCloudDictionary* dict) { RowsetMetaPB out; - cloud_rowset_meta_to_doris(&out, std::move(in)); + cloud_rowset_meta_to_doris(&out, std::move(in), dict); return out; } -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) { +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict) { // ATTN: please keep the set order aligned with the definition of proto `TabletSchemaCloudPB`. 
out->set_rowset_id(in.rowset_id()); out->set_partition_id(in.partition_id()); @@ -185,6 +221,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) if (in.has_tablet_schema()) { cloud_tablet_schema_to_doris(out->mutable_tablet_schema(), in.tablet_schema()); } + if (dict != nullptr) { + fill_schema_with_dict(in, out, *dict); + } out->set_txn_expiration(in.txn_expiration()); out->set_segments_overlap_pb(in.segments_overlap_pb()); out->mutable_segments_file_size()->CopyFrom(in.segments_file_size()); @@ -198,7 +237,8 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) out->mutable_inverted_index_file_info()->CopyFrom(in.inverted_index_file_info()); } -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, + const SchemaCloudDictionary* dict) { // ATTN: please keep the set order aligned with the definition of proto `TabletSchemaCloudPB`. out->set_rowset_id(in.rowset_id()); out->set_partition_id(in.partition_id()); @@ -234,6 +274,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { cloud_tablet_schema_to_doris(out->mutable_tablet_schema(), std::move(*in.mutable_tablet_schema())); } + if (dict != nullptr) { + fill_schema_with_dict(in, out, *dict); + } out->set_txn_expiration(in.txn_expiration()); out->set_segments_overlap_pb(in.segments_overlap_pb()); out->mutable_segments_file_size()->Swap(in.mutable_segments_file_size()); diff --git a/be/src/cloud/pb_convert.h b/be/src/cloud/pb_convert.h index 0cfa033f2930a0f..31fe43adb11a6da 100644 --- a/be/src/cloud/pb_convert.h +++ b/be/src/cloud/pb_convert.h @@ -24,10 +24,14 @@ RowsetMetaCloudPB doris_rowset_meta_to_cloud(const RowsetMetaPB&); RowsetMetaCloudPB doris_rowset_meta_to_cloud(RowsetMetaPB&&); void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in); void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in); -RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB&); -RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&&); -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in); -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in); +RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB&, + const SchemaCloudDictionary* dict = nullptr); +RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&&, + const SchemaCloudDictionary* dict = nullptr); +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict = nullptr); +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, + const SchemaCloudDictionary* dict = nullptr); // TabletSchemaPB <=> TabletSchemaCloudPB TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB&); diff --git a/be/src/common/compile_check_begin.h b/be/src/common/compile_check_begin.h index d3b7f60439c74e5..6da403f28948857 100644 --- a/be/src/common/compile_check_begin.h +++ b/be/src/common/compile_check_begin.h @@ -15,10 +15,16 @@ // specific language governing permissions and limitations // under the License. +#ifdef COMPILE_CHECK +#error The handling of compile_check_begin.h and compile_check_end.h is not done correctly. 
+#endif + +#define COMPILE_CHECK #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic error "-Wconversion" #pragma clang diagnostic ignored "-Wsign-conversion" #pragma clang diagnostic ignored "-Wfloat-conversion" #endif + //#include "common/compile_check_begin.h" \ No newline at end of file diff --git a/be/src/common/compile_check_end.h b/be/src/common/compile_check_end.h index 6cba13c7f669c53..0897965dc74a3dc 100644 --- a/be/src/common/compile_check_end.h +++ b/be/src/common/compile_check_end.h @@ -18,4 +18,6 @@ #ifdef __clang__ #pragma clang diagnostic pop #endif +#undef COMPILE_CHECK + // #include "common/compile_check_end.h" \ No newline at end of file diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index f7c17aefee8e054..32604a65e58dae0 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -540,7 +540,6 @@ DEFINE_mInt32(streaming_load_rpc_max_alive_time_sec, "1200"); DEFINE_Int32(tablet_writer_open_rpc_timeout_sec, "60"); // You can ignore brpc error '[E1011]The server is overcrowded' when writing data. DEFINE_mBool(tablet_writer_ignore_eovercrowded, "true"); -DEFINE_mBool(exchange_sink_ignore_eovercrowded, "true"); DEFINE_mInt32(slave_replica_writer_rpc_timeout_sec, "60"); // Whether to enable stream load record function, the default is false. // False: disable stream load record @@ -903,7 +902,8 @@ DEFINE_mInt64(small_column_size_buffer, "100"); // Perform the always_true check at intervals determined by runtime_filter_sampling_frequency DEFINE_mInt32(runtime_filter_sampling_frequency, "64"); - +DEFINE_mInt32(execution_max_rpc_timeout_sec, "3600"); +DEFINE_mBool(execution_ignore_eovercrowded, "true"); // cooldown task configs DEFINE_Int32(cooldown_thread_num, "5"); DEFINE_mInt64(generate_cooldown_task_interval_sec, "20"); @@ -982,6 +982,8 @@ DEFINE_Int32(pipeline_executor_size, "0"); DEFINE_Bool(enable_workload_group_for_scan, "false"); DEFINE_mInt64(workload_group_scan_task_wait_timeout_ms, "10000"); +// Whether use schema dict in backend side instead of MetaService side(cloud mode) +DEFINE_mBool(variant_use_cloud_schema_dict, "true"); DEFINE_mDouble(variant_ratio_of_defaults_as_sparse_column, "1"); DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048"); DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); @@ -1009,7 +1011,7 @@ DEFINE_Bool(enable_file_cache_query_limit, "false"); DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "90"); DEFINE_mInt32(file_cache_exit_disk_resource_limit_mode_percent, "80"); DEFINE_mBool(enable_read_cache_file_directly, "false"); -DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "false"); +DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "true"); DEFINE_mInt64(file_cache_ttl_valid_check_interval_second, "0"); // zero for not checking // If true, evict the ttl cache using LRU when full. // Otherwise, only expiration can evict ttl and new data won't add to cache when full. 
@@ -1290,7 +1292,7 @@ DEFINE_Int64(num_s3_file_upload_thread_pool_min_thread, "16"); // The max thread num for S3FileUploadThreadPool DEFINE_Int64(num_s3_file_upload_thread_pool_max_thread, "64"); // The max ratio for ttl cache's size -DEFINE_mInt64(max_ttl_cache_ratio, "90"); +DEFINE_mInt64(max_ttl_cache_ratio, "50"); // The maximum jvm heap usage ratio for hdfs write workload DEFINE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio, "0.5"); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1356,6 +1358,8 @@ DEFINE_mInt32(check_score_rounds_num, "1000"); DEFINE_Int32(query_cache_size, "512"); DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); +// Enable validation to check the correctness of table size. +DEFINE_Bool(enable_table_size_correctness_check, "false"); // clang-format off #ifdef BE_TEST diff --git a/be/src/common/config.h b/be/src/common/config.h index 5f73c31dcdbe342..c4fa4f9467280a8 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -587,7 +587,6 @@ DECLARE_mInt32(streaming_load_rpc_max_alive_time_sec); DECLARE_Int32(tablet_writer_open_rpc_timeout_sec); // You can ignore brpc error '[E1011]The server is overcrowded' when writing data. DECLARE_mBool(tablet_writer_ignore_eovercrowded); -DECLARE_mBool(exchange_sink_ignore_eovercrowded); DECLARE_mInt32(slave_replica_writer_rpc_timeout_sec); // Whether to enable stream load record function, the default is false. // False: disable stream load record @@ -958,6 +957,8 @@ DECLARE_mInt64(big_column_size_buffer); DECLARE_mInt64(small_column_size_buffer); DECLARE_mInt32(runtime_filter_sampling_frequency); +DECLARE_mInt32(execution_max_rpc_timeout_sec); +DECLARE_mBool(execution_ignore_eovercrowded); // cooldown task configs DECLARE_Int32(cooldown_thread_num); @@ -1181,6 +1182,7 @@ DECLARE_mInt64(LZ4_HC_compression_level); // Threshold of a column as sparse column // Notice: TEST ONLY DECLARE_mDouble(variant_ratio_of_defaults_as_sparse_column); +DECLARE_mBool(variant_use_cloud_schema_dict); // Threshold to estimate a column is sparsed // Notice: TEST ONLY DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column); @@ -1442,6 +1444,8 @@ DECLARE_mInt32(check_score_rounds_num); DECLARE_Int32(query_cache_size); DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); +// Enable validation to check the correctness of table size. 
+DECLARE_Bool(enable_table_size_correctness_check); #ifdef BE_TEST // test s3 diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 84a964f5c3865ce..fb82450ac4d600d 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -1146,8 +1146,11 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt request->set_filter_size(local_filter_size); request->set_filter_id(_filter_id); - callback->cntl_->set_timeout_ms(std::min(3600, state->execution_timeout()) * 1000); - callback->cntl_->ignore_eovercrowded(); + + callback->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(state->execution_timeout())); + if (config::execution_ignore_eovercrowded) { + callback->cntl_->ignore_eovercrowded(); + } stub->send_filter_size(closure->cntl_.get(), closure->request_.get(), closure->response_.get(), closure.get()); @@ -1181,11 +1184,14 @@ Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { pfragment_instance_id->set_lo((int64_t)this); merge_filter_request->set_filter_id(_filter_id); - merge_filter_request->set_is_pipeline(true); auto column_type = _wrapper->column_type(); RETURN_IF_CATCH_EXCEPTION(merge_filter_request->set_column_type(to_proto(column_type))); - merge_filter_callback->cntl_->set_timeout_ms(wait_time_ms()); - merge_filter_callback->cntl_->ignore_eovercrowded(); + + merge_filter_callback->cntl_->set_timeout_ms( + get_execution_rpc_timeout_ms(_state->execution_timeout)); + if (config::execution_ignore_eovercrowded) { + merge_filter_callback->cntl_->ignore_eovercrowded(); + } if (get_ignored()) { merge_filter_request->set_filter_type(PFilterType::UNKNOW_FILTER); diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index cb7944409ac2d74..42c5f598633ad9f 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -77,6 +77,10 @@ class VRuntimeFilterSlots { if (filter->get_real_type() != RuntimeFilterType::IN_FILTER) { continue; } + if (!filter->need_sync_filter_size() && + filter->type() == RuntimeFilterType::IN_OR_BLOOM_FILTER) { + continue; + } if (has_in_filter.contains(filter->expr_order())) { filter->set_ignored(); continue; @@ -84,7 +88,7 @@ class VRuntimeFilterSlots { has_in_filter.insert(filter->expr_order()); } - // process ignore filter when it has IN_FILTER on same expr, and init bloom filter size + // process ignore filter when it has IN_FILTER on same expr for (auto filter : _runtime_filters) { if (filter->get_ignored()) { continue; diff --git a/be/src/http/action/adjust_log_level.cpp b/be/src/http/action/adjust_log_level.cpp index 687639a9b58deaa..a8644a0fb5f52a3 100644 --- a/be/src/http/action/adjust_log_level.cpp +++ b/be/src/http/action/adjust_log_level.cpp @@ -17,8 +17,9 @@ #include +#include + #include "common/logging.h" -#include "common/status.h" #include "http/http_channel.h" #include "http/http_request.h" @@ -26,7 +27,7 @@ namespace doris { // **Note**: If the module_name does not exist in the vlog modules, vlog // would create corresponding module for it. 
-int handle_request(HttpRequest* req) { +std::tuple handle_request(HttpRequest* req) { auto parse_param = [&req](std::string param) { const auto& value = req->param(param); if (value.empty()) { @@ -38,13 +39,16 @@ int handle_request(HttpRequest* req) { const auto& module = parse_param("module"); const auto& level = parse_param("level"); int new_level = std::stoi(level); - return google::SetVLOGLevel(module.c_str(), new_level); + return std::make_tuple(module, google::SetVLOGLevel(module.c_str(), new_level), new_level); } void AdjustLogLevelAction::handle(HttpRequest* req) { try { - auto old_level = handle_request(req); - auto msg = fmt::format("adjust log level success, origin level is {}", old_level); + auto handle_result = handle_request(req); + auto msg = + fmt::format("adjust vlog of {} from {} to {} succeed", std::get<0>(handle_result), + std::get<1>(handle_result), std::get<2>(handle_result)); + LOG(INFO) << msg; HttpChannel::send_reply(req, msg); } catch (const std::exception& e) { HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, e.what()); diff --git a/be/src/http/action/tablets_info_action.cpp b/be/src/http/action/tablets_info_action.cpp index 9c27c1de9a02b35..672b03ce6ceaedb 100644 --- a/be/src/http/action/tablets_info_action.cpp +++ b/be/src/http/action/tablets_info_action.cpp @@ -24,6 +24,8 @@ #include #include +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet_mgr.h" #include "cloud/config.h" #include "http/http_channel.h" #include "http/http_headers.h" @@ -51,12 +53,6 @@ void TabletsInfoAction::handle(HttpRequest* req) { EasyJson TabletsInfoAction::get_tablets_info(string tablet_num_to_return) { EasyJson tablets_info_ej; - if (config::is_cloud_mode()) { - // TODO(plat1ko): CloudStorageEngine - tablets_info_ej["msg"] = "TabletsInfoAction::get_tablets_info is not implemented"; - tablets_info_ej["code"] = 0; - return tablets_info_ej; - } int64_t number; std::string msg; @@ -74,9 +70,15 @@ EasyJson TabletsInfoAction::get_tablets_info(string tablet_num_to_return) { msg = "Parameter Error"; } std::vector tablets_info; - TabletManager* tablet_manager = - ExecEnv::GetInstance()->storage_engine().to_local().tablet_manager(); - tablet_manager->obtain_specific_quantity_tablets(tablets_info, number); + if (!config::is_cloud_mode()) { + TabletManager* tablet_manager = + ExecEnv::GetInstance()->storage_engine().to_local().tablet_manager(); + tablet_manager->obtain_specific_quantity_tablets(tablets_info, number); + } else { + CloudTabletMgr& cloud_tablet_manager = + ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_mgr(); + cloud_tablet_manager.get_tablet_info(number, &tablets_info); + } tablets_info_ej["msg"] = msg; tablets_info_ej["code"] = 0; diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index f2f1f22365297b2..4fb3f3e02cb58c5 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -86,6 +86,94 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _total_evict_size_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_total_evict_size"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_index"); + 
_evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_index"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_index"); + + _evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_self_lru_disposable"); + _evict_by_self_lru_metrics_matrix[FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_self_lru_normal"); + _evict_by_self_lru_metrics_matrix[FileCacheType::INDEX] = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_self_lru_index"); + _evict_by_self_lru_metrics_matrix[FileCacheType::TTL] = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_self_lru_ttl"); + + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_index"); + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_index"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + 
"file_cache_evict_by_size_index_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_index"); + + _evict_by_try_release = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_try_release"); + _num_read_blocks = std::make_shared>(_cache_base_path.c_str(), "file_cache_num_read_blocks"); _num_hit_blocks = std::make_shared>(_cache_base_path.c_str(), @@ -109,6 +197,8 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); + _disk_limit_mode_metrics = + std::make_shared>(_cache_base_path.c_str(), "disk_limit_mode", 0); _disposable_queue = LRUQueue(cache_settings.disposable_queue_size, cache_settings.disposable_queue_elements, 60 * 60); @@ -116,7 +206,7 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, 7 * 24 * 60 * 60); _normal_queue = LRUQueue(cache_settings.query_queue_size, cache_settings.query_queue_elements, 24 * 60 * 60); - _ttl_queue = LRUQueue(std::numeric_limits::max(), std::numeric_limits::max(), + _ttl_queue = LRUQueue(cache_settings.ttl_queue_size, cache_settings.ttl_queue_elements, std::numeric_limits::max()); _recycle_keys = std::make_shared>( @@ -317,14 +407,10 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte if (st.ok()) { auto& queue = get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); - if (config::enable_ttl_cache_evict_using_lru) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = ttl_queue.add( - cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - cell.queue_iterator.reset(); - } + auto& ttl_queue = get_queue(FileCacheType::TTL); + cell.queue_iterator = + ttl_queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); } else { LOG_WARNING("Failed to change key meta").error(st); } @@ -734,11 +820,10 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha << " cache_type=" << cache_type_to_string(context.cache_type) << " error=" << st.msg(); } - if (cell.file_block->cache_type() != FileCacheType::TTL || - config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(cell.file_block->cache_type()); - cell.queue_iterator = queue.add(hash, offset, size, cache_lock); - } + + auto& queue = get_queue(cell.file_block->cache_type()); + cell.queue_iterator = queue.add(hash, offset, size, cache_lock); + if (cell.file_block->cache_type() == FileCacheType::TTL) { if (_key_to_time.find(hash) == _key_to_time.end()) { _key_to_time[hash] = context.expiration_time; @@ -761,11 +846,14 
@@ size_t BlockFileCache::try_release() { } } } + size_t remove_size = 0; for (auto& cell : trash) { FileBlockSPtr file_block = cell->file_block; std::lock_guard lc(cell->file_block->_mutex); + remove_size += file_block->range().size(); remove(file_block, cache_lock, lc); } + *_evict_by_try_release << remove_size; LOG(INFO) << "Released " << trash.size() << " blocks in file cache " << _cache_base_path; return trash.size(); } @@ -856,9 +944,10 @@ void BlockFileCache::remove_file_blocks_and_clean_time_maps( void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& removed_size, std::vector& to_evict, - std::lock_guard& cache_lock, bool is_ttl) { + std::lock_guard& cache_lock, + size_t& cur_removed_size) { for (const auto& [entry_key, entry_offset, entry_size] : queue) { - if (!is_overflow(removed_size, size, cur_cache_size, is_ttl)) { + if (!is_overflow(removed_size, size, cur_cache_size)) { break; } auto* cell = get_cell(entry_key, entry_offset, cache_lock); @@ -876,6 +965,7 @@ void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t DCHECK(file_block->_download_state == FileBlock::State::DOWNLOADED); to_evict.push_back(cell); removed_size += cell_size; + cur_removed_size += cell_size; } } } @@ -901,8 +991,9 @@ bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, } std::vector to_evict; auto collect_eliminate_fragments = [&](LRUQueue& queue) { + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); }; if (disposable_queue_size != 0) { collect_eliminate_fragments(get_queue(FileCacheType::DISPOSABLE)); @@ -929,8 +1020,9 @@ bool BlockFileCache::try_reserve_for_ttl(size_t size, std::lock_guard to_evict; + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - true); + cur_removed_size); remove_file_blocks_and_clean_time_maps(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); @@ -963,10 +1055,6 @@ bool BlockFileCache::try_reserve(const UInt128Wrapper& hash, const CacheContext& size = 5 * size; } - if (context.cache_type == FileCacheType::TTL) { - return try_reserve_for_ttl(size, cache_lock); - } - auto query_context = config::enable_file_cache_query_limit && (context.query_id.hi != 0 || context.query_id.lo != 0) ? 
get_query_context(context.query_id, cache_lock) @@ -1144,12 +1232,33 @@ void BlockFileCache::remove_if_cached_async(const UInt128Wrapper& file_key) { } } -std::vector BlockFileCache::get_other_cache_type(FileCacheType cur_cache_type) { +std::vector BlockFileCache::get_other_cache_type_without_ttl( + FileCacheType cur_cache_type) { switch (cur_cache_type) { + case FileCacheType::TTL: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::INDEX}; case FileCacheType::INDEX: return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL}; case FileCacheType::NORMAL: return {FileCacheType::DISPOSABLE, FileCacheType::INDEX}; + case FileCacheType::DISPOSABLE: + return {FileCacheType::NORMAL, FileCacheType::INDEX}; + default: + return {}; + } + return {}; +} + +std::vector BlockFileCache::get_other_cache_type(FileCacheType cur_cache_type) { + switch (cur_cache_type) { + case FileCacheType::TTL: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::INDEX}; + case FileCacheType::INDEX: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::TTL}; + case FileCacheType::NORMAL: + return {FileCacheType::DISPOSABLE, FileCacheType::INDEX, FileCacheType::TTL}; + case FileCacheType::DISPOSABLE: + return {FileCacheType::NORMAL, FileCacheType::INDEX, FileCacheType::TTL}; default: return {}; } @@ -1175,13 +1284,14 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t offset, size } bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( - std::vector other_cache_types, size_t size, int64_t cur_time, - std::lock_guard& cache_lock) { + FileCacheType cur_type, std::vector other_cache_types, size_t size, + int64_t cur_time, std::lock_guard& cache_lock) { size_t removed_size = 0; size_t cur_cache_size = _cur_cache_size; std::vector to_evict; for (FileCacheType cache_type : other_cache_types) { auto& queue = get_queue(cache_type); + size_t remove_size_per_type = 0; for (const auto& [entry_key, entry_offset, entry_size] : queue) { if (!is_overflow(removed_size, size, cur_cache_size)) { break; @@ -1203,39 +1313,48 @@ bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( DCHECK(file_block->_download_state == FileBlock::State::DOWNLOADED); to_evict.push_back(cell); removed_size += cell_size; + remove_size_per_type += cell_size; } } + *(_evict_by_heat_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; } remove_file_blocks(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); } -bool BlockFileCache::is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size, - bool is_ttl) const { +bool BlockFileCache::is_overflow(size_t removed_size, size_t need_size, + size_t cur_cache_size) const { bool ret = false; if (_disk_resource_limit_mode) { ret = (removed_size < need_size); } else { ret = (cur_cache_size + need_size - removed_size > _capacity); } - if (is_ttl) { - size_t ttl_threshold = config::max_ttl_cache_ratio * _capacity / 100; - return (ret || ((cur_cache_size + need_size - removed_size) > ttl_threshold)); - } return ret; } bool BlockFileCache::try_reserve_from_other_queue_by_size( - std::vector other_cache_types, size_t size, + FileCacheType cur_type, std::vector other_cache_types, size_t size, std::lock_guard& cache_lock) { size_t removed_size = 0; size_t cur_cache_size = _cur_cache_size; std::vector to_evict; + // we follow the privilege defined in get_other_cache_types to evict for (FileCacheType cache_type : other_cache_types) { auto& queue = get_queue(cache_type); + + // 
we will not drain each of them to the bottom -- i.e., we only + // evict what they have stolen. + size_t cur_queue_size = queue.get_capacity(cache_lock); + size_t cur_queue_max_size = queue.get_max_size(); + if (cur_queue_size <= cur_queue_max_size) { + continue; + } + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); + *(_evict_by_size_metrics_matrix[cache_type][cur_type]) << cur_removed_size; } remove_file_blocks(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); @@ -1244,16 +1363,15 @@ bool BlockFileCache::try_reserve_from_other_queue_by_size( bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, size_t size, int64_t cur_time, std::lock_guard& cache_lock) { - // disposable queue cannot reserve other queues - if (cur_cache_type == FileCacheType::DISPOSABLE) { - return false; - } - auto other_cache_types = get_other_cache_type(cur_cache_type); - bool reserve_success = try_reserve_from_other_queue_by_hot_interval(other_cache_types, size, - cur_time, cache_lock); + // currently, TTL cache is not considered as a candidate + auto other_cache_types = get_other_cache_type_without_ttl(cur_cache_type); + bool reserve_success = try_reserve_from_other_queue_by_hot_interval( + cur_cache_type, other_cache_types, size, cur_time, cache_lock); if (reserve_success || !config::file_cache_enable_evict_from_other_queue_by_size) { return reserve_success; } + + other_cache_types = get_other_cache_type(cur_cache_type); auto& cur_queue = get_queue(cur_cache_type); size_t cur_queue_size = cur_queue.get_capacity(cache_lock); size_t cur_queue_max_size = cur_queue.get_max_size(); @@ -1261,7 +1379,8 @@ bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, if (_cur_cache_size + size > _capacity && cur_queue_size + size > cur_queue_max_size) { return false; } - return try_reserve_from_other_queue_by_size(other_cache_types, size, cache_lock); + return try_reserve_from_other_queue_by_size(cur_cache_type, other_cache_types, size, + cache_lock); } bool BlockFileCache::try_reserve_for_lru(const UInt128Wrapper& hash, @@ -1277,9 +1396,11 @@ bool BlockFileCache::try_reserve_for_lru(const UInt128Wrapper& hash, size_t cur_cache_size = _cur_cache_size; std::vector to_evict; + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); remove_file_blocks(to_evict, cache_lock); + *(_evict_by_self_lru_metrics_matrix[context.cache_type]) << cur_removed_size; if (is_overflow(removed_size, size, cur_cache_size)) { return false; @@ -1579,6 +1700,7 @@ std::string BlockFileCache::reset_capacity(size_t new_capacity) { ss << " ttl_queue released " << queue_released; } _disk_resource_limit_mode = true; + _disk_limit_mode_metrics->set_value(1); _async_clear_file_cache = true; ss << " total_space_released=" << space_released; } @@ -1600,6 +1722,7 @@ void BlockFileCache::check_disk_resource_limit() { } if (_capacity > _cur_cache_size) { _disk_resource_limit_mode = false; + _disk_limit_mode_metrics->set_value(0); } std::pair percent; int ret = disk_used_percentage(_cache_base_path, &percent); @@ -1625,10 +1748,12 @@ void BlockFileCache::check_disk_resource_limit() { if (capacity_percentage >= config::file_cache_enter_disk_resource_limit_mode_percent || inode_is_insufficient(inode_percentage)) { _disk_resource_limit_mode = true; + _disk_limit_mode_metrics->set_value(1); } else if 
(_disk_resource_limit_mode && (capacity_percentage < config::file_cache_exit_disk_resource_limit_mode_percent) && (inode_percentage < config::file_cache_exit_disk_resource_limit_mode_percent)) { _disk_resource_limit_mode = false; + _disk_limit_mode_metrics->set_value(0); } if (_disk_resource_limit_mode) { // log per mins @@ -1744,14 +1869,9 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, if (st.ok()) { auto& queue = get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); - if (config::enable_ttl_cache_evict_using_lru) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = - ttl_queue.add(hash, cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - cell.queue_iterator.reset(); - } + auto& ttl_queue = get_queue(FileCacheType::TTL); + cell.queue_iterator = ttl_queue.add(hash, cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); } if (!st.ok()) { LOG_WARNING("").error(st); @@ -1909,6 +2029,12 @@ std::map BlockFileCache::get_stats() { stats["index_queue_curr_elements"] = (double)_cur_index_queue_element_count_metrics->get_value(); + stats["ttl_queue_max_size"] = (double)_ttl_queue.get_max_size(); + stats["ttl_queue_curr_size"] = (double)_cur_ttl_cache_lru_queue_cache_size_metrics->get_value(); + stats["ttl_queue_max_elements"] = (double)_ttl_queue.get_max_element_size(); + stats["ttl_queue_curr_elements"] = + (double)_cur_ttl_cache_lru_queue_element_count_metrics->get_value(); + stats["normal_queue_max_size"] = (double)_normal_queue.get_max_size(); stats["normal_queue_curr_size"] = (double)_cur_normal_queue_element_count_metrics->get_value(); stats["normal_queue_max_elements"] = (double)_normal_queue.get_max_element_size(); @@ -1925,6 +2051,36 @@ std::map BlockFileCache::get_stats() { return stats; } +// for be UTs +std::map BlockFileCache::get_stats_unsafe() { + std::map stats; + stats["hits_ratio"] = (double)_hit_ratio->get_value(); + stats["hits_ratio_5m"] = (double)_hit_ratio_5m->get_value(); + stats["hits_ratio_1h"] = (double)_hit_ratio_1h->get_value(); + + stats["index_queue_max_size"] = (double)_index_queue.get_max_size(); + stats["index_queue_curr_size"] = (double)_index_queue.get_capacity_unsafe(); + stats["index_queue_max_elements"] = (double)_index_queue.get_max_element_size(); + stats["index_queue_curr_elements"] = (double)_index_queue.get_elements_num_unsafe(); + + stats["ttl_queue_max_size"] = (double)_ttl_queue.get_max_size(); + stats["ttl_queue_curr_size"] = (double)_ttl_queue.get_capacity_unsafe(); + stats["ttl_queue_max_elements"] = (double)_ttl_queue.get_max_element_size(); + stats["ttl_queue_curr_elements"] = (double)_ttl_queue.get_elements_num_unsafe(); + + stats["normal_queue_max_size"] = (double)_normal_queue.get_max_size(); + stats["normal_queue_curr_size"] = (double)_normal_queue.get_capacity_unsafe(); + stats["normal_queue_max_elements"] = (double)_normal_queue.get_max_element_size(); + stats["normal_queue_curr_elements"] = (double)_normal_queue.get_elements_num_unsafe(); + + stats["disposable_queue_max_size"] = (double)_disposable_queue.get_max_size(); + stats["disposable_queue_curr_size"] = (double)_disposable_queue.get_capacity_unsafe(); + stats["disposable_queue_max_elements"] = (double)_disposable_queue.get_max_element_size(); + stats["disposable_queue_curr_elements"] = (double)_disposable_queue.get_elements_num_unsafe(); + + return stats; +} + template void BlockFileCache::remove(FileBlockSPtr file_block, std::lock_guard& cache_lock, 
std::lock_guard& block_lock, bool sync); diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index 4bedc7256926532..0de33dadc8249d0 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -183,6 +183,9 @@ class BlockFileCache { std::map get_stats(); + // for be UTs + std::map get_stats_unsafe(); + class LRUQueue { public: LRUQueue() = default; @@ -217,6 +220,10 @@ class BlockFileCache { return cache_size; } + size_t get_capacity_unsafe() const { return cache_size; } + + size_t get_elements_num_unsafe() const { return queue.size(); } + size_t get_elements_num(std::lock_guard& /* cache_lock */) const { return queue.size(); } @@ -383,6 +390,7 @@ class BlockFileCache { bool try_reserve_during_async_load(size_t size, std::lock_guard& cache_lock); std::vector get_other_cache_type(FileCacheType cur_cache_type); + std::vector get_other_cache_type_without_ttl(FileCacheType cur_cache_type); bool try_reserve_from_other_queue(FileCacheType cur_cache_type, size_t offset, int64_t cur_time, std::lock_guard& cache_lock); @@ -428,15 +436,16 @@ class BlockFileCache { void recycle_deleted_blocks(); - bool try_reserve_from_other_queue_by_hot_interval(std::vector other_cache_types, + bool try_reserve_from_other_queue_by_hot_interval(FileCacheType cur_type, + std::vector other_cache_types, size_t size, int64_t cur_time, std::lock_guard& cache_lock); - bool try_reserve_from_other_queue_by_size(std::vector other_cache_types, + bool try_reserve_from_other_queue_by_size(FileCacheType cur_type, + std::vector other_cache_types, size_t size, std::lock_guard& cache_lock); - bool is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size, - bool is_ttl = false) const; + bool is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size) const; void remove_file_blocks(std::vector&, std::lock_guard&); @@ -447,7 +456,7 @@ class BlockFileCache { void find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& removed_size, std::vector& to_evict, - std::lock_guard& cache_lock, bool is_ttl); + std::lock_guard& cache_lock, size_t& cur_removed_size); void recycle_stale_rowset_async_bottom_half(); @@ -506,6 +515,10 @@ class BlockFileCache { std::shared_ptr> _cur_disposable_queue_cache_size_metrics; std::array>, 4> _queue_evict_size_metrics; std::shared_ptr> _total_evict_size_metrics; + std::shared_ptr> _evict_by_heat_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_size_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_self_lru_metrics_matrix[4]; + std::shared_ptr> _evict_by_try_release; std::shared_ptr>> _num_hit_blocks_5m; std::shared_ptr>> _num_read_blocks_5m; @@ -519,6 +532,7 @@ class BlockFileCache { std::shared_ptr> _hit_ratio; std::shared_ptr> _hit_ratio_5m; std::shared_ptr> _hit_ratio_1h; + std::shared_ptr> _disk_limit_mode_metrics; }; } // namespace doris::io diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index c569ace0011866f..674879300452dfc 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -34,6 +34,7 @@ std::string FileCacheSettings::to_string() const { << ", disposable_queue_elements: " << disposable_queue_elements << ", index_queue_size: " << index_queue_size << ", index_queue_elements: " << index_queue_elements + << ", ttl_queue_size: " << ttl_queue_size << ", ttl_queue_elements: " << ttl_queue_elements << ", query_queue_size: " << query_queue_size << ", query_queue_elements: " << query_queue_elements << 
", storage: " << storage; return ss.str(); @@ -58,6 +59,10 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach std::max(settings.index_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); + settings.ttl_queue_size = per_size * config::max_ttl_cache_ratio; + settings.ttl_queue_elements = std::max(settings.ttl_queue_size / settings.max_file_block_size, + REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); + settings.query_queue_size = settings.capacity - settings.disposable_queue_size - settings.index_queue_size; settings.query_queue_elements = diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 21309831a8284c9..30579ba7851b28e 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -26,17 +26,17 @@ namespace doris::io { inline static constexpr size_t REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS = 100 * 1024; inline static constexpr size_t FILE_CACHE_MAX_FILE_BLOCK_SIZE = 1 * 1024 * 1024; -inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 85; -inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 10; +inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 40; +inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 5; inline static constexpr size_t DEFAULT_INDEX_PERCENT = 5; using uint128_t = vectorized::UInt128; -enum class FileCacheType { - INDEX, - NORMAL, - DISPOSABLE, - TTL, +enum FileCacheType { + INDEX = 2, + NORMAL = 1, + DISPOSABLE = 0, + TTL = 3, }; struct UInt128Wrapper { @@ -93,6 +93,8 @@ struct FileCacheSettings { size_t index_queue_elements {0}; size_t query_queue_size {0}; size_t query_queue_elements {0}; + size_t ttl_queue_size {0}; + size_t ttl_queue_elements {0}; size_t max_file_block_size {0}; size_t max_query_cache_size {0}; std::string storage; diff --git a/be/src/io/hdfs_builder.cpp b/be/src/io/hdfs_builder.cpp index 9ecb8bcab348b45..59ca46e86944df8 100644 --- a/be/src/io/hdfs_builder.cpp +++ b/be/src/io/hdfs_builder.cpp @@ -27,7 +27,9 @@ #include "common/config.h" #include "common/logging.h" +#ifdef USE_HADOOP_HDFS #include "hadoop_hdfs/hdfs.h" +#endif #include "io/fs/hdfs.h" #include "util/string_util.h" diff --git a/be/src/io/hdfs_util.cpp b/be/src/io/hdfs_util.cpp index 6c1bbf80a1526f1..62546c9bbd4ffb6 100644 --- a/be/src/io/hdfs_util.cpp +++ b/be/src/io/hdfs_util.cpp @@ -17,10 +17,13 @@ #include "io/hdfs_util.h" +#include +#include #include #include #include +#include #include "common/logging.h" #include "io/fs/err_utils.h" @@ -30,7 +33,7 @@ namespace doris::io { namespace { -Status create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name, hdfsFS* fs) { +Status _create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name, hdfsFS* fs) { HDFSCommonBuilder builder; RETURN_IF_ERROR(create_hdfs_builder(hdfs_params, fs_name, &builder)); hdfsFS hdfs_fs = hdfsBuilderConnect(builder.get()); @@ -41,6 +44,38 @@ Status create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name return Status::OK(); } +// https://brpc.apache.org/docs/server/basics/ +// According to the brpc doc, JNI code checks stack layout and cannot be run in +// bthreads so create a pthread for creating hdfs connection if necessary. 
+Status create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name, hdfsFS* fs) { + bool is_pthread = bthread_self() == 0; + LOG(INFO) << "create hfdfs fs, is_pthread=" << is_pthread << " fs_name=" << fs_name; + if (is_pthread) { // running in pthread + return _create_hdfs_fs(hdfs_params, fs_name, fs); + } + + // running in bthread, switch to a pthread and wait + Status st; + auto btx = bthread::butex_create(); + *(int*)btx = 0; + std::thread t([&] { + st = _create_hdfs_fs(hdfs_params, fs_name, fs); + *(int*)btx = 1; + bthread::butex_wake_all(btx); + }); + std::unique_ptr> defer((int*)0x01, [&t, &btx](...) { + if (t.joinable()) t.join(); + bthread::butex_destroy(btx); + }); + timespec tmout {.tv_sec = std::chrono::system_clock::now().time_since_epoch().count() + 60}; + if (int ret = bthread::butex_wait(btx, 1, &tmout); ret != 0) { + std::string msg = "failed to wait _create_hdfs_fs fs_name=" + fs_name; + LOG(WARNING) << msg << " error=" << std::strerror(errno); + st = Status::Error(msg); + } + return st; +} + uint64_t hdfs_hash_code(const THdfsParams& hdfs_params, const std::string& fs_name) { uint64_t hash_code = 0; // The specified fsname is used first. diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index 8be29383c1e9b1e..8b9cbd75ed33b80 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -80,7 +80,7 @@ Status BaseCompaction::execute_compact() { tablet()->set_last_base_compaction_success_time(UnixMillis()); DorisMetrics::instance()->base_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_size); + DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_total_size); return Status::OK(); } diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index 8a830cd25e7d0d9..1e819de7c554d6e 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -80,7 +80,8 @@ Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t .use_page_cache = !config::disable_storage_page_cache, .file_reader = segment->file_reader().get(), .stats = stats, - .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY}, + .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY, + .file_cache_stats = &stats->file_cache_stats}, }; RETURN_IF_ERROR((*column_iterator)->init(opt)); return Status::OK(); @@ -443,7 +444,7 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest RowLocation* row_location, uint32_t version, std::vector>& segment_caches, RowsetSharedPtr* rowset, bool with_rowid, - std::string* encoded_seq_value) { + std::string* encoded_seq_value, OlapReaderStatistics* stats) { SCOPED_BVAR_LATENCY(g_tablet_lookup_rowkey_latency); size_t seq_col_length = 0; // use the latest tablet schema to decide if the tablet has sequence column currently @@ -491,7 +492,7 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest for (auto id : picked_segments) { Status s = segments[id]->lookup_row_key(encoded_key, schema, with_seq_col, with_rowid, - &loc, encoded_seq_value); + &loc, encoded_seq_value, stats); if (s.is()) { continue; } diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index 4aaca77770db0fa..b5da0e3bf06be18 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -155,7 +155,8 @@ class BaseTablet { RowLocation* row_location, uint32_t version, std::vector>& segment_caches, 
RowsetSharedPtr* rowset = nullptr, bool with_rowid = true, - std::string* encoded_seq_value = nullptr); + std::string* encoded_seq_value = nullptr, + OlapReaderStatistics* stats = nullptr); // calc delete bitmap when flush memtable, use a fake version to calc // For example, cur max version is 5, and we use version 6 to calc but @@ -292,6 +293,9 @@ class BaseTablet { Status show_nested_index_file(std::string* json_meta); + TabletUid tablet_uid() const { return _tablet_meta->tablet_uid(); } + TabletInfo get_tablet_info() const { return TabletInfo(tablet_id(), tablet_uid()); } + protected: // Find the missed versions until the spec_version. // diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 14769bc315ad505..ab40e4abde6d197 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -257,10 +257,10 @@ int64_t Compaction::get_avg_segment_rows() { if (meta->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY) { int64_t compaction_goal_size_mbytes = meta->time_series_compaction_goal_size_mbytes(); return (compaction_goal_size_mbytes * 1024 * 1024 * 2) / - (_input_rowsets_size / (_input_row_num + 1) + 1); + (_input_rowsets_data_size / (_input_row_num + 1) + 1); } return config::vertical_compaction_max_segment_size / - (_input_rowsets_size / (_input_row_num + 1) + 1); + (_input_rowsets_data_size / (_input_row_num + 1) + 1); } CompactionMixin::CompactionMixin(StorageEngine& engine, TabletSharedPtr tablet, @@ -305,9 +305,9 @@ Status CompactionMixin::do_compact_ordered_rowsets() { // build output rowset RowsetMetaSharedPtr rowset_meta = std::make_shared(); rowset_meta->set_num_rows(_input_row_num); - rowset_meta->set_total_disk_size(_input_rowsets_size); - rowset_meta->set_data_disk_size(_input_rowsets_size); - rowset_meta->set_index_disk_size(_input_index_size); + rowset_meta->set_total_disk_size(_input_rowsets_data_size + _input_rowsets_index_size); + rowset_meta->set_data_disk_size(_input_rowsets_data_size); + rowset_meta->set_index_disk_size(_input_rowsets_index_size); rowset_meta->set_empty(_input_row_num == 0); rowset_meta->set_num_segments(_input_num_segments); rowset_meta->set_segments_overlap(NONOVERLAPPING); @@ -320,12 +320,13 @@ Status CompactionMixin::do_compact_ordered_rowsets() { void CompactionMixin::build_basic_info() { for (auto& rowset : _input_rowsets) { - _input_rowsets_size += rowset->data_disk_size(); - _input_index_size += rowset->index_disk_size(); + _input_rowsets_data_size += rowset->data_disk_size(); + _input_rowsets_index_size += rowset->index_disk_size(); + _input_rowsets_total_size += rowset->total_disk_size(); _input_row_num += rowset->num_rows(); _input_num_segments += rowset->num_segments(); } - COUNTER_UPDATE(_input_rowsets_data_size_counter, _input_rowsets_size); + COUNTER_UPDATE(_input_rowsets_data_size_counter, _input_rowsets_data_size); COUNTER_UPDATE(_input_row_num_counter, _input_row_num); COUNTER_UPDATE(_input_segments_num_counter, _input_num_segments); @@ -444,8 +445,12 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { << ", disk=" << tablet()->data_dir()->path() << ", segments=" << _input_num_segments << ", input_row_num=" << _input_row_num << ", output_row_num=" << _output_rowset->num_rows() - << ", input_rowset_size=" << _input_rowsets_size - << ", output_rowset_size=" << _output_rowset->data_disk_size() + << ", input_rowsets_data_size=" << _input_rowsets_data_size + << ", input_rowsets_index_size=" << _input_rowsets_index_size + << ", input_rowsets_total_size=" << _input_rowsets_total_size + << 
", output_rowset_data_size=" << _output_rowset->data_disk_size() + << ", output_rowset_index_size=" << _output_rowset->index_disk_size() + << ", output_rowset_total_size=" << _output_rowset->total_disk_size() << ". elapsed time=" << watch.get_elapse_second() << "s."; _state = CompactionState::SUCCESS; return Status::OK(); @@ -467,8 +472,8 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { << ". tablet=" << _tablet->tablet_id() << ", output_version=" << _output_version << ", current_max_version=" << tablet()->max_version().second << ", disk=" << tablet()->data_dir()->path() << ", segments=" << _input_num_segments - << ", input_rowset_size=" << _input_rowsets_size - << ", output_rowset_size=" << _output_rowset->data_disk_size() + << ", input_data_size=" << _input_rowsets_data_size + << ", output_rowset_size=" << _output_rowset->total_disk_size() << ", input_row_num=" << _input_row_num << ", output_row_num=" << _output_rowset->num_rows() << ", filtered_row_num=" << _stats.filtered_rows @@ -514,7 +519,6 @@ Status Compaction::do_inverted_index_compaction() { if (dest_segment_num <= 0) { LOG(INFO) << "skip doing index compaction due to no output segments" << ". tablet=" << _tablet->tablet_id() << ", input row number=" << _input_row_num - << ", output row number=" << _output_rowset->num_rows() << ". elapsed time=" << inverted_watch.get_elapse_second() << "s."; return Status::OK(); } diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index 13a37beca19b23d..06ef4268529247b 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -90,10 +90,11 @@ class Compaction { BaseTabletSPtr _tablet; std::vector _input_rowsets; - int64_t _input_rowsets_size {0}; + int64_t _input_rowsets_data_size {0}; + int64_t _input_rowsets_index_size {0}; + int64_t _input_rowsets_total_size {0}; int64_t _input_row_num {0}; int64_t _input_num_segments {0}; - int64_t _input_index_size {0}; Merger::Statistics _stats; diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index b762468b3455a47..b961c694ede4d0e 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -125,7 +125,8 @@ Status CumulativeCompaction::execute_compact() { tablet()->set_last_cumu_compaction_success_time(UnixMillis()); } DorisMetrics::instance()->cumulative_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->cumulative_compaction_bytes_total->increment(_input_rowsets_size); + DorisMetrics::instance()->cumulative_compaction_bytes_total->increment( + _input_rowsets_total_size); return Status::OK(); } diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 4d5b1ce9add3e0e..80fc440ce36a6db 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -346,6 +346,8 @@ Status DeleteHandler::parse_condition(const std::string& condition_str, TConditi } template + requires(std::is_same_v or + std::is_same_v) Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema, TabletSchemaSPtr delete_pred_related_schema, const RepeatedPtrField& sub_pred_list, @@ -353,10 +355,13 @@ Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema, for (const auto& sub_predicate : sub_pred_list) { TCondition condition; RETURN_IF_ERROR(parse_condition(sub_predicate, &condition)); - int32_t col_unique_id; - if constexpr (std::is_same_v) { - col_unique_id = sub_predicate.col_unique_id; - } else { + int32_t col_unique_id = -1; + if constexpr 
(std::is_same_v) { + if (sub_predicate.has_column_unique_id()) [[likely]] { + col_unique_id = sub_predicate.column_unique_id(); + } + } + if (col_unique_id < 0) { const auto& column = *DORIS_TRY(delete_pred_related_schema->column(condition.column_name)); col_unique_id = column.unique_id(); diff --git a/be/src/olap/delete_handler.h b/be/src/olap/delete_handler.h index cc585c0abcf9f6b..77de62d31d988e1 100644 --- a/be/src/olap/delete_handler.h +++ b/be/src/olap/delete_handler.h @@ -21,6 +21,7 @@ #include #include +#include #include "common/factory_creator.h" #include "common/status.h" @@ -115,6 +116,8 @@ class DeleteHandler { private: template + requires(std::is_same_v or + std::is_same_v) Status _parse_column_pred( TabletSchemaSPtr complete_schema, TabletSchemaSPtr delete_pred_related_schema, const ::google::protobuf::RepeatedPtrField& sub_pred_list, diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index d3554cae15d66a4..9d40ff5a8fad51b 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -17,6 +17,7 @@ #include "olap/primary_key_index.h" +#include #include #include @@ -95,7 +96,8 @@ Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader, // parse primary key index _index_reader.reset(new segment_v2::IndexedColumnReader(file_reader, meta.primary_key_index())); _index_reader->set_is_pk_index(true); - RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false)); + RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false, + _pk_index_load_stats)); _index_parsed = true; return Status::OK(); @@ -107,7 +109,8 @@ Status PrimaryKeyIndexReader::parse_bf(io::FileReaderSPtr file_reader, segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index(); segment_v2::BloomFilterIndexReader bf_index_reader(std::move(file_reader), column_index_meta.bloom_filter_index()); - RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false)); + RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false, + _pk_index_load_stats)); std::unique_ptr bf_iter; RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter)); RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf)); diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h index b5eb13131b73a09..dcbbc5f30625f4e 100644 --- a/be/src/olap/primary_key_index.h +++ b/be/src/olap/primary_key_index.h @@ -25,6 +25,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/indexed_column_reader.h" @@ -97,7 +98,8 @@ class PrimaryKeyIndexBuilder { class PrimaryKeyIndexReader { public: - PrimaryKeyIndexReader() : _index_parsed(false), _bf_parsed(false) {} + PrimaryKeyIndexReader(OlapReaderStatistics* pk_index_load_stats = nullptr) + : _index_parsed(false), _bf_parsed(false), _pk_index_load_stats(pk_index_load_stats) {} ~PrimaryKeyIndexReader() { segment_v2::g_pk_total_bloom_filter_num << -static_cast(_bf_num); @@ -111,9 +113,10 @@ class PrimaryKeyIndexReader { Status parse_bf(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta); - Status new_iterator(std::unique_ptr* index_iterator) const { + Status new_iterator(std::unique_ptr* index_iterator, + OlapReaderStatistics* stats = nullptr) const { DCHECK(_index_parsed); - index_iterator->reset(new 
segment_v2::IndexedColumnIterator(_index_reader.get())); + index_iterator->reset(new segment_v2::IndexedColumnIterator(_index_reader.get(), stats)); return Status::OK(); } @@ -152,6 +155,7 @@ class PrimaryKeyIndexReader { std::unique_ptr _bf; size_t _bf_num = 0; uint64 _bf_bytes = 0; + OlapReaderStatistics* _pk_index_load_stats = nullptr; }; } // namespace doris diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index ee1605a3043daa0..4b51dcc3530476a 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -498,7 +498,7 @@ Status BetaRowset::upload_to(const StorageResource& dest_fs, const RowsetId& new auto st = dest_fs.fs->batch_upload(local_paths, dest_paths); if (st.ok()) { DorisMetrics::instance()->upload_rowset_count->increment(1); - DorisMetrics::instance()->upload_total_byte->increment(data_disk_size()); + DorisMetrics::instance()->upload_total_byte->increment(total_disk_size()); } else { DorisMetrics::instance()->upload_fail_count->increment(1); } diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 3f60e7c5674ae1d..037fae316e91578 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -81,7 +81,7 @@ void build_rowset_meta_with_spec_field(RowsetMeta& rowset_meta, const RowsetMeta& spec_rowset_meta) { rowset_meta.set_num_rows(spec_rowset_meta.num_rows()); rowset_meta.set_total_disk_size(spec_rowset_meta.total_disk_size()); - rowset_meta.set_data_disk_size(spec_rowset_meta.total_disk_size()); + rowset_meta.set_data_disk_size(spec_rowset_meta.data_disk_size()); rowset_meta.set_index_disk_size(spec_rowset_meta.index_disk_size()); // TODO write zonemap to meta rowset_meta.set_empty(spec_rowset_meta.num_rows() == 0); @@ -886,7 +886,8 @@ Status BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch rowset_meta->set_num_segments(segment_num); rowset_meta->set_num_rows(num_rows_written + _num_rows_written); - rowset_meta->set_total_disk_size(total_data_size + _total_data_size); + rowset_meta->set_total_disk_size(total_data_size + _total_data_size + total_index_size + + _total_index_size); rowset_meta->set_data_disk_size(total_data_size + _total_data_size); rowset_meta->set_index_disk_size(total_index_size + _total_index_size); rowset_meta->set_segments_key_bounds(segments_encoded_key_bounds); @@ -955,7 +956,7 @@ Status BaseBetaRowsetWriter::create_inverted_index_file_writer( return Status::OK(); } -Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( +Status BetaRowsetWriter::create_segment_writer_for_segcompaction( std::unique_ptr* writer, int64_t begin, int64_t end) { DCHECK(begin >= 0 && end >= 0); std::string path = BetaRowset::local_segment_path_segcompacted(_context.tablet_path, @@ -995,6 +996,11 @@ Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( RETURN_IF_ERROR(_segcompaction_worker->get_file_writer()->close()); } _segcompaction_worker->get_file_writer().reset(file_writer.release()); + if (auto& idx_file_writer = _segcompaction_worker->get_inverted_index_file_writer(); + idx_file_writer != nullptr) { + RETURN_IF_ERROR(idx_file_writer->close()); + } + _segcompaction_worker->get_inverted_index_file_writer().reset(index_file_writer.release()); return Status::OK(); } @@ -1089,8 +1095,8 @@ Status BetaRowsetWriter::flush_segment_writer_for_segcompaction( SegmentStatistics segstat; segstat.row_num = row_num; - segstat.data_size = segment_size + inverted_index_file_size; 
- segstat.index_size = index_size + inverted_index_file_size; + segstat.data_size = segment_size; + segstat.index_size = inverted_index_file_size; segstat.key_bounds = key_bounds; { std::lock_guard lock(_segid_statistics_map_mutex); diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index 47e12a531e96d5a..d96301af22630d1 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -223,7 +223,6 @@ class BaseBetaRowsetWriter : public RowsetWriter { RETURN_NOT_OK_STATUS_WITH_WARN(_idx_files.close(), "failed to close index file when build new rowset"); this->_total_index_size += _idx_files.get_total_index_size(); - this->_total_data_size += _idx_files.get_total_index_size(); return Status::OK(); } @@ -282,6 +281,8 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { Status flush_segment_writer_for_segcompaction( std::unique_ptr* writer, uint64_t index_size, KeyBoundsPB& key_bounds); + Status create_segment_writer_for_segcompaction( + std::unique_ptr* writer, int64_t begin, int64_t end); bool is_segcompacted() const { return _num_segcompacted > 0; } @@ -292,8 +293,6 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { Status _check_segment_number_limit(size_t segnum) override; int64_t _num_seg() const override; Status _wait_flying_segcompaction(); - Status _create_segment_writer_for_segcompaction( - std::unique_ptr* writer, int64_t begin, int64_t end); Status _segcompaction_if_necessary(); Status _segcompaction_rename_last_segments(); Status _load_noncompacted_segment(segment_v2::SegmentSharedPtr& segment, int32_t segment_id); diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 24e660cd2f72101..e1a2347f6aeaa8b 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -149,7 +149,8 @@ class Rowset : public std::enable_shared_from_this { int64_t start_version() const { return rowset_meta()->version().first; } int64_t end_version() const { return rowset_meta()->version().second; } size_t index_disk_size() const { return rowset_meta()->index_disk_size(); } - size_t data_disk_size() const { return rowset_meta()->total_disk_size(); } + size_t data_disk_size() const { return rowset_meta()->data_disk_size(); } + size_t total_disk_size() const { return rowset_meta()->total_disk_size(); } bool empty() const { return rowset_meta()->empty(); } bool zero_num_rows() const { return rowset_meta()->num_rows() == 0; } size_t num_rows() const { return rowset_meta()->num_rows(); } diff --git a/be/src/olap/rowset/rowset_meta.cpp b/be/src/olap/rowset/rowset_meta.cpp index 1571105fa734713..6bed5e800ede4dd 100644 --- a/be/src/olap/rowset/rowset_meta.cpp +++ b/be/src/olap/rowset/rowset_meta.cpp @@ -226,6 +226,7 @@ void RowsetMeta::merge_rowset_meta(const RowsetMeta& other) { set_data_disk_size(data_disk_size() + other.data_disk_size()); set_total_disk_size(total_disk_size() + other.total_disk_size()); set_index_disk_size(index_disk_size() + other.index_disk_size()); + set_total_disk_size(data_disk_size() + index_disk_size()); for (auto&& key_bound : other.get_segments_key_bounds()) { add_segment_key_bounds(key_bound); } diff --git a/be/src/olap/rowset/segcompaction.cpp b/be/src/olap/rowset/segcompaction.cpp index e5d043d8a224864..f901c786062c952 100644 --- a/be/src/olap/rowset/segcompaction.cpp +++ b/be/src/olap/rowset/segcompaction.cpp @@ -232,7 +232,7 @@ Status SegcompactionWorker::_check_correctness(OlapReaderStatistics& reader_stat Status 
SegcompactionWorker::_create_segment_writer_for_segcompaction( std::unique_ptr* writer, uint32_t begin, uint32_t end) { - return _writer->_create_segment_writer_for_segcompaction(writer, begin, end); + return _writer->create_segment_writer_for_segcompaction(writer, begin, end); } Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPtr segments) { diff --git a/be/src/olap/rowset/segcompaction.h b/be/src/olap/rowset/segcompaction.h index 54c5c3758c20c85..5ec74c0e6609635 100644 --- a/be/src/olap/rowset/segcompaction.h +++ b/be/src/olap/rowset/segcompaction.h @@ -25,6 +25,7 @@ #include "olap/merger.h" #include "olap/simple_rowid_conversion.h" #include "olap/tablet.h" +#include "segment_v2/inverted_index_file_writer.h" #include "segment_v2/segment.h" namespace doris { @@ -69,6 +70,9 @@ class SegcompactionWorker { DeleteBitmapPtr get_converted_delete_bitmap() { return _converted_delete_bitmap; } io::FileWriterPtr& get_file_writer() { return _file_writer; } + InvertedIndexFileWriterPtr& get_inverted_index_file_writer() { + return _inverted_index_file_writer; + } // set the cancel flag, tasks already started will not be cancelled. bool cancel(); @@ -96,6 +100,7 @@ class SegcompactionWorker { // Currently cloud storage engine doesn't need segcompaction BetaRowsetWriter* _writer = nullptr; io::FileWriterPtr _file_writer; + InvertedIndexFileWriterPtr _inverted_index_file_writer = nullptr; // for unique key mow table std::unique_ptr _rowid_conversion = nullptr; diff --git a/be/src/olap/rowset/segment_creator.cpp b/be/src/olap/rowset/segment_creator.cpp index 5f4a3dce7b807e7..e0eb7534123a860 100644 --- a/be/src/olap/rowset/segment_creator.cpp +++ b/be/src/olap/rowset/segment_creator.cpp @@ -225,9 +225,9 @@ Status SegmentFlusher::_flush_segment_writer( if (row_num == 0) { return Status::OK(); } - uint64_t segment_size; - uint64_t index_size; - Status s = writer->finalize(&segment_size, &index_size); + uint64_t segment_file_size; + uint64_t common_index_size; + Status s = writer->finalize(&segment_file_size, &common_index_size); if (!s.ok()) { return Status::Error(s.code(), "failed to finalize segment: {}", s.to_string()); } @@ -249,16 +249,20 @@ Status SegmentFlusher::_flush_segment_writer( uint32_t segment_id = writer->segment_id(); SegmentStatistics segstat; segstat.row_num = row_num; - segstat.data_size = segment_size + inverted_index_file_size; - segstat.index_size = index_size + inverted_index_file_size; + segstat.data_size = segment_file_size; + segstat.index_size = inverted_index_file_size; segstat.key_bounds = key_bounds; + LOG(INFO) << "tablet_id:" << _context.tablet_id + << ", flushing rowset_dir: " << _context.tablet_path + << ", rowset_id:" << _context.rowset_id << ", data size:" << segstat.data_size + << ", index size:" << segstat.index_size; writer.reset(); RETURN_IF_ERROR(_context.segment_collector->add(segment_id, segstat, flush_schema)); if (flush_size) { - *flush_size = segment_size + inverted_index_file_size; + *flush_size = segment_file_size; } return Status::OK(); } @@ -274,9 +278,9 @@ Status SegmentFlusher::_flush_segment_writer(std::unique_ptrfinalize(&segment_size, &index_size); + uint64_t segment_file_size; + uint64_t common_index_size; + Status s = writer->finalize(&segment_file_size, &common_index_size); if (!s.ok()) { return Status::Error(s.code(), "failed to finalize segment: {}", s.to_string()); } @@ -298,16 +302,20 @@ Status SegmentFlusher::_flush_segment_writer(std::unique_ptrget_segment_id(); SegmentStatistics segstat; segstat.row_num = row_num; 
- segstat.data_size = segment_size + inverted_index_file_size; - segstat.index_size = index_size + inverted_index_file_size; + segstat.data_size = segment_file_size; + segstat.index_size = inverted_index_file_size; segstat.key_bounds = key_bounds; + LOG(INFO) << "tablet_id:" << _context.tablet_id + << ", flushing rowset_dir: " << _context.tablet_path + << ", rowset_id:" << _context.rowset_id << ", data size:" << segstat.data_size + << ", index size:" << segstat.index_size; writer.reset(); RETURN_IF_ERROR(_context.segment_collector->add(segment_id, segstat, flush_schema)); if (flush_size) { - *flush_size = segment_size + inverted_index_file_size; + *flush_size = segment_file_size; } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp index 3a1c9f538138f40..609d21ce4f5c224 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp @@ -31,8 +31,10 @@ namespace doris { namespace segment_v2 { -Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory) { +Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { // TODO yyq: implement a new once flag to avoid status construct. + _index_load_stats = index_load_stats; return _load_once.call([this, use_page_cache, kept_in_memory] { return _load(use_page_cache, kept_in_memory); }); @@ -47,7 +49,7 @@ Status BloomFilterIndexReader::_load(bool use_page_cache, bool kept_in_memory) { const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta->bloom_filter(); _bloom_filter_reader.reset(new IndexedColumnReader(_file_reader, bf_index_meta)); - RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory)); + RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory, _index_load_stats)); update_metadata_size(); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h index a10a910b2e1ac4a..fcb0239a2440fa3 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h @@ -47,7 +47,8 @@ class BloomFilterIndexReader : public MetadataAdder { _bloom_filter_index_meta.reset(new BloomFilterIndexPB(bloom_filter_index_meta)); } - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* _bf_index_load_stats = nullptr); BloomFilterAlgorithmPB algorithm() { return _bloom_filter_index_meta->algorithm(); } @@ -69,6 +70,7 @@ class BloomFilterIndexReader : public MetadataAdder { const TypeInfo* _type_info = nullptr; std::unique_ptr _bloom_filter_index_meta = nullptr; std::unique_ptr _bloom_filter_reader; + OlapReaderStatistics* _index_load_stats = nullptr; }; class BloomFilterIndexIterator { diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp index cce35d0b8d63e60..3028211f2661577 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp @@ -62,9 +62,11 @@ int64_t IndexedColumnReader::get_metadata_size() const { return sizeof(IndexedColumnReader) + _meta.ByteSizeLong(); } -Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory) { +Status 
IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { _use_page_cache = use_page_cache; _kept_in_memory = kept_in_memory; + _index_load_stats = index_load_stats; _type_info = get_scalar_type_info((FieldType)_meta.data_type()); if (_type_info == nullptr) { @@ -107,7 +109,7 @@ Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* BlockCompressionCodec* local_compress_codec; RETURN_IF_ERROR(get_block_compression_codec(_meta.compression(), &local_compress_codec)); RETURN_IF_ERROR(read_page(PagePointer(pp), handle, &body, &footer, INDEX_PAGE, - local_compress_codec, false)); + local_compress_codec, false, _index_load_stats)); RETURN_IF_ERROR(reader->parse(body, footer.index_page_footer())); _mem_size += body.get_size(); return Status::OK(); @@ -115,8 +117,10 @@ Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, PageTypePB type, - BlockCompressionCodec* codec, bool pre_decode) const { + BlockCompressionCodec* codec, bool pre_decode, + OlapReaderStatistics* stats) const { OlapReaderStatistics tmp_stats; + OlapReaderStatistics* stats_ptr = stats != nullptr ? stats : &tmp_stats; PageReadOptions opts { .use_page_cache = _use_page_cache, .kept_in_memory = _kept_in_memory, @@ -125,9 +129,10 @@ Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, .file_reader = _file_reader.get(), .page_pointer = pp, .codec = codec, - .stats = &tmp_stats, + .stats = stats_ptr, .encoding_info = _encoding_info, - .io_ctx = io::IOContext {.is_index_data = true}, + .io_ctx = io::IOContext {.is_index_data = true, + .file_cache_stats = &stats_ptr->file_cache_stats}, }; if (_is_pk_index) { opts.type = PRIMARY_KEY_INDEX_PAGE; @@ -154,8 +159,8 @@ Status IndexedColumnIterator::_read_data_page(const PagePointer& pp) { PageHandle handle; Slice body; PageFooterPB footer; - RETURN_IF_ERROR( - _reader->read_page(pp, &handle, &body, &footer, DATA_PAGE, _compress_codec, true)); + RETURN_IF_ERROR(_reader->read_page(pp, &handle, &body, &footer, DATA_PAGE, _compress_codec, + true, _stats)); // parse data page // note that page_index is not used in IndexedColumnIterator, so we pass 0 PageDecoderOptions opts; diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.h b/be/src/olap/rowset/segment_v2/indexed_column_reader.h index 8a57383cd04c36b..c3469f9f6bed0d4 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.h @@ -27,6 +27,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/index_page.h" #include "olap/rowset/segment_v2/page_handle.h" @@ -53,11 +54,13 @@ class IndexedColumnReader : public MetadataAdder { ~IndexedColumnReader(); - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats = nullptr); // read a page specified by `pp' from `file' into `handle' Status read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, - PageTypePB type, BlockCompressionCodec* codec, bool pre_decode) const; + PageTypePB type, BlockCompressionCodec* codec, bool pre_decode, + OlapReaderStatistics* stats = nullptr) const; int64_t num_values() const { return _num_values; } 
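Editor's note: the indexed_column_reader changes above add an optional statistics out-parameter with a local fallback, so existing call sites keep compiling while callers that do pass OlapReaderStatistics get index page reads (including file cache stats) attributed to them rather than to a throwaway temporary. A small self-contained sketch of that fallback pattern, using a placeholder type instead of the real OlapReaderStatistics/PageReadOptions:

    #include <cstdint>

    // Placeholder for OlapReaderStatistics; illustration only.
    struct StatsSketch {
        int64_t pages_read = 0;
    };

    void read_page_sketch(StatsSketch* caller_stats) {
        StatsSketch tmp_stats;  // throwaway stats when the caller passes nullptr
        StatsSketch* stats_ptr = caller_stats != nullptr ? caller_stats : &tmp_stats;
        stats_ptr->pages_read += 1;  // work is charged to the caller's stats when provided
    }
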
const EncodingInfo* encoding_info() const { return _encoding_info; } @@ -97,14 +100,17 @@ class IndexedColumnReader : public MetadataAdder { const KeyCoder* _value_key_coder = nullptr; uint64_t _mem_size = 0; bool _is_pk_index = false; + OlapReaderStatistics* _index_load_stats = nullptr; }; class IndexedColumnIterator { public: - explicit IndexedColumnIterator(const IndexedColumnReader* reader) + explicit IndexedColumnIterator(const IndexedColumnReader* reader, + OlapReaderStatistics* stats = nullptr) : _reader(reader), _ordinal_iter(&reader->_ordinal_index_reader), - _value_iter(&reader->_value_index_reader) {} + _value_iter(&reader->_value_index_reader), + _stats(stats) {} // Seek to the given ordinal entry. Entry 0 is the first entry. // Return Status::Error if provided seek point is past the end. @@ -153,6 +159,7 @@ class IndexedColumnIterator { ordinal_t _current_ordinal = 0; // iterator owned compress codec, should NOT be shared by threads, initialized before used BlockCompressionCodec* _compress_codec = nullptr; + OlapReaderStatistics* _stats = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 469d0b9cf21ba3c..0ad799683fc458e 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -482,7 +482,8 @@ Status Segment::_load_pk_bloom_filter() { }); } -Status Segment::load_pk_index_and_bf() { +Status Segment::load_pk_index_and_bf(OlapReaderStatistics* index_load_stats) { + _pk_index_load_stats = index_load_stats; RETURN_IF_ERROR(load_index()); RETURN_IF_ERROR(_load_pk_bloom_filter()); return Status::OK(); @@ -491,7 +492,7 @@ Status Segment::load_pk_index_and_bf() { Status Segment::load_index() { return _load_index_once.call([this] { if (_tablet_schema->keys_type() == UNIQUE_KEYS && _pk_index_meta != nullptr) { - _pk_index_reader = std::make_unique(); + _pk_index_reader = std::make_unique(_pk_index_load_stats); RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta)); // _meta_mem_usage += _pk_index_reader->get_memory_size(); return Status::OK(); @@ -951,7 +952,7 @@ Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column, Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_schema, bool with_seq_col, bool with_rowid, RowLocation* row_location, - std::string* encoded_seq_value) { + std::string* encoded_seq_value, OlapReaderStatistics* stats) { RETURN_IF_ERROR(load_pk_index_and_bf()); bool has_seq_col = latest_schema->has_sequence_col(); bool has_rowid = !latest_schema->cluster_key_idxes().empty(); @@ -971,7 +972,7 @@ Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_sche } bool exact_match = false; std::unique_ptr index_iterator; - RETURN_IF_ERROR(_pk_index_reader->new_iterator(&index_iterator)); + RETURN_IF_ERROR(_pk_index_reader->new_iterator(&index_iterator, stats)); auto st = index_iterator->seek_at_or_after(&key_without_seq, &exact_match); if (!st.ok() && !st.is()) { return st; diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 24f4230bc247190..bc5ab1e1fdc80ad 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -133,7 +133,8 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd Status lookup_row_key(const Slice& key, const TabletSchema* latest_schema, bool with_seq_col, bool with_rowid, RowLocation* row_location, - std::string* 
encoded_seq_value = nullptr); + std::string* encoded_seq_value = nullptr, + OlapReaderStatistics* stats = nullptr); Status read_key_by_rowid(uint32_t row_id, std::string* key); @@ -143,7 +144,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd Status load_index(); - Status load_pk_index_and_bf(); + Status load_pk_index_and_bf(OlapReaderStatistics* index_load_stats = nullptr); void update_healthy_status(Status new_status) { _healthy_status.update(new_status); } // The segment is loaded into SegmentCache and then will load indices, if there are something wrong @@ -301,6 +302,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd InvertedIndexFileInfo _idx_file_info; int _be_exec_version = BeExecVersionManager::get_newest_version(); + OlapReaderStatistics* _pk_index_load_stats = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 1b7416c5d2dffc3..805d9b65d19c53b 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -1043,8 +1043,10 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock& _opts.rowset_ctx->merged_tablet_schema = _opts.rowset_ctx->tablet_schema; } TabletSchemaSPtr update_schema; + bool check_schema_size = true; RETURN_IF_ERROR(vectorized::schema_util::get_least_common_schema( - {_opts.rowset_ctx->merged_tablet_schema, _flush_schema}, nullptr, update_schema)); + {_opts.rowset_ctx->merged_tablet_schema, _flush_schema}, nullptr, update_schema, + check_schema_size)); CHECK_GE(update_schema->num_columns(), _flush_schema->num_columns()) << "Rowset merge schema columns count is " << update_schema->num_columns() << ", but flush_schema is larger " << _flush_schema->num_columns() diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index fb8f66226858569..46070f8dccd7ce1 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -138,7 +138,6 @@ Status VerticalBetaRowsetWriter::_flush_columns(segment_v2::SegmentWriter* se this->_segment_num_rows.resize(_cur_writer_idx + 1); this->_segment_num_rows[_cur_writer_idx] = _segment_writers[_cur_writer_idx]->row_count(); } - this->_total_index_size += static_cast(index_size); return Status::OK(); } diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index abc82c6f3ee98d3..26ac54c699b81a4 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -17,6 +17,8 @@ #include "olap/segment_loader.h" +#include + #include "common/config.h" #include "common/status.h" #include "olap/olap_define.h" @@ -52,7 +54,8 @@ void SegmentCache::erase(const SegmentCache::CacheKey& key) { Status SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, bool use_cache, - bool need_load_pk_index_and_bf) { + bool need_load_pk_index_and_bf, + OlapReaderStatistics* index_load_stats) { if (cache_handle->is_inited()) { return Status::OK(); } @@ -70,7 +73,7 @@ Status SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, segment_v2::SegmentSharedPtr segment; RETURN_IF_ERROR(rowset->load_segment(i, &segment)); if (need_load_pk_index_and_bf) { - RETURN_IF_ERROR(segment->load_pk_index_and_bf()); + RETURN_IF_ERROR(segment->load_pk_index_and_bf(index_load_stats)); } if (use_cache && 
!config::disable_segment_cache) { // memory of SegmentCache::CacheValue will be handled by SegmentCache diff --git a/be/src/olap/segment_loader.h b/be/src/olap/segment_loader.h index b3b88fa7700409e..834906da93bf740 100644 --- a/be/src/olap/segment_loader.h +++ b/be/src/olap/segment_loader.h @@ -117,7 +117,8 @@ class SegmentLoader { // Load segments of "rowset", return the "cache_handle" which contains segments. // If use_cache is true, it will be loaded from _cache. Status load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, - bool use_cache = false, bool need_load_pk_index_and_bf = false); + bool use_cache = false, bool need_load_pk_index_and_bf = false, + OlapReaderStatistics* index_load_stats = nullptr); void erase_segment(const SegmentCache::CacheKey& key); diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index ef93ab25caeac9f..7470afe0ef62c72 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -149,11 +149,15 @@ Status SingleReplicaCompaction::_do_single_replica_compaction_impl() { LOG(INFO) << "succeed to do single replica compaction" << ". tablet=" << _tablet->tablet_id() << ", output_version=" << _output_version << ", current_max_version=" << current_max_version - << ", input_rowset_size=" << _input_rowsets_size + << ", input_rowsets_data_size=" << _input_rowsets_data_size + << ", input_rowsets_index_size=" << _input_rowsets_index_size + << ", input_rowsets_total_size=" << _input_rowsets_total_size << ", input_row_num=" << _input_row_num << ", input_segments_num=" << _input_num_segments - << ", _input_index_size=" << _input_index_size + << ", _input_index_size=" << _input_rowsets_index_size << ", output_rowset_data_size=" << _output_rowset->data_disk_size() + << ", output_rowset_index_size=" << _output_rowset->index_disk_size() + << ", output_rowset_total_size=" << _output_rowset->total_disk_size() << ", output_row_num=" << _output_rowset->num_rows() << ", output_segments_num=" << _output_rowset->num_segments(); return Status::OK(); @@ -264,10 +268,11 @@ bool SingleReplicaCompaction::_find_rowset_to_fetch(const std::vector& return false; } for (auto& rowset : _input_rowsets) { - _input_rowsets_size += rowset->data_disk_size(); + _input_rowsets_data_size += rowset->data_disk_size(); _input_row_num += rowset->num_rows(); _input_num_segments += rowset->num_segments(); - _input_index_size += rowset->index_disk_size(); + _input_rowsets_index_size += rowset->index_disk_size(); + _input_rowsets_total_size += rowset->data_disk_size() + rowset->index_disk_size(); } _output_version = *proper_version; } diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 450f3d2cb8bab45..b1d4c9dfb891f62 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include "common/compiler_util.h" // IWYU pragma: keep @@ -86,6 +88,7 @@ #include "olap/rowset/beta_rowset.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" #include "olap/rowset/rowset_writer.h" @@ -329,6 +332,7 @@ Status Tablet::init() { // should save tablet meta to remote meta store // if it's a primary replica void Tablet::save_meta() { + check_table_size_correctness(); auto res = 
_tablet_meta->save_meta(_data_dir); CHECK_EQ(res, Status::OK()) << "fail to save tablet_meta. res=" << res << ", root=" << _data_dir->path(); @@ -1201,10 +1205,6 @@ Status Tablet::_contains_version(const Version& version) { return Status::OK(); } -TabletInfo Tablet::get_tablet_info() const { - return TabletInfo(tablet_id(), tablet_uid()); -} - std::vector Tablet::pick_candidate_rowsets_to_cumulative_compaction() { std::vector candidate_rowsets; if (_cumulative_point == K_INVALID_CUMULATIVE_POINT) { @@ -2047,8 +2047,8 @@ Status Tablet::_cooldown_data(RowsetSharedPtr rowset) { LOG(INFO) << "Upload rowset " << old_rowset->version() << " " << new_rowset_id.to_string() << " to " << storage_resource.fs->root_path().native() << ", tablet_id=" << tablet_id() << ", duration=" << duration.count() - << ", capacity=" << old_rowset->data_disk_size() - << ", tp=" << old_rowset->data_disk_size() / duration.count() + << ", capacity=" << old_rowset->total_disk_size() + << ", tp=" << old_rowset->total_disk_size() / duration.count() << ", old rowset_id=" << old_rowset->rowset_id().to_string(); // gen a new rowset @@ -2427,7 +2427,7 @@ RowsetSharedPtr Tablet::need_cooldown(int64_t* cooldown_timestamp, size_t* file_ // current time or it's datatime is less than current time if (newest_cooldown_time != 0 && newest_cooldown_time < UnixSeconds()) { *cooldown_timestamp = newest_cooldown_time; - *file_size = rowset->data_disk_size(); + *file_size = rowset->total_disk_size(); VLOG_DEBUG << "tablet need cooldown, tablet id: " << tablet_id() << " file_size: " << *file_size; return rowset; @@ -2737,4 +2737,120 @@ void Tablet::clear_cache() { } } +void Tablet::check_table_size_correctness() { + if (!config::enable_table_size_correctness_check) { + return; + } + const std::vector& all_rs_metas = _tablet_meta->all_rs_metas(); + for (const auto& rs_meta : all_rs_metas) { + int64_t total_segment_size = get_segment_file_size(rs_meta); + int64_t total_inverted_index_size = get_inverted_index_file_szie(rs_meta); + if (rs_meta->data_disk_size() != total_segment_size || + rs_meta->index_disk_size() != total_inverted_index_size || + rs_meta->data_disk_size() + rs_meta->index_disk_size() != rs_meta->total_disk_size()) { + LOG(WARNING) << "[Local table table size check failed]:" + << " tablet id: " << rs_meta->tablet_id() + << ", rowset id:" << rs_meta->rowset_id() + << ", rowset data disk size:" << rs_meta->data_disk_size() + << ", rowset real data disk size:" << total_segment_size + << ", rowset index disk size:" << rs_meta->index_disk_size() + << ", rowset real index disk size:" << total_inverted_index_size + << ", rowset total disk size:" << rs_meta->total_disk_size() + << ", rowset segment path:" + << StorageResource().remote_segment_path( + rs_meta->tablet_id(), rs_meta->rowset_id().to_string(), 0); + DCHECK(false); + } + } +} + +std::string Tablet::get_segment_path(const RowsetMetaSharedPtr& rs_meta, int64_t seg_id) { + std::string segment_path; + if (rs_meta->is_local()) { + segment_path = local_segment_path(_tablet_path, rs_meta->rowset_id().to_string(), seg_id); + } else { + segment_path = rs_meta->remote_storage_resource().value()->remote_segment_path( + rs_meta->tablet_id(), rs_meta->rowset_id().to_string(), seg_id); + } + return segment_path; +} + +int64_t Tablet::get_segment_file_size(const RowsetMetaSharedPtr& rs_meta) { + const auto& fs = rs_meta->fs(); + if (!fs) { + LOG(WARNING) << "get fs failed, resource_id={}" << rs_meta->resource_id(); + } + int64_t total_segment_size = 0; + for (int64_t seg_id = 0; seg_id < 
rs_meta->num_segments(); seg_id++) { + std::string segment_path = get_segment_path(rs_meta, seg_id); + int64_t segment_file_size = 0; + auto st = fs->file_size(segment_path, &segment_file_size); + if (!st.ok()) { + segment_file_size = 0; + LOG(WARNING) << "table size correctness check get segment size failed! msg:" + << st.to_string() << ", segment path:" << segment_path; + } + total_segment_size += segment_file_size; + } + return total_segment_size; +} + +int64_t Tablet::get_inverted_index_file_szie(const RowsetMetaSharedPtr& rs_meta) { + const auto& fs = rs_meta->fs(); + if (!fs) { + LOG(WARNING) << "get fs failed, resource_id={}" << rs_meta->resource_id(); + } + int64_t total_inverted_index_size = 0; + + if (rs_meta->tablet_schema()->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { + auto indices = rs_meta->tablet_schema()->indexes(); + for (auto& index : indices) { + // only get file_size for inverted index + if (index.index_type() != IndexType::INVERTED) { + continue; + } + for (int seg_id = 0; seg_id < rs_meta->num_segments(); ++seg_id) { + std::string segment_path = get_segment_path(rs_meta, seg_id); + int64_t file_size = 0; + + std::string inverted_index_file_path = + InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_path), + index.index_id(), index.get_index_suffix()); + auto st = fs->file_size(inverted_index_file_path, &file_size); + if (!st.ok()) { + file_size = 0; + LOG(WARNING) << " tablet id: " << get_tablet_info().tablet_id + << ", rowset id:" << rs_meta->rowset_id() + << ", table size correctness check get inverted index v1 " + "size failed! msg:" + << st.to_string() + << ", inverted index path:" << inverted_index_file_path; + } + total_inverted_index_size += file_size; + } + } + } else { + for (int seg_id = 0; seg_id < rs_meta->num_segments(); ++seg_id) { + int64_t file_size = 0; + std::string segment_path = get_segment_path(rs_meta, seg_id); + std::string inverted_index_file_path = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_path)); + auto st = fs->file_size(inverted_index_file_path, &file_size); + if (!st.ok()) { + file_size = 0; + LOG(WARNING) << " tablet id: " << get_tablet_info().tablet_id + << ", rowset id:" << rs_meta->rowset_id() + << ", table size correctness check get inverted index v2 " + "size failed! 
msg:" + << st.to_string() + << ", inverted index path:" << inverted_index_file_path; + } + total_inverted_index_size += file_size; + } + } + return total_inverted_index_size; +} + } // namespace doris diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 2d7975b0fc12d14..f5866c67641581c 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -115,7 +115,6 @@ class Tablet final : public BaseTablet { DataDir* data_dir() const { return _data_dir; } int64_t replica_id() const { return _tablet_meta->replica_id(); } - TabletUid tablet_uid() const { return _tablet_meta->tablet_uid(); } const std::string& tablet_path() const { return _tablet_path; } @@ -279,8 +278,6 @@ class Tablet final : public BaseTablet { void check_tablet_path_exists(); - TabletInfo get_tablet_info() const; - std::vector pick_candidate_rowsets_to_cumulative_compaction(); std::vector pick_candidate_rowsets_to_base_compaction(); std::vector pick_candidate_rowsets_to_full_compaction(); @@ -534,6 +531,10 @@ class Tablet final : public BaseTablet { //////////////////////////////////////////////////////////////////////////// void _clear_cache_by_rowset(const BetaRowsetSharedPtr& rowset); + void check_table_size_correctness(); + std::string get_segment_path(const RowsetMetaSharedPtr& rs_meta, int64_t seg_id); + int64_t get_segment_file_size(const RowsetMetaSharedPtr& rs_meta); + int64_t get_inverted_index_file_szie(const RowsetMetaSharedPtr& rs_meta); public: static const int64_t K_INVALID_CUMULATIVE_POINT = -1; diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 394aeb17b85183e..d56e529e42bf4b4 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -642,7 +642,7 @@ inline size_t TabletMeta::num_rows() const { inline size_t TabletMeta::tablet_footprint() const { size_t total_size = 0; for (auto& rs : _rs_metas) { - total_size += rs->data_disk_size(); + total_size += rs->total_disk_size(); } return total_size; } @@ -651,7 +651,7 @@ inline size_t TabletMeta::tablet_local_size() const { size_t total_size = 0; for (auto& rs : _rs_metas) { if (rs->is_local()) { - total_size += rs->data_disk_size(); + total_size += rs->total_disk_size(); } } return total_size; @@ -661,7 +661,7 @@ inline size_t TabletMeta::tablet_remote_size() const { size_t total_size = 0; for (auto& rs : _rs_metas) { if (!rs->is_local()) { - total_size += rs->data_disk_size(); + total_size += rs->total_disk_size(); } } return total_size; diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index c88a23a0c360cf9..4041afac78ee136 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -747,7 +747,15 @@ void TabletIndex::init_from_thrift(const TOlapTableIndex& index, if (column_idx >= 0) { col_unique_ids[i] = tablet_schema.column(column_idx).unique_id(); } else { - col_unique_ids[i] = -1; + // if column unique id not found by column name, find by column unique id + // column unique id can not bigger than tablet schema column size, if bigger than column size means + // this column is a new column added by light schema change + if (index.__isset.column_unique_ids && + index.column_unique_ids[i] < tablet_schema.num_columns()) { + col_unique_ids[i] = index.column_unique_ids[i]; + } else { + col_unique_ids[i] = -1; + } } } _col_unique_ids = std::move(col_unique_ids); diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp index d0c4b0e45f468ef..05ecfc0401b6d04 100644 --- a/be/src/olap/task/engine_checksum_task.cpp +++ 
b/be/src/olap/task/engine_checksum_task.cpp @@ -93,7 +93,7 @@ Status EngineChecksumTask::_compute_checksum() { } size_t input_size = 0; for (const auto& rowset : input_rowsets) { - input_size += rowset->data_disk_size(); + input_size += rowset->total_disk_size(); } auto res = reader.init(reader_params); diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index 2ecadfa53b09b78..8f8c3f7ad8004e5 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -207,13 +207,12 @@ Status IndexBuilder::update_inverted_index_info() { InvertedIndexStorageFormatPB::V1) { if (_is_drop_op) { VLOG_DEBUG << "data_disk_size:" << input_rowset_meta->data_disk_size() - << " total_disk_size:" << input_rowset_meta->data_disk_size() + << " total_disk_size:" << input_rowset_meta->total_disk_size() << " index_disk_size:" << input_rowset_meta->index_disk_size() << " drop_index_size:" << drop_index_size; rowset_meta->set_total_disk_size(input_rowset_meta->total_disk_size() - drop_index_size); - rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size() - - drop_index_size); + rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size()); rowset_meta->set_index_disk_size(input_rowset_meta->index_disk_size() - drop_index_size); } else { @@ -238,7 +237,7 @@ Status IndexBuilder::update_inverted_index_info() { } rowset_meta->set_total_disk_size(input_rowset_meta->total_disk_size() - total_index_size); - rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size() - total_index_size); + rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size()); rowset_meta->set_index_disk_size(input_rowset_meta->index_disk_size() - total_index_size); } @@ -323,8 +322,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta inverted_index_size += inverted_index_writer->get_index_file_total_size(); } _inverted_index_file_writers.clear(); - output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size() + - inverted_index_size); + output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size()); output_rowset_meta->set_total_disk_size(output_rowset_meta->total_disk_size() + inverted_index_size); output_rowset_meta->set_index_disk_size(output_rowset_meta->index_disk_size() + @@ -383,10 +381,14 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta auto column_name = inverted_index.columns[0]; auto column_idx = output_rowset_schema->field_index(column_name); if (column_idx < 0) { - LOG(WARNING) << "referenced column was missing. " - << "[column=" << column_name << " referenced_column=" << column_idx - << "]"; - continue; + column_idx = + output_rowset_schema->field_index(inverted_index.column_unique_ids[0]); + if (column_idx < 0) { + LOG(WARNING) << "referenced column was missing. 
" + << "[column=" << column_name + << " referenced_column=" << column_idx << "]"; + continue; + } } auto column = output_rowset_schema->column(column_idx); if (!InvertedIndexColumnWriter::check_support_inverted_index(column)) { @@ -489,8 +491,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta } _inverted_index_builders.clear(); _inverted_index_file_writers.clear(); - output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size() + - inverted_index_size); + output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size()); output_rowset_meta->set_total_disk_size(output_rowset_meta->total_disk_size() + inverted_index_size); output_rowset_meta->set_index_disk_size(output_rowset_meta->index_disk_size() + diff --git a/be/src/pipeline/common/agg_utils.h b/be/src/pipeline/common/agg_utils.h index a3cc175b1ed0a25..135bc67712345f9 100644 --- a/be/src/pipeline/common/agg_utils.h +++ b/be/src/pipeline/common/agg_utils.h @@ -80,23 +80,19 @@ using AggregatedMethodVariants = std::variant< vectorized::UInt256, AggDataNullable>>, vectorized::MethodSingleNullableColumn< vectorized::MethodStringNoCache>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>>; + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>>; struct AggregatedDataVariants : public DataVariants { + vectorized::MethodOneNumber, vectorized::DataWithNullKey> { AggregatedDataWithoutKey without_key = nullptr; - template void init(const std::vector& data_types, HashKeyType type) { + bool nullable = data_types.size() == 1 && data_types[0]->is_nullable(); + switch (type) { case HashKeyType::without_key: break; @@ -104,28 +100,28 @@ struct AggregatedDataVariants method_variant.emplace>(); break; case HashKeyType::int8_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int16_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int32_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int32_key_phase2: - emplace_single(); + emplace_single(nullable); break; case HashKeyType::int64_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int64_key_phase2: - emplace_single(); + emplace_single(nullable); break; case HashKeyType::int128_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int256_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::string_key: if (nullable) { @@ -138,24 +134,20 @@ struct AggregatedDataVariants } break; case HashKeyType::fixed64: - method_variant - .emplace, nullable>>( - get_key_sizes(data_types)); + method_variant.emplace>>( + get_key_sizes(data_types)); break; case HashKeyType::fixed128: - method_variant - .emplace, nullable>>( - get_key_sizes(data_types)); + method_variant.emplace>>( + get_key_sizes(data_types)); break; case HashKeyType::fixed136: - method_variant - .emplace, nullable>>( - get_key_sizes(data_types)); + method_variant.emplace>>( + get_key_sizes(data_types)); break; case HashKeyType::fixed256: - method_variant - .emplace, nullable>>( - get_key_sizes(data_types)); + method_variant.emplace>>( + 
get_key_sizes(data_types)); break; default: throw Exception(ErrorCode::INTERNAL_ERROR, diff --git a/be/src/pipeline/common/distinct_agg_utils.h b/be/src/pipeline/common/distinct_agg_utils.h index c7ecbd2142c7f0f..806039d5a36a4b4 100644 --- a/be/src/pipeline/common/distinct_agg_utils.h +++ b/be/src/pipeline/common/distinct_agg_utils.h @@ -72,48 +72,43 @@ using DistinctMethodVariants = std::variant< vectorized::DataWithNullKey>>>, vectorized::MethodSingleNullableColumn>>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>, - vectorized::MethodKeysFixed, false>, - vectorized::MethodKeysFixed, true>>; + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>>; struct DistinctDataVariants : public DataVariants { - template + vectorized::MethodOneNumber, vectorized::DataWithNullKey> { void init(const std::vector& data_types, HashKeyType type) { + bool nullable = data_types.size() == 1 && data_types[0]->is_nullable(); switch (type) { case HashKeyType::serialized: method_variant.emplace>(); break; case HashKeyType::int8_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int16_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int32_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int32_key_phase2: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int64_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int64_key_phase2: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int128_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::int256_key: - emplace_single, nullable>(); + emplace_single>(nullable); break; case HashKeyType::string_key: if (nullable) { @@ -126,23 +121,19 @@ struct DistinctDataVariants } break; case HashKeyType::fixed64: - method_variant.emplace< - vectorized::MethodKeysFixed, nullable>>( + method_variant.emplace>>( get_key_sizes(data_types)); break; case HashKeyType::fixed128: - method_variant.emplace< - vectorized::MethodKeysFixed, nullable>>( + method_variant.emplace>>( get_key_sizes(data_types)); break; case HashKeyType::fixed136: - method_variant.emplace< - vectorized::MethodKeysFixed, nullable>>( + method_variant.emplace>>( get_key_sizes(data_types)); break; case HashKeyType::fixed256: - method_variant.emplace< - vectorized::MethodKeysFixed, nullable>>( + method_variant.emplace>>( get_key_sizes(data_types)); break; default: diff --git a/be/src/pipeline/common/join_utils.h b/be/src/pipeline/common/join_utils.h index 5be3e4af2f374bd..e214d1a52931a90 100644 --- a/be/src/pipeline/common/join_utils.h +++ b/be/src/pipeline/common/join_utils.h @@ -36,43 +36,29 @@ using JoinOpVariants = std::integral_constant, std::integral_constant>; -using SerializedHashTableContext = vectorized::MethodSerialized>; -using I8HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I16HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I32HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I64HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I128HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I256HashTableContext = 
vectorized::PrimaryTypeHashTableContext; -using MethodOneString = vectorized::MethodStringNoCache>; -template -using I64FixedKeyHashTableContext = - vectorized::FixedKeyHashTableContext; - -template -using I128FixedKeyHashTableContext = - vectorized::FixedKeyHashTableContext; +template +using PrimaryTypeHashTableContext = vectorized::MethodOneNumber>>; -template -using I256FixedKeyHashTableContext = - vectorized::FixedKeyHashTableContext; +template +using FixedKeyHashTableContext = vectorized::MethodKeysFixed>>; -template -using I136FixedKeyHashTableContext = - vectorized::FixedKeyHashTableContext; +using SerializedHashTableContext = vectorized::MethodSerialized>; +using MethodOneString = vectorized::MethodStringNoCache>; -using HashTableVariants = - std::variant, - I64FixedKeyHashTableContext, I128FixedKeyHashTableContext, - I128FixedKeyHashTableContext, I256FixedKeyHashTableContext, - I256FixedKeyHashTableContext, I136FixedKeyHashTableContext, - I136FixedKeyHashTableContext, MethodOneString>; +using HashTableVariants = std::variant< + std::monostate, SerializedHashTableContext, PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + FixedKeyHashTableContext, FixedKeyHashTableContext, + FixedKeyHashTableContext, + FixedKeyHashTableContext, MethodOneString>; struct JoinDataVariants { HashTableVariants method_variant; - template void init(const std::vector& data_types, HashKeyType type) { // todo: support single column nullable context switch (type) { @@ -80,69 +66,40 @@ struct JoinDataVariants { method_variant.emplace(); break; case HashKeyType::int8_key: - if (nullable) { - method_variant.emplace>( - get_key_sizes(data_types)); - } else { - method_variant.emplace(); - } + method_variant.emplace>(); break; case HashKeyType::int16_key: - if (nullable) { - method_variant.emplace>( - get_key_sizes(data_types)); - } else { - method_variant.emplace(); - } + method_variant.emplace>(); break; case HashKeyType::int32_key: - if (nullable) { - method_variant.emplace>( - get_key_sizes(data_types)); - } else { - method_variant.emplace(); - } + method_variant.emplace>(); break; case HashKeyType::int64_key: - if (nullable) { - method_variant.emplace>( - get_key_sizes(data_types)); - } else { - method_variant.emplace(); - } + method_variant.emplace>(); break; case HashKeyType::int128_key: - if (nullable) { - method_variant.emplace>( - get_key_sizes(data_types)); - } else { - method_variant.emplace(); - } + method_variant.emplace>(); break; case HashKeyType::int256_key: - if (nullable) { - method_variant.emplace(); - } else { - method_variant.emplace(); - } + method_variant.emplace>(); break; case HashKeyType::string_key: method_variant.emplace(); break; case HashKeyType::fixed64: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; case HashKeyType::fixed128: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; case HashKeyType::fixed136: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; case HashKeyType::fixed256: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; default: diff --git a/be/src/pipeline/common/partition_sort_utils.h b/be/src/pipeline/common/partition_sort_utils.h index 38bc8744dc14623..9317a783ba68bfe 100644 --- a/be/src/pipeline/common/partition_sort_utils.h +++ b/be/src/pipeline/common/partition_sort_utils.h @@ 
-123,57 +123,41 @@ struct PartitionBlocks { using PartitionDataPtr = PartitionBlocks*; using PartitionDataWithStringKey = PHHashMap; using PartitionDataWithShortStringKey = StringHashMap; -using PartitionDataWithUInt8Key = PHHashMap; -using PartitionDataWithUInt16Key = PHHashMap; -using PartitionDataWithUInt32Key = - PHHashMap>; -using PartitionDataWithUInt64Key = - PHHashMap>; -using PartitionDataWithUInt128Key = - PHHashMap>; -using PartitionDataWithUInt256Key = - PHHashMap>; -using PartitionDataWithUInt136Key = - PHHashMap>; + +template +using PartitionData = PHHashMap>; + +template +using PartitionDataSingle = vectorized::MethodOneNumber>; + +template +using PartitionDataSingleNullable = vectorized::MethodSingleNullableColumn< + vectorized::MethodOneNumber>>>; using PartitionedMethodVariants = std::variant< std::monostate, vectorized::MethodSerialized, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, + PartitionDataSingle, PartitionDataSingle, + PartitionDataSingle, PartitionDataSingle, + PartitionDataSingle, PartitionDataSingle, + PartitionDataSingleNullable, + PartitionDataSingleNullable, + PartitionDataSingleNullable, + PartitionDataSingleNullable, + PartitionDataSingleNullable, + PartitionDataSingleNullable, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, vectorized::MethodStringNoCache, vectorized::MethodSingleNullableColumn>>>; struct PartitionedHashMapVariants : public DataVariants { - template + vectorized::MethodOneNumber, vectorized::DataWithNullKey> { void init(const std::vector& data_types, HashKeyType type) { + bool nullable = data_types.size() == 1 && data_types[0]->is_nullable(); switch (type) { case HashKeyType::without_key: { break; @@ -183,27 +167,27 @@ struct PartitionedHashMapVariants break; } case HashKeyType::int8_key: { - emplace_single(); + emplace_single>(nullable); break; } case HashKeyType::int16_key: { - emplace_single(); + emplace_single>(nullable); break; } case HashKeyType::int32_key: { - emplace_single(); + emplace_single>(nullable); break; } case HashKeyType::int64_key: { - emplace_single(); + emplace_single>(nullable); break; } case HashKeyType::int128_key: { - emplace_single(); + emplace_single>(nullable); break; } case HashKeyType::int256_key: { - emplace_single(); + emplace_single>(nullable); break; } case HashKeyType::string_key: { @@ -218,24 +202,20 @@ struct PartitionedHashMapVariants break; } case HashKeyType::fixed64: - method_variant - .emplace>( - get_key_sizes(data_types)); + method_variant.emplace>>( + get_key_sizes(data_types)); break; case HashKeyType::fixed128: - method_variant - .emplace>( - get_key_sizes(data_types)); + method_variant.emplace>>( + get_key_sizes(data_types)); break; case HashKeyType::fixed136: - method_variant - .emplace>( - get_key_sizes(data_types)); + method_variant.emplace>>( + 
get_key_sizes(data_types)); break; case HashKeyType::fixed256: - method_variant - .emplace>( - get_key_sizes(data_types)); + method_variant.emplace>>( + get_key_sizes(data_types)); break; default: throw Exception(ErrorCode::INTERNAL_ERROR, diff --git a/be/src/pipeline/common/set_utils.h b/be/src/pipeline/common/set_utils.h index 014546be124ced7..ed64035fb4289da 100644 --- a/be/src/pipeline/common/set_utils.h +++ b/be/src/pipeline/common/set_utils.h @@ -25,10 +25,9 @@ namespace doris { -template +template using SetFixedKeyHashTableContext = - vectorized::MethodKeysFixed>, - has_null>; + vectorized::MethodKeysFixed>>; template using SetPrimaryTypeHashTableContext = @@ -47,59 +46,84 @@ using SetHashTableVariants = SetPrimaryTypeHashTableContext, SetPrimaryTypeHashTableContext, SetPrimaryTypeHashTableContext, - SetFixedKeyHashTableContext, - SetFixedKeyHashTableContext, - SetFixedKeyHashTableContext, - SetFixedKeyHashTableContext, - SetFixedKeyHashTableContext, - SetFixedKeyHashTableContext, - SetFixedKeyHashTableContext, - SetFixedKeyHashTableContext>; + SetFixedKeyHashTableContext, + SetFixedKeyHashTableContext, + SetFixedKeyHashTableContext, + SetFixedKeyHashTableContext>; struct SetDataVariants { SetHashTableVariants method_variant; - template void init(const std::vector& data_types, HashKeyType type) { + bool nullable = data_types.size() == 1 && data_types[0]->is_nullable(); switch (type) { case HashKeyType::serialized: method_variant.emplace(); break; case HashKeyType::int8_key: - method_variant.emplace>(); + if (nullable) { + method_variant.emplace>( + get_key_sizes(data_types)); + } else { + method_variant.emplace>(); + } break; case HashKeyType::int16_key: - method_variant.emplace>(); + if (nullable) { + method_variant.emplace>( + get_key_sizes(data_types)); + } else { + method_variant.emplace>(); + } break; case HashKeyType::int32_key: - method_variant.emplace>(); + if (nullable) { + method_variant.emplace>( + get_key_sizes(data_types)); + } else { + method_variant.emplace>(); + } break; case HashKeyType::int64_key: - method_variant.emplace>(); + if (nullable) { + method_variant.emplace>( + get_key_sizes(data_types)); + } else { + method_variant.emplace>(); + } break; case HashKeyType::int128_key: - method_variant.emplace>(); + if (nullable) { + method_variant.emplace>( + get_key_sizes(data_types)); + } else { + method_variant.emplace>(); + } break; case HashKeyType::int256_key: - method_variant.emplace>(); + if (nullable) { + method_variant.emplace(); + } else { + method_variant.emplace>(); + } break; case HashKeyType::string_key: method_variant.emplace(); break; case HashKeyType::fixed64: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; case HashKeyType::fixed128: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; case HashKeyType::fixed136: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; case HashKeyType::fixed256: - method_variant.emplace>( + method_variant.emplace>( get_key_sizes(data_types)); break; default: diff --git a/be/src/pipeline/dependency.cpp b/be/src/pipeline/dependency.cpp index 8d82c340e2dc699..5fef018423df25d 100644 --- a/be/src/pipeline/dependency.cpp +++ b/be/src/pipeline/dependency.cpp @@ -34,13 +34,14 @@ namespace doris::pipeline { #include "common/compile_check_begin.h" Dependency* BasicSharedState::create_source_dependency(int operator_id, int node_id, - std::string name) { + const std::string& name) { 
source_deps.push_back(std::make_shared(operator_id, node_id, name + "_DEPENDENCY")); source_deps.back()->set_shared_state(this); return source_deps.back().get(); } -Dependency* BasicSharedState::create_sink_dependency(int dest_id, int node_id, std::string name) { +Dependency* BasicSharedState::create_sink_dependency(int dest_id, int node_id, + const std::string& name) { sink_deps.push_back(std::make_shared(dest_id, node_id, name + "_DEPENDENCY", true)); sink_deps.back()->set_shared_state(this); return sink_deps.back().get(); @@ -105,16 +106,6 @@ std::string RuntimeFilterDependency::debug_string(int indentation_level) { return fmt::to_string(debug_string_buffer); } -Dependency* RuntimeFilterDependency::is_blocked_by(PipelineTask* task) { - std::unique_lock lc(_task_lock); - auto ready = _ready.load(); - if (!ready && task) { - _add_block_task(task); - task->_blocked_dep = this; - } - return ready ? nullptr : this; -} - void RuntimeFilterTimer::call_timeout() { _parent->set_ready(); } diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index a035d57a8379ea0..4cc3aceaeebdfae 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -27,7 +28,6 @@ #include #include "common/logging.h" -#include "concurrentqueue.h" #include "gutil/integral_types.h" #include "pipeline/common/agg_utils.h" #include "pipeline/common/join_utils.h" @@ -81,17 +81,15 @@ struct BasicSharedState { virtual ~BasicSharedState() = default; - Dependency* create_source_dependency(int operator_id, int node_id, std::string name); + Dependency* create_source_dependency(int operator_id, int node_id, const std::string& name); - Dependency* create_sink_dependency(int dest_id, int node_id, std::string name); + Dependency* create_sink_dependency(int dest_id, int node_id, const std::string& name); }; class Dependency : public std::enable_shared_from_this { public: ENABLE_FACTORY_CREATOR(Dependency); - Dependency(int id, int node_id, std::string name) - : _id(id), _node_id(node_id), _name(std::move(name)), _ready(false) {} - Dependency(int id, int node_id, std::string name, bool ready) + Dependency(int id, int node_id, std::string name, bool ready = false) : _id(id), _node_id(node_id), _name(std::move(name)), _ready(ready) {} virtual ~Dependency() = default; @@ -278,8 +276,6 @@ class RuntimeFilterDependency final : public Dependency { : Dependency(id, node_id, name), _runtime_filter(runtime_filter) {} std::string debug_string(int indentation_level = 0) override; - Dependency* is_blocked_by(PipelineTask* task) override; - private: const IRuntimeFilter* _runtime_filter = nullptr; }; @@ -606,8 +602,9 @@ struct HashJoinSharedState : public JoinSharedState { ENABLE_FACTORY_CREATOR(HashJoinSharedState) // mark the join column whether support null eq std::vector is_null_safe_eq_join; + // mark the build hash table whether it needs to store null value - std::vector store_null_in_hash_table; + std::vector serialize_null_into_key; std::shared_ptr arena = std::make_shared(); // maybe share hash table with other fragment instances diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index ccf24d0cb1e21c3..1d1bfbe7d173ab0 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -733,7 +733,7 @@ Status AggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( 
_pool, tnode.agg_node.aggregate_functions[i], tnode.agg_node.__isset.agg_sort_infos ? tnode.agg_node.agg_sort_infos[i] : dummy, - &evaluator)); + tnode.agg_node.grouping_exprs.empty(), &evaluator)); _aggregate_evaluators.push_back(evaluator); } diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index afe9aeab8fdb847..abde34a1d0255bc 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -30,8 +30,10 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); - _blocks_memory_usage = ADD_COUNTER_WITH_LEVEL(_profile, "MemoryUsageBlocks", TUnit::BYTES, 1); - _evaluation_timer = ADD_TIMER(profile(), "EvaluationTime"); + _evaluation_timer = ADD_TIMER(profile(), "GetPartitionBoundTime"); + _compute_agg_data_timer = ADD_TIMER(profile(), "ComputeAggDataTime"); + _compute_partition_by_timer = ADD_TIMER(profile(), "ComputePartitionByTime"); + _compute_order_by_timer = ADD_TIMER(profile(), "ComputeOrderByTime"); return Status::OK(); } @@ -288,35 +290,41 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block } } - for (size_t i = 0; i < _agg_functions_size; - ++i) { //insert _agg_input_columns, execute calculate for its - for (size_t j = 0; j < local_state._agg_expr_ctxs[i].size(); ++j) { - RETURN_IF_ERROR(_insert_range_column( - input_block, local_state._agg_expr_ctxs[i][j], - local_state._shared_state->agg_input_columns[i][j].get(), block_rows)); + { + SCOPED_TIMER(local_state._compute_agg_data_timer); + for (size_t i = 0; i < _agg_functions_size; + ++i) { //insert _agg_input_columns, execute calculate for its + for (size_t j = 0; j < local_state._agg_expr_ctxs[i].size(); ++j) { + RETURN_IF_ERROR(_insert_range_column( + input_block, local_state._agg_expr_ctxs[i][j], + local_state._shared_state->agg_input_columns[i][j].get(), block_rows)); + } } } - //record column idx in block - for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._shared_state->partition_by_column_idxs[i] = result_col_id; + { + SCOPED_TIMER(local_state._compute_partition_by_timer); + for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) { + int result_col_id = -1; + RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( + input_block, &result_col_id)); + DCHECK_GE(result_col_id, 0); + local_state._shared_state->partition_by_column_idxs[i] = result_col_id; + } } - for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; + { + SCOPED_TIMER(local_state._compute_order_by_timer); + for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { + int result_col_id = -1; + RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( + input_block, &result_col_id)); + DCHECK_GE(result_col_id, 0); + local_state._shared_state->ordey_by_column_idxs[i] = 
result_col_id; + } } - int64_t block_mem_usage = input_block->allocated_bytes(); - COUNTER_UPDATE(local_state._memory_used_counter, block_mem_usage); + COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); COUNTER_SET(local_state._peak_memory_usage_counter, local_state._memory_used_counter->value()); - COUNTER_UPDATE(local_state._blocks_memory_usage, block_mem_usage); //TODO: if need improvement, the is a tips to maintain a free queue, //so the memory could reuse, no need to new/delete again; diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index b35354107f671f4..e04b220ee351e7f 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -58,7 +58,9 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; }; diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index 019f95042c2e4a8..2e041ab45d20bf4 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -162,7 +162,10 @@ Status AnalyticLocalState::init(RuntimeState* state, LocalStateInfo& info) { SCOPED_TIMER(_init_timer); _blocks_memory_usage = profile()->AddHighWaterMarkCounter("MemoryUsageBlocks", TUnit::BYTES, "", 1); - _evaluation_timer = ADD_TIMER(profile(), "EvaluationTime"); + _evaluation_timer = ADD_TIMER(profile(), "GetPartitionBoundTime"); + _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); + _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); + _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); return Status::OK(); } @@ -233,12 +236,6 @@ Status AnalyticLocalState::open(RuntimeState* state) { std::placeholders::_1); } } - _executor.insert_result = - std::bind(&AnalyticLocalState::_insert_result_info, this, std::placeholders::_1); - _executor.execute = - std::bind(&AnalyticLocalState::_execute_for_win_func, this, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3, std::placeholders::_4); - _create_agg_status(); return Status::OK(); } @@ -282,6 +279,7 @@ void AnalyticLocalState::_destroy_agg_status() { void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end) { + SCOPED_TIMER(_execute_timer); for (size_t i = 0; i < _agg_functions_size; ++i) { std::vector agg_columns; for (int j = 0; j < _shared_state->agg_input_columns[i].size(); ++j) { @@ -300,6 +298,7 @@ void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t } void AnalyticLocalState::_insert_result_info(int64_t current_block_rows) { + SCOPED_TIMER(_get_result_timer); int64_t current_block_row_pos = _shared_state->input_block_first_row_positions[_output_block_index]; int64_t get_result_start = _shared_state->current_row_position - current_block_row_pos; @@ -344,6 +343,7 @@ void AnalyticLocalState::_insert_result_info(int64_t current_block_rows) { } Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && _window_end_position < current_block_rows) { int64_t range_start, range_end; @@ -367,31 +367,33 @@ Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { // Make sure range_start <= range_end range_start = std::min(range_start, range_end); } - _executor.execute(_partition_by_start.pos, 
_shared_state->partition_by_end.pos, range_start, - range_end); - _executor.insert_result(current_block_rows); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + range_start, range_end); + _insert_result_info(current_block_rows); } return Status::OK(); } Status AnalyticLocalState::_get_next_for_partition(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); if (_next_partition) { - _executor.execute(_partition_by_start.pos, _shared_state->partition_by_end.pos, - _partition_by_start.pos, _shared_state->partition_by_end.pos); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + _partition_by_start.pos, _shared_state->partition_by_end.pos); } - _executor.insert_result(current_block_rows); + _insert_result_info(current_block_rows); return Status::OK(); } Status AnalyticLocalState::_get_next_for_range(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && _window_end_position < current_block_rows) { if (_shared_state->current_row_position >= _order_by_end.pos) { _update_order_by_range(); - _executor.execute(_partition_by_start.pos, _shared_state->partition_by_end.pos, - _order_by_start.pos, _order_by_end.pos); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + _order_by_start.pos, _order_by_end.pos); } - _executor.insert_result(current_block_rows); + _insert_result_info(current_block_rows); } return Status::OK(); } @@ -500,11 +502,13 @@ Status AnalyticSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state RETURN_IF_ERROR(OperatorX::init(tnode, state)); const TAnalyticNode& analytic_node = tnode.analytic_node; size_t agg_size = analytic_node.analytic_functions.size(); - for (int i = 0; i < agg_size; ++i) { vectorized::AggFnEvaluator* evaluator = nullptr; + // Window function treats all NullableAggregateFunction as AlwaysNullable. + // Its behavior is same with executed without group by key. 
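// [Editor's note] Illustrative sketch, not part of the patch: the new
// `without_key` argument threaded into AggFnEvaluator::create() by this change
// is derived from the plan node. Aggregation nodes pass "no GROUP BY
// expressions", while analytic (window) nodes always pass true, so nullable
// aggregate functions behave as AlwaysNullable. The names below are
// hypothetical stand-ins, not Doris classes.
#include <vector>
struct AggNodeLike {
    std::vector<int> grouping_exprs;  // stand-in for tnode.agg_node.grouping_exprs
};
inline bool agg_without_key(const AggNodeLike& agg_node) {
    return agg_node.grouping_exprs.empty();  // no grouping key -> runs "without key"
}
inline bool analytic_without_key() {
    return true;  // window functions always run as if there were no group-by key
}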
+ // https://github.com/apache/doris/pull/40693 RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( - _pool, analytic_node.analytic_functions[i], {}, &evaluator)); + _pool, analytic_node.analytic_functions[i], {}, /*wihout_key*/ true, &evaluator)); _agg_functions.emplace_back(evaluator); } @@ -536,7 +540,7 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block local_state.init_result_columns(); size_t current_block_rows = local_state._shared_state->input_blocks[local_state._output_block_index].rows(); - static_cast(local_state._executor.get_next(current_block_rows)); + RETURN_IF_ERROR(local_state._executor.get_next(current_block_rows)); if (local_state._window_end_position == current_block_rows) { break; } diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index 0080ad5e03c8b02..8f44b77f567e559 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -96,17 +96,15 @@ class AnalyticLocalState final : public PipelineXLocalState std::vector _agg_functions; RuntimeProfile::Counter* _evaluation_timer = nullptr; + RuntimeProfile::Counter* _execute_timer = nullptr; + RuntimeProfile::Counter* _get_next_timer = nullptr; + RuntimeProfile::Counter* _get_result_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _blocks_memory_usage = nullptr; - using vectorized_execute = std::function; using vectorized_get_next = std::function; - using vectorized_get_result = std::function; struct executor { - vectorized_execute execute; vectorized_get_next get_next; - vectorized_get_result insert_result; }; executor _executor; diff --git a/be/src/pipeline/exec/cache_source_operator.cpp b/be/src/pipeline/exec/cache_source_operator.cpp index 2e9b21976f841ae..cace8465fc2d463 100644 --- a/be/src/pipeline/exec/cache_source_operator.cpp +++ b/be/src/pipeline/exec/cache_source_operator.cpp @@ -65,7 +65,7 @@ Status CacheSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { // 3. lookup the cache and find proper slot order hit_cache = QueryCache::instance()->lookup(_cache_key, _version, &_query_cache_handle); - _runtime_profile->add_info_string("HitCache", hit_cache ? 
"1" : "0"); + _runtime_profile->add_info_string("HitCache", std::to_string(hit_cache)); if (hit_cache && !cache_param.force_refresh_query_cache) { _hit_cache_results = _query_cache_handle.get_cache_result(); auto hit_cache_slot_orders = _query_cache_handle.get_cache_slot_orders(); @@ -125,13 +125,16 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* b if (local_state._hit_cache_results == nullptr) { Defer insert_cache([&] { - if (*eos && local_state._need_insert_cache) { - local_state._runtime_profile->add_info_string("InsertCache", "1"); - local_state._global_cache->insert(local_state._cache_key, local_state._version, - local_state._local_cache_blocks, - local_state._slot_orders, - local_state._current_query_cache_bytes); - local_state._local_cache_blocks.clear(); + if (*eos) { + local_state._runtime_profile->add_info_string( + "InsertCache", std::to_string(local_state._need_insert_cache)); + if (local_state._need_insert_cache) { + local_state._global_cache->insert(local_state._cache_key, local_state._version, + local_state._local_cache_blocks, + local_state._slot_orders, + local_state._current_query_cache_bytes); + local_state._local_cache_blocks.clear(); + } } }); @@ -162,7 +165,6 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* b // over the max bytes, pass through the data, no need to do cache local_state._local_cache_blocks.clear(); local_state._need_insert_cache = false; - local_state._runtime_profile->add_info_string("InsertCache", "0"); } else { local_state._local_cache_blocks.emplace_back(std::move(output_block)); } diff --git a/be/src/pipeline/exec/datagen_operator.cpp b/be/src/pipeline/exec/datagen_operator.cpp index 965092b7eef20f2..d400953799e5bbb 100644 --- a/be/src/pipeline/exec/datagen_operator.cpp +++ b/be/src/pipeline/exec/datagen_operator.cpp @@ -70,17 +70,25 @@ Status DataGenSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - Status res = local_state._table_func->get_next(state, block, eos); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, - block->columns())); + { + SCOPED_TIMER(local_state._table_function_execution_timer); + RETURN_IF_ERROR(local_state._table_func->get_next(state, block, eos)); + } + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, + block->columns())); + } local_state.reached_limit(block, eos); - return res; + return Status::OK(); } Status DataGenLocalState::init(RuntimeState* state, LocalStateInfo& info) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); + _table_function_execution_timer = ADD_TIMER(profile(), "TableFunctionExecutionTime"); + _filter_timer = ADD_TIMER(profile(), "FilterTime"); auto& p = _parent->cast(); _table_func = std::make_shared(p._tuple_id, p._tuple_desc); _table_func->set_tuple_desc(p._tuple_desc); diff --git a/be/src/pipeline/exec/datagen_operator.h b/be/src/pipeline/exec/datagen_operator.h index c63ef97bb7a40f6..bada5ec4080d08b 100644 --- a/be/src/pipeline/exec/datagen_operator.h +++ b/be/src/pipeline/exec/datagen_operator.h @@ -44,6 +44,8 @@ class DataGenLocalState final : public PipelineXLocalState<> { private: friend class DataGenSourceOperatorX; std::shared_ptr _table_func; + RuntimeProfile::Counter* _table_function_execution_timer 
= nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; class DataGenSourceOperatorX final : public OperatorX { diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp index 642b669263456d2..bb282fd118e5c0b 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp @@ -354,7 +354,7 @@ Status DistinctStreamingAggOperatorX::init(const TPlanNode& tnode, RuntimeState* RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( _pool, tnode.agg_node.aggregate_functions[i], tnode.agg_node.__isset.agg_sort_infos ? tnode.agg_node.agg_sort_infos[i] : dummy, - &evaluator)); + tnode.agg_node.grouping_exprs.empty(), &evaluator)); _aggregate_evaluators.push_back(evaluator); } diff --git a/be/src/pipeline/exec/exchange_sink_buffer.cpp b/be/src/pipeline/exec/exchange_sink_buffer.cpp index 016802f8f73bd8e..7163299d766f4e8 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.cpp +++ b/be/src/pipeline/exec/exchange_sink_buffer.cpp @@ -235,7 +235,7 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { auto send_callback = request.channel->get_send_callback(id, request.eos); send_callback->cntl_->set_timeout_ms(request.channel->_brpc_timeout_ms); - if (config::exchange_sink_ignore_eovercrowded) { + if (config::execution_ignore_eovercrowded) { send_callback->cntl_->ignore_eovercrowded(); } send_callback->addFailedHandler([&, weak_task_ctx = weak_task_exec_ctx()]( @@ -313,7 +313,7 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { } auto send_callback = request.channel->get_send_callback(id, request.eos); send_callback->cntl_->set_timeout_ms(request.channel->_brpc_timeout_ms); - if (config::exchange_sink_ignore_eovercrowded) { + if (config::execution_ignore_eovercrowded) { send_callback->cntl_->ignore_eovercrowded(); } send_callback->addFailedHandler([&, weak_task_ctx = weak_task_exec_ctx()]( diff --git a/be/src/pipeline/exec/exchange_sink_buffer.h b/be/src/pipeline/exec/exchange_sink_buffer.h index 2ff7a20086470a1..13692532a335a42 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.h +++ b/be/src/pipeline/exec/exchange_sink_buffer.h @@ -195,7 +195,6 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { private: friend class ExchangeSinkLocalState; - void _set_ready_to_finish(bool all_done); phmap::flat_hash_map> _instance_to_package_queue_mutex; diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index 55b0e43c936d5a7..1f91af01aa1f6bb 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -39,11 +39,6 @@ namespace doris::pipeline { #include "common/compile_check_begin.h" -Status ExchangeSinkLocalState::serialize_block(vectorized::Block* src, PBlock* dest, - int num_receivers) { - return _parent->cast().serialize_block(*this, src, dest, num_receivers); -} - bool ExchangeSinkLocalState::transfer_large_data_by_brpc() const { return _parent->cast()._transfer_large_data_by_brpc; } @@ -61,14 +56,10 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _local_sent_rows = ADD_COUNTER(_profile, "LocalSentRows", TUnit::UNIT); _serialize_batch_timer = ADD_TIMER(_profile, "SerializeBatchTime"); _compress_timer = ADD_TIMER(_profile, "CompressTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); - _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); 
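// [Editor's note] Illustrative sketch, not part of the patch: the counters
// renamed or added throughout this change all follow the same pattern --
// register a named timer once in init() and wrap each measured region in a
// scoped RAII guard, which is what the ADD_TIMER/SCOPED_TIMER macros provide.
// The classes below are simplified stand-ins, not the real RuntimeProfile types.
#include <chrono>
#include <cstdint>
struct SimpleCounter {
    int64_t nanos = 0;
};
class SimpleScopedTimer {
public:
    explicit SimpleScopedTimer(SimpleCounter* c)
            : _counter(c), _start(std::chrono::steady_clock::now()) {}
    ~SimpleScopedTimer() {
        // Accumulate the elapsed time into the counter when the scope ends.
        _counter->nanos += std::chrono::duration_cast<std::chrono::nanoseconds>(
                                   std::chrono::steady_clock::now() - _start)
                                   .count();
    }
private:
    SimpleCounter* _counter;
    std::chrono::steady_clock::time_point _start;
};
// Usage sketch: { SimpleScopedTimer t(&distribute_rows_timer); distribute_rows_into_channels(); }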
_local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); _split_block_hash_compute_timer = ADD_TIMER(_profile, "SplitBlockHashComputeTime"); - _split_block_distribute_by_channel_timer = - ADD_TIMER(_profile, "SplitBlockDistributeByChannelTime"); + _distribute_rows_into_channels_timer = ADD_TIMER(_profile, "DistributeRowsIntoChannelsTime"); _blocks_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); - _rows_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "RowsProduced", TUnit::UNIT, 1); _overall_throughput = _profile->add_derived_counter( "OverallThroughput", TUnit::BYTES_PER_SECOND, [this]() { @@ -141,7 +132,7 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { std::mt19937 g(rd()); shuffle(channels.begin(), channels.end(), g); } - int local_size = 0; + size_t local_size = 0; for (int i = 0; i < channels.size(); ++i) { RETURN_IF_ERROR(channels[i]->open(state)); if (channels[i]->is_local()) { @@ -151,6 +142,8 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { } only_local_exchange = local_size == channels.size(); + _rpc_channels_num = channels.size() - local_size; + PUniqueId id; id.set_hi(_state->query_id().hi); id.set_lo(_state->query_id().lo); @@ -389,7 +382,6 @@ void ExchangeSinkOperatorX::_handle_eof_channel(RuntimeState* state, ChannelPtrT Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = get_local_state(state); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); - COUNTER_UPDATE(local_state.rows_sent_counter(), (int64_t)block->rows()); SCOPED_TIMER(local_state.exec_time_counter()); bool all_receiver_eof = true; for (auto& channel : local_state.channels) { @@ -431,14 +423,15 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block { bool serialized = false; RETURN_IF_ERROR(local_state._serializer.next_serialized_block( - block, block_holder->get_block(), local_state.channels.size(), &serialized, - eos)); + block, block_holder->get_block(), local_state._rpc_channels_num, + &serialized, eos)); if (serialized) { auto cur_block = local_state._serializer.get_block()->to_block(); if (!cur_block.empty()) { + DCHECK(eos || local_state._serializer.is_local()) << debug_string(state, 0); RETURN_IF_ERROR(local_state._serializer.serialize_block( &cur_block, block_holder->get_block(), - local_state.channels.size())); + local_state._rpc_channels_num)); } else { block_holder->reset_block(); } @@ -504,10 +497,12 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block old_channel_mem_usage += channel->mem_usage(); } if (_part_type == TPartitionType::HASH_PARTITIONED) { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); RETURN_IF_ERROR(channel_add_rows( state, local_state.channels, local_state._partition_count, local_state._partitioner->get_channel_ids().get(), rows, block, eos)); } else { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); RETURN_IF_ERROR(channel_add_rows( state, local_state.channels, local_state._partition_count, local_state._partitioner->get_channel_ids().get(), rows, block, eos)); @@ -556,10 +551,13 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block local_state._row_distribution._deal_batched = true; RETURN_IF_ERROR(local_state._send_new_partition_batch()); } - // the convert_block maybe different with block after execute exprs - // when send data we still use block - RETURN_IF_ERROR(channel_add_rows_with_idx(state, 
local_state.channels, num_channels, - channel2rows, block, eos)); + { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); + // the convert_block maybe different with block after execute exprs + // when send data we still use block + RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, num_channels, + channel2rows, block, eos)); + } int64_t new_channel_mem_usage = 0; for (const auto& channel : local_state.channels) { new_channel_mem_usage += channel->mem_usage(); @@ -579,8 +577,12 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block } std::vector> assignments = local_state.scale_writer_partitioning_exchanger->accept(block); - RETURN_IF_ERROR(channel_add_rows_with_idx( - state, local_state.channels, local_state.channels.size(), assignments, block, eos)); + { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); + RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, + local_state.channels.size(), assignments, + block, eos)); + } int64_t new_channel_mem_usage = 0; for (const auto& channel : local_state.channels) { @@ -635,24 +637,6 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block return final_st; } -Status ExchangeSinkOperatorX::serialize_block(ExchangeSinkLocalState& state, vectorized::Block* src, - PBlock* dest, int num_receivers) { - { - SCOPED_TIMER(state.serialize_batch_timer()); - dest->Clear(); - size_t uncompressed_bytes = 0; - size_t compressed_bytes = 0; - RETURN_IF_ERROR(src->serialize(_state->be_exec_version(), dest, &uncompressed_bytes, - &compressed_bytes, _compression_type, - _transfer_large_data_by_brpc)); - COUNTER_UPDATE(state.bytes_sent_counter(), compressed_bytes * num_receivers); - COUNTER_UPDATE(state.uncompressed_bytes_counter(), uncompressed_bytes * num_receivers); - COUNTER_UPDATE(state.compress_timer(), src->get_compress_time()); - } - - return Status::OK(); -} - void ExchangeSinkLocalState::register_channels(pipeline::ExchangeSinkBuffer* buffer) { for (auto& channel : channels) { channel->register_exchange_buffer(buffer); diff --git a/be/src/pipeline/exec/exchange_sink_operator.h b/be/src/pipeline/exec/exchange_sink_operator.h index 141693eb820f4a5..63d502900054703 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.h +++ b/be/src/pipeline/exec/exchange_sink_operator.h @@ -77,27 +77,13 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { Status open(RuntimeState* state) override; Status close(RuntimeState* state, Status exec_status) override; Dependency* finishdependency() override { return _finish_dependency.get(); } - Status serialize_block(vectorized::Block* src, PBlock* dest, int num_receivers = 1); void register_channels(pipeline::ExchangeSinkBuffer* buffer); - RuntimeProfile::Counter* brpc_wait_timer() { return _brpc_wait_timer; } RuntimeProfile::Counter* blocks_sent_counter() { return _blocks_sent_counter; } - RuntimeProfile::Counter* rows_sent_counter() { return _rows_sent_counter; } RuntimeProfile::Counter* local_send_timer() { return _local_send_timer; } RuntimeProfile::Counter* local_bytes_send_counter() { return _local_bytes_send_counter; } RuntimeProfile::Counter* local_sent_rows() { return _local_sent_rows; } - RuntimeProfile::Counter* brpc_send_timer() { return _brpc_send_timer; } - RuntimeProfile::Counter* serialize_batch_timer() { return _serialize_batch_timer; } - RuntimeProfile::Counter* split_block_distribute_by_channel_timer() { - return _split_block_distribute_by_channel_timer; - } - 
RuntimeProfile::Counter* bytes_sent_counter() { return _bytes_sent_counter; } - RuntimeProfile::Counter* split_block_hash_compute_timer() { - return _split_block_hash_compute_timer; - } RuntimeProfile::Counter* merge_block_timer() { return _merge_block_timer; } - RuntimeProfile::Counter* compress_timer() { return _compress_timer; } - RuntimeProfile::Counter* uncompressed_bytes_counter() { return _uncompressed_bytes_counter; } [[nodiscard]] bool transfer_large_data_by_brpc() const; bool is_finished() const override { return _reach_limit.load(); } void set_reach_limit() { _reach_limit = true; }; @@ -129,16 +115,13 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { std::unique_ptr _sink_buffer = nullptr; RuntimeProfile::Counter* _serialize_batch_timer = nullptr; RuntimeProfile::Counter* _compress_timer = nullptr; - RuntimeProfile::Counter* _brpc_send_timer = nullptr; - RuntimeProfile::Counter* _brpc_wait_timer = nullptr; RuntimeProfile::Counter* _bytes_sent_counter = nullptr; RuntimeProfile::Counter* _uncompressed_bytes_counter = nullptr; RuntimeProfile::Counter* _local_sent_rows = nullptr; RuntimeProfile::Counter* _local_send_timer = nullptr; RuntimeProfile::Counter* _split_block_hash_compute_timer = nullptr; - RuntimeProfile::Counter* _split_block_distribute_by_channel_timer = nullptr; + RuntimeProfile::Counter* _distribute_rows_into_channels_timer = nullptr; RuntimeProfile::Counter* _blocks_sent_counter = nullptr; - RuntimeProfile::Counter* _rows_sent_counter = nullptr; // Throughput per total time spent in sender RuntimeProfile::Counter* _overall_throughput = nullptr; // Used to counter send bytes under local data exchange @@ -153,6 +136,7 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { int _sender_id; std::shared_ptr _broadcast_pb_mem_limiter; + size_t _rpc_channels_num = 0; vectorized::BlockSerializer _serializer; std::shared_ptr _queue_dependency = nullptr; @@ -221,8 +205,6 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorXcreate_merger( local_state.vsort_exec_exprs.lhs_ordering_expr_ctxs(), _is_asc_order, _nulls_first, state->batch_size(), _limit, _offset)); local_state.is_ready = true; return Status::OK(); } - auto status = local_state.stream_recvr->get_next(block, eos); - RETURN_IF_ERROR(doris::vectorized::VExprContext::filter_block(local_state.conjuncts(), block, - block->columns())); + { + SCOPED_TIMER(local_state.get_data_from_recvr_timer); + RETURN_IF_ERROR(local_state.stream_recvr->get_next(block, eos)); + } + { + SCOPED_TIMER(local_state.filter_timer); + RETURN_IF_ERROR(doris::vectorized::VExprContext::filter_block(local_state.conjuncts(), + block, block->columns())); + } // In vsortrunmerger, it will set eos=true, and block not empty // so that eos==true, could not make sure that block not have valid data if (!*eos || block->rows() > 0) { @@ -176,7 +187,7 @@ Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block local_state.set_num_rows_returned(_limit); } } - return status; + return Status::OK(); } Status ExchangeLocalState::close(RuntimeState* state) { diff --git a/be/src/pipeline/exec/exchange_source_operator.h b/be/src/pipeline/exec/exchange_source_operator.h index c8ef674d2698538..f938f5007d16430 100644 --- a/be/src/pipeline/exec/exchange_source_operator.h +++ b/be/src/pipeline/exec/exchange_source_operator.h @@ -59,6 +59,9 @@ class ExchangeLocalState final : public PipelineXLocalState<> { std::vector> deps; std::vector metrics; + RuntimeProfile::Counter* get_data_from_recvr_timer = 
nullptr; + RuntimeProfile::Counter* filter_timer = nullptr; + RuntimeProfile::Counter* create_merger_timer = nullptr; }; class ExchangeSourceOperatorX final : public OperatorX { diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp index e0171b41ab1ee8b..9f99d55d3ea9893 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp @@ -64,6 +64,7 @@ Status GroupCommitBlockSinkLocalState::open(RuntimeState* state) { } Status GroupCommitBlockSinkLocalState::_initialize_load_queue() { + SCOPED_TIMER(_init_load_queue_timer); auto& p = _parent->cast(); if (_state->exec_env()->wal_mgr()->is_running()) { RETURN_IF_ERROR(_state->exec_env()->group_commit_mgr()->get_first_block_load_queue( @@ -238,6 +239,17 @@ Status GroupCommitBlockSinkLocalState::_add_blocks(RuntimeState* state, return Status::OK(); } +Status GroupCommitBlockSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _init_load_queue_timer = ADD_TIMER(_profile, "InitLoadQueueTime"); + _valid_and_convert_block_timer = ADD_TIMER(_profile, "ValidAndConvertBlockTime"); + _find_partition_timer = ADD_TIMER(_profile, "FindPartitionTime"); + _append_blocks_timer = ADD_TIMER(_profile, "AppendBlocksTime"); + return Status::OK(); +} + Status GroupCommitBlockSinkOperatorX::init(const TDataSink& t_sink) { RETURN_IF_ERROR(Base::init(t_sink)); DCHECK(t_sink.__isset.olap_table_sink); @@ -318,10 +330,15 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc std::shared_ptr block; bool has_filtered_rows = false; - RETURN_IF_ERROR(local_state._block_convertor->validate_and_convert_block( - state, input_block, block, local_state._output_vexpr_ctxs, rows, has_filtered_rows)); + { + SCOPED_TIMER(local_state._valid_and_convert_block_timer); + RETURN_IF_ERROR(local_state._block_convertor->validate_and_convert_block( + state, input_block, block, local_state._output_vexpr_ctxs, rows, + has_filtered_rows)); + } local_state._has_filtered_rows = false; if (!local_state._vpartition->is_auto_partition()) { + SCOPED_TIMER(local_state._find_partition_timer); //reuse vars for find_partition local_state._partitions.assign(rows, nullptr); local_state._filter_bitmap.Reset(rows); @@ -351,23 +368,26 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc } } } - - if (local_state._block_convertor->num_filtered_rows() > 0 || local_state._has_filtered_rows) { - auto cloneBlock = block->clone_without_columns(); - auto res_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); - for (int i = 0; i < rows; ++i) { - if (local_state._block_convertor->filter_map()[i]) { - continue; - } - if (local_state._filter_bitmap.Get(i)) { - continue; + { + SCOPED_TIMER(local_state._append_blocks_timer); + if (local_state._block_convertor->num_filtered_rows() > 0 || + local_state._has_filtered_rows) { + auto cloneBlock = block->clone_without_columns(); + auto res_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + for (int i = 0; i < rows; ++i) { + if (local_state._block_convertor->filter_map()[i]) { + continue; + } + if (local_state._filter_bitmap.Get(i)) { + continue; + } + res_block.add_row(block.get(), i); } - res_block.add_row(block.get(), i); + block->swap(res_block.to_block()); } - block->swap(res_block.to_block()); + // add block 
into block queue + RETURN_IF_ERROR(local_state._add_block(state, block)); } - // add block into block queue - RETURN_IF_ERROR(local_state._add_block(state, block)); return wind_up(); } diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.h b/be/src/pipeline/exec/group_commit_block_sink_operator.h index 32ca0613652ae47..e469aee8df595c0 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.h +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.h @@ -42,8 +42,8 @@ class GroupCommitBlockSinkLocalState final : public PipelineXSinkLocalState dependencies() const override { @@ -79,6 +79,11 @@ class GroupCommitBlockSinkLocalState final : public PipelineXSinkLocalState _finish_dependency; std::shared_ptr _create_plan_dependency = nullptr; std::shared_ptr _put_block_dependency = nullptr; + + RuntimeProfile::Counter* _init_load_queue_timer = nullptr; + RuntimeProfile::Counter* _valid_and_convert_block_timer = nullptr; + RuntimeProfile::Counter* _find_partition_timer = nullptr; + RuntimeProfile::Counter* _append_blocks_timer = nullptr; }; class GroupCommitBlockSinkOperatorX final diff --git a/be/src/pipeline/exec/group_commit_scan_operator.cpp b/be/src/pipeline/exec/group_commit_scan_operator.cpp index 9577639813a7604..141a5e7bf770c56 100644 --- a/be/src/pipeline/exec/group_commit_scan_operator.cpp +++ b/be/src/pipeline/exec/group_commit_scan_operator.cpp @@ -31,6 +31,7 @@ GroupCommitOperatorX::GroupCommitOperatorX(ObjectPool* pool, const TPlanNode& tn Status GroupCommitOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state.exec_time_counter()); bool find_node = false; while (!find_node && !*eos) { RETURN_IF_ERROR(local_state.load_block_queue->get_block(state, block, &find_node, eos, diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 5ead4ba13a389c1..37de9ac93d839f5 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -43,7 +43,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_state->join_op_variants = p._join_op_variants; _shared_state->is_null_safe_eq_join = p._is_null_safe_eq_join; - _shared_state->store_null_in_hash_table = p._store_null_in_hash_table; + _shared_state->serialize_null_into_key = p._serialize_null_into_key; _build_expr_ctxs.resize(p._build_expr_ctxs.size()); for (size_t i = 0; i < _build_expr_ctxs.size(); i++) { RETURN_IF_ERROR(p._build_expr_ctxs[i]->clone(state, _build_expr_ctxs[i])); @@ -51,19 +51,19 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_state->build_exprs_size = _build_expr_ctxs.size(); _should_build_hash_table = true; + profile()->add_info_string("BroadcastJoin", std::to_string(p._is_broadcast_join)); if (p._is_broadcast_join) { - profile()->add_info_string("BroadcastJoin", "true"); if (state->enable_share_hash_table_for_broadcast_join()) { _should_build_hash_table = info.task_idx == 0; if (_should_build_hash_table) { - profile()->add_info_string("ShareHashTableEnabled", "true"); p._shared_hashtable_controller->set_builder_and_consumers( state->fragment_instance_id(), p.node_id()); } - } else { - profile()->add_info_string("ShareHashTableEnabled", "false"); } } + profile()->add_info_string("BuildShareHashTable", std::to_string(_should_build_hash_table)); + profile()->add_info_string("ShareHashTableEnabled", + 
std::to_string(state->enable_share_hash_table_for_broadcast_join())); if (!_should_build_hash_table) { _dependency->block(); _finish_dependency->block(); @@ -72,6 +72,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _finish_dependency->shared_from_this()); } + _runtime_filter_init_timer = ADD_TIMER(profile(), "RuntimeFilterInitTime"); _build_blocks_memory_usage = ADD_COUNTER_WITH_LEVEL(profile(), "MemoryUsageBuildBlocks", TUnit::BYTES, 1); _hash_table_memory_usage = @@ -81,13 +82,10 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo // Build phase auto* record_profile = _should_build_hash_table ? profile() : faker_runtime_profile(); - _build_table_timer = ADD_TIMER(profile(), "BuildTableTime"); - _build_side_merge_block_timer = ADD_TIMER(profile(), "BuildSideMergeBlockTime"); + _build_table_timer = ADD_TIMER(profile(), "BuildHashTableTime"); + _build_side_merge_block_timer = ADD_TIMER(profile(), "MergeBuildBlockTime"); _build_table_insert_timer = ADD_TIMER(record_profile, "BuildTableInsertTime"); _build_expr_call_timer = ADD_TIMER(record_profile, "BuildExprCallTime"); - _build_side_compute_hash_timer = ADD_TIMER(record_profile, "BuildSideHashComputingTime"); - - _allocate_resource_timer = ADD_TIMER(profile(), "AllocateResourceTime"); // Hash Table Init RETURN_IF_ERROR(_hash_table_init(state)); @@ -227,33 +225,22 @@ Status HashJoinBuildSinkLocalState::_extract_join_column( vectorized::Block& block, vectorized::ColumnUInt8::MutablePtr& null_map, vectorized::ColumnRawPtrs& raw_ptrs, const std::vector& res_col_ids) { auto& shared_state = *_shared_state; - auto& p = _parent->cast(); for (size_t i = 0; i < shared_state.build_exprs_size; ++i) { - if (p._should_convert_to_nullable[i]) { + const auto* column = block.get_by_position(res_col_ids[i]).column.get(); + if (!column->is_nullable() && shared_state.serialize_null_into_key[i]) { _key_columns_holder.emplace_back( vectorized::make_nullable(block.get_by_position(res_col_ids[i]).column)); raw_ptrs[i] = _key_columns_holder.back().get(); - continue; - } - - if (shared_state.is_null_safe_eq_join[i]) { - raw_ptrs[i] = block.get_by_position(res_col_ids[i]).column.get(); + } else if (const auto* nullable = check_and_get_column(*column); + !shared_state.serialize_null_into_key[i] && nullable) { + // update nulllmap and split nested out of ColumnNullable when serialize_null_into_key is false and column is nullable + const auto& col_nested = nullable->get_nested_column(); + const auto& col_nullmap = nullable->get_null_map_data(); + DCHECK(null_map != nullptr); + vectorized::VectorizedUtils::update_null_map(null_map->get_data(), col_nullmap); + raw_ptrs[i] = &col_nested; } else { - const auto* column = block.get_by_position(res_col_ids[i]).column.get(); - if (const auto* nullable = check_and_get_column(*column)) { - const auto& col_nested = nullable->get_nested_column(); - const auto& col_nullmap = nullable->get_null_map_data(); - - if (shared_state.store_null_in_hash_table[i]) { - raw_ptrs[i] = nullable; - } else { - DCHECK(null_map != nullptr); - vectorized::VectorizedUtils::update_null_map(null_map->get_data(), col_nullmap); - raw_ptrs[i] = &col_nested; - } - } else { - raw_ptrs[i] = column; - } + raw_ptrs[i] = column; } } return Status::OK(); @@ -267,7 +254,6 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, if (UNLIKELY(rows == 0)) { return Status::OK(); } - COUNTER_UPDATE(_build_rows_counter, rows); block.replace_if_overflow(); 
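// [Editor's note] Illustrative sketch, not part of the patch: the build path
// rewritten in this hunk replaces the old "_set_build_ignore_flag" with
// "_set_build_side_has_external_nullmap" -- an external null map is allocated
// only when the build side short-circuits on null, or when some join key
// column is nullable and its nulls are NOT serialized into the key. The
// struct and function below are hypothetical stand-ins for that rule.
#include <vector>
struct BuildKeyColumn {
    bool is_nullable = false;
    bool serialize_null_into_key = false;
};
inline bool build_side_needs_external_nullmap(const std::vector<BuildKeyColumn>& keys,
                                              bool short_circuit_for_null_in_build_side) {
    if (short_circuit_for_null_in_build_side) {
        return true;  // nulls on the build side end the join early, so they must be tracked
    }
    for (const auto& key : keys) {
        if (key.is_nullable && !key.serialize_null_into_key) {
            return true;  // null rows are filtered via the null map instead of the key
        }
    }
    return false;
}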
vectorized::ColumnRawPtrs raw_ptrs(_build_expr_ctxs.size()); @@ -284,13 +270,9 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, .data()[0] = 1; } } - // TODO: Now we are not sure whether a column is nullable only by ExecNode's `row_desc` - // so we have to initialize this flag by the first build block. - if (!_has_set_need_null_map_for_build) { - _has_set_need_null_map_for_build = true; - _set_build_ignore_flag(block, _build_col_ids); - } - if (p._short_circuit_for_null_in_build_side || _build_side_ignore_null) { + + _set_build_side_has_external_nullmap(block, _build_col_ids); + if (_build_side_has_external_nullmap) { null_map_val = vectorized::ColumnUInt8::create(); null_map_val->get_data().assign(rows, (uint8_t)0); } @@ -300,27 +282,23 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, st = std::visit( vectorized::Overload { - [&](std::monostate& arg, auto join_op, auto has_null_value, + [&](std::monostate& arg, auto join_op, auto short_circuit_for_null_in_build_side, auto with_other_conjuncts) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); return Status::OK(); }, - [&](auto&& arg, auto&& join_op, auto has_null_value, - auto short_circuit_for_null_in_build_side, + [&](auto&& arg, auto&& join_op, auto short_circuit_for_null_in_build_side, auto with_other_conjuncts) -> Status { using HashTableCtxType = std::decay_t; using JoinOpType = std::decay_t; ProcessHashTableBuild hash_table_build_process( rows, raw_ptrs, this, state->batch_size(), state); auto st = hash_table_build_process.template run< - JoinOpType::value, has_null_value, - short_circuit_for_null_in_build_side, with_other_conjuncts>( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? &null_map_val->get_data() - : nullptr, + JoinOpType::value, short_circuit_for_null_in_build_side, + with_other_conjuncts>( + arg, null_map_val ? 
&null_map_val->get_data() : nullptr, &_shared_state->_has_null_in_build_side); COUNTER_SET(_memory_used_counter, _build_blocks_memory_usage->value() + @@ -330,22 +308,24 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, return st; }}, _shared_state->hash_table_variants->method_variant, _shared_state->join_op_variants, - vectorized::make_bool_variant(_build_side_ignore_null), vectorized::make_bool_variant(p._short_circuit_for_null_in_build_side), vectorized::make_bool_variant((p._have_other_join_conjunct))); return st; } -void HashJoinBuildSinkLocalState::_set_build_ignore_flag(vectorized::Block& block, - const std::vector& res_col_ids) { +void HashJoinBuildSinkLocalState::_set_build_side_has_external_nullmap( + vectorized::Block& block, const std::vector& res_col_ids) { auto& p = _parent->cast(); + if (p._short_circuit_for_null_in_build_side) { + _build_side_has_external_nullmap = true; + return; + } for (size_t i = 0; i < _build_expr_ctxs.size(); ++i) { - if (!_shared_state->is_null_safe_eq_join[i] && !p._short_circuit_for_null_in_build_side) { - const auto* column = block.get_by_position(res_col_ids[i]).column.get(); - if (check_and_get_column(*column)) { - _build_side_ignore_null |= !_shared_state->store_null_in_hash_table[i]; - } + const auto* column = block.get_by_position(res_col_ids[i]).column.get(); + if (column->is_nullable() && !_shared_state->serialize_null_into_key[i]) { + _build_side_has_external_nullmap = true; + return; } } } @@ -359,7 +339,7 @@ Status HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { /// For 'null safe equal' join, /// the build key column maybe be converted to nullable from non-nullable. - if (p._should_convert_to_nullable[i]) { + if (p._serialize_null_into_key[i]) { data_type = vectorized::make_nullable(data_type); } data_types.emplace_back(std::move(data_type)); @@ -393,10 +373,6 @@ Status HashJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* st _hash_output_slot_ids = tnode.hash_join_node.hash_output_slot_ids; } - const bool build_stores_null = _join_op == TJoinOp::RIGHT_OUTER_JOIN || - _join_op == TJoinOp::FULL_OUTER_JOIN || - _join_op == TJoinOp::RIGHT_ANTI_JOIN; - const std::vector& eq_join_conjuncts = tnode.hash_join_node.eq_join_conjuncts; for (const auto& eq_join_conjunct : eq_join_conjuncts) { vectorized::VExprContextSPtr build_ctx; @@ -430,16 +406,18 @@ Status HashJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* st (eq_join_conjunct.right.nodes[0].is_nullable || eq_join_conjunct.left.nodes[0].is_nullable); - const bool should_convert_to_nullable = is_null_safe_equal && - !eq_join_conjunct.right.nodes[0].is_nullable && - eq_join_conjunct.left.nodes[0].is_nullable; _is_null_safe_eq_join.push_back(is_null_safe_equal); - _should_convert_to_nullable.emplace_back(should_convert_to_nullable); - // if is null aware, build join column and probe join column both need dispose null value - _store_null_in_hash_table.emplace_back( - is_null_safe_equal || - (_build_expr_ctxs.back()->root()->is_nullable() && build_stores_null)); + if (eq_join_conjuncts.size() == 1) { + // single column key serialize method must use nullmap for represent null to instead serialize null into key + _serialize_null_into_key.emplace_back(false); + } else if (is_null_safe_equal) { + // use serialize null into key to represent multi column null value + _serialize_null_into_key.emplace_back(true); + } else { + // on normal conditions, because null!=null, it can be expressed directly with nullmap. 
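// [Editor's note] Illustrative sketch, not part of the patch: the branches
// around this point decide the new `_serialize_null_into_key` flag per equality
// conjunct -- a single-column key always represents null via the null map,
// while a multi-column null-safe (<=>) key must serialize null into the
// composite key so that NULL <=> NULL can still match. Hypothetical helper below.
#include <cstddef>
inline bool serialize_null_into_key(std::size_t eq_conjunct_count, bool is_null_safe_equal) {
    if (eq_conjunct_count == 1) {
        return false;  // single-column key: null lives in the null map
    }
    return is_null_safe_equal;  // multi-column key: only null-safe equality keeps null in the key
}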
+ _serialize_null_into_key.emplace_back(false); + } } return Status::OK(); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 1d52feaccff5262..45aa1e8c8a262dc 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -56,7 +56,8 @@ class HashJoinBuildSinkLocalState final protected: Status _hash_table_init(RuntimeState* state); - void _set_build_ignore_flag(vectorized::Block& block, const std::vector& res_col_ids); + void _set_build_side_has_external_nullmap(vectorized::Block& block, + const std::vector& res_col_ids); Status _do_evaluate(vectorized::Block& block, vectorized::VExprContextSPtrs& exprs, RuntimeProfile::Counter& expr_call_timer, std::vector& res_col_ids); std::vector _convert_block_to_null(vectorized::Block& block); @@ -79,7 +80,6 @@ class HashJoinBuildSinkLocalState final vectorized::MutableBlock _build_side_mutable_block; std::shared_ptr _runtime_filter_slots; - bool _has_set_need_null_map_for_build = false; /* * The comparison result of a null value with any other value is null, @@ -87,21 +87,19 @@ class HashJoinBuildSinkLocalState final * the result of an equality condition involving null should be false, * so null does not need to be added to the hash table. */ - bool _build_side_ignore_null = false; + bool _build_side_has_external_nullmap = false; std::vector _build_col_ids; std::shared_ptr _finish_dependency; RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; - RuntimeProfile::Counter* _build_side_compute_hash_timer = nullptr; RuntimeProfile::Counter* _build_side_merge_block_timer = nullptr; - RuntimeProfile::Counter* _allocate_resource_timer = nullptr; - RuntimeProfile::Counter* _build_blocks_memory_usage = nullptr; RuntimeProfile::Counter* _hash_table_memory_usage = nullptr; RuntimeProfile::Counter* _build_arena_memory_usage = nullptr; + RuntimeProfile::Counter* _runtime_filter_init_timer = nullptr; }; class HashJoinBuildSinkOperatorX final @@ -154,13 +152,11 @@ class HashJoinBuildSinkOperatorX final // build expr vectorized::VExprContextSPtrs _build_expr_ctxs; // mark the build hash table whether it needs to store null value - std::vector _store_null_in_hash_table; + std::vector _serialize_null_into_key; // mark the join column whether support null eq std::vector _is_null_safe_eq_join; - std::vector _should_convert_to_nullable; - bool _is_broadcast_join = false; std::shared_ptr _shared_hashtable_controller; @@ -184,11 +180,10 @@ struct ProcessHashTableBuild { _batch_size(batch_size), _state(state) {} - template + template Status run(HashTableContext& hash_table_ctx, vectorized::ConstNullMapPtr null_map, bool* has_null_key) { - if (short_circuit_for_null || ignore_null) { + if (null_map) { // first row is mocked and is null // TODO: Need to test the for loop. break may better for (uint32_t i = 1; i < _rows; i++) { @@ -208,8 +203,21 @@ struct ProcessHashTableBuild { hash_table_ctx.init_serialized_keys(_build_raw_ptrs, _rows, null_map ? 
null_map->data() : nullptr, true, true, hash_table_ctx.hash_table->get_bucket_size()); - hash_table_ctx.hash_table->template build( - hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), _rows); + // only 2 cases need to access the null value in hash table + bool keep_null_key = false; + if ((JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) && + with_other_conjuncts) { + //null aware join with other conjuncts + keep_null_key = true; + } else if (_parent->_shared_state->is_null_safe_eq_join.size() == 1 && + _parent->_shared_state->is_null_safe_eq_join[0]) { + // single null safe eq + keep_null_key = true; + } + + hash_table_ctx.hash_table->build(hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), + _rows, keep_null_key); hash_table_ctx.bucket_nums.resize(_batch_size); hash_table_ctx.bucket_nums.shrink_to_fit(); diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index a9d3c962b767e80..426bfcb219dc042 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -57,13 +57,11 @@ Status HashJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) _probe_arena_memory_usage = profile()->AddHighWaterMarkCounter("MemoryUsageProbeKeyArena", TUnit::BYTES, "", 1); // Probe phase - _probe_next_timer = ADD_TIMER(profile(), "ProbeFindNextTime"); _probe_expr_call_timer = ADD_TIMER(profile(), "ProbeExprCallTime"); _search_hashtable_timer = ADD_TIMER(profile(), "ProbeWhenSearchHashTableTime"); _build_side_output_timer = ADD_TIMER(profile(), "ProbeWhenBuildSideOutputTime"); _probe_side_output_timer = ADD_TIMER(profile(), "ProbeWhenProbeSideOutputTime"); - _probe_process_hashtable_timer = ADD_TIMER(profile(), "ProbeWhenProcessHashTableTime"); - _process_other_join_conjunct_timer = ADD_TIMER(profile(), "OtherJoinConjunctTime"); + _non_equal_join_conjuncts_timer = ADD_TIMER(profile(), "NonEqualJoinConjunctEvaluationTime"); _init_probe_side_timer = ADD_TIMER(profile(), "InitProbeSideTime"); return Status::OK(); } @@ -154,11 +152,9 @@ Status HashJoinProbeLocalState::close(RuntimeState* state) { bool HashJoinProbeLocalState::_need_probe_null_map(vectorized::Block& block, const std::vector& res_col_ids) { for (size_t i = 0; i < _probe_expr_ctxs.size(); ++i) { - if (!_shared_state->is_null_safe_eq_join[i]) { - const auto* column = block.get_by_position(res_col_ids[i]).column.get(); - if (check_and_get_column(*column)) { - return true; - } + const auto* column = block.get_by_position(res_col_ids[i]).column.get(); + if (column->is_nullable() && !_shared_state->serialize_null_into_key[i]) { + return true; } } return false; @@ -231,7 +227,6 @@ HashJoinProbeOperatorX::HashJoinProbeOperatorX(ObjectPool* pool, const TPlanNode Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Block* output_block, bool* eos) const { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state._probe_timer); if (local_state._shared_state->short_circuit_for_probe) { // If we use a short-circuit strategy, should return empty block directly. 
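// [Editor's note] Illustrative sketch, not part of the patch: a summary of the
// keep_null_key rule added to ProcessHashTableBuild::run above. Null keys stay
// in the hash table only for (1) null-aware left anti/semi joins that also have
// other join conjuncts, and (2) a single null-safe (<=>) equality key; in all
// other cases a null build key can never match and is not kept. Stand-in
// enum and function below are hypothetical.
enum class SketchJoinOp { NULL_AWARE_LEFT_ANTI_JOIN, NULL_AWARE_LEFT_SEMI_JOIN, OTHER };
inline bool keep_null_key_in_hash_table(SketchJoinOp op, bool with_other_conjuncts,
                                        bool single_null_safe_eq_key) {
    const bool null_aware = op == SketchJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
                            op == SketchJoinOp::NULL_AWARE_LEFT_SEMI_JOIN;
    return (null_aware && with_other_conjuncts) || single_null_safe_eq_key;
}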
*eos = true; @@ -290,16 +285,14 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc if (local_state._probe_index < local_state._probe_block.rows()) { DCHECK(local_state._has_set_need_null_map_for_probe); std::visit( - [&](auto&& arg, auto&& process_hashtable_ctx, auto need_null_map_for_probe, - auto ignore_null) { + [&](auto&& arg, auto&& process_hashtable_ctx, auto need_judge_null) { using HashTableProbeType = std::decay_t; if constexpr (!std::is_same_v) { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { - st = process_hashtable_ctx.template process( + st = process_hashtable_ctx.template process( arg, - need_null_map_for_probe + local_state._null_map_column ? &local_state._null_map_column->get_data() : nullptr, mutable_join_block, &temp_block, @@ -314,8 +307,8 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc }, local_state._shared_state->hash_table_variants->method_variant, *local_state._process_hashtable_ctx_variants, - vectorized::make_bool_variant(local_state._need_null_map_for_probe), - vectorized::make_bool_variant(local_state._shared_state->probe_ignore_null)); + vectorized::make_bool_variant(local_state._need_null_map_for_probe && + local_state._shared_state->probe_ignore_null)); } else if (local_state._probe_eos) { if (_is_right_semi_anti || (_is_outer_join && _join_op != TJoinOp::LEFT_OUTER_JOIN)) { std::visit( @@ -324,7 +317,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc if constexpr (!std::is_same_v) { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { - st = process_hashtable_ctx.process_data_in_hashtable( + st = process_hashtable_ctx.finish_probing( arg, mutable_join_block, &temp_block, eos, _is_mark_join); } else { st = Status::InternalError("uninited hash table"); @@ -383,34 +376,22 @@ Status HashJoinProbeLocalState::_extract_join_column(vectorized::Block& block, } auto& shared_state = *_shared_state; - auto& p = _parent->cast(); for (size_t i = 0; i < shared_state.build_exprs_size; ++i) { - if (p._should_convert_to_nullable[i]) { + const auto* column = block.get_by_position(res_col_ids[i]).column.get(); + if (!column->is_nullable() && shared_state.serialize_null_into_key[i]) { _key_columns_holder.emplace_back( vectorized::make_nullable(block.get_by_position(res_col_ids[i]).column)); _probe_columns[i] = _key_columns_holder.back().get(); - continue; - } - - if (shared_state.is_null_safe_eq_join[i]) { - _probe_columns[i] = block.get_by_position(res_col_ids[i]).column.get(); + } else if (const auto* nullable = check_and_get_column(*column); + nullable && !shared_state.serialize_null_into_key[i]) { + // update nulllmap and split nested out of ColumnNullable when serialize_null_into_key is false and column is nullable + const auto& col_nested = nullable->get_nested_column(); + const auto& col_nullmap = nullable->get_null_map_data(); + DCHECK(_null_map_column != nullptr); + vectorized::VectorizedUtils::update_null_map(_null_map_column->get_data(), col_nullmap); + _probe_columns[i] = &col_nested; } else { - const auto* column = block.get_by_position(res_col_ids[i]).column.get(); - if (const auto* nullable = check_and_get_column(*column)) { - const auto& col_nested = nullable->get_nested_column(); - const auto& col_nullmap = nullable->get_null_map_data(); - - DCHECK(_null_map_column != nullptr); - vectorized::VectorizedUtils::update_null_map(_null_map_column->get_data(), - col_nullmap); - if (shared_state.store_null_in_hash_table[i]) 
{ - _probe_columns[i] = nullable; - } else { - _probe_columns[i] = &col_nested; - } - } else { - _probe_columns[i] = column; - } + _probe_columns[i] = column; } } return Status::OK(); @@ -531,20 +512,6 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeState* state) null_aware || (_probe_expr_ctxs.back()->root()->is_nullable() && probe_dispose_null); conjuncts_index++; - const bool is_null_safe_equal = eq_join_conjunct.__isset.opcode && - (eq_join_conjunct.opcode == TExprOpcode::EQ_FOR_NULL) && - (eq_join_conjunct.right.nodes[0].is_nullable || - eq_join_conjunct.left.nodes[0].is_nullable); - - /// If it's right anti join, - /// we should convert the probe to nullable if the build side is nullable. - /// And if it is 'null safe equal', - /// we must make sure the build side and the probe side are both nullable or non-nullable. - const bool should_convert_to_nullable = - (is_null_safe_equal || _join_op == TJoinOp::RIGHT_ANTI_JOIN) && - !eq_join_conjunct.left.nodes[0].is_nullable && - eq_join_conjunct.right.nodes[0].is_nullable; - _should_convert_to_nullable.emplace_back(should_convert_to_nullable); } for (size_t i = 0; i < _probe_expr_ctxs.size(); ++i) { _probe_ignore_null |= !probe_not_ignore_null[i]; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index 7da7a3b238d3b86..1bdb9d13347d09e 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -117,14 +117,12 @@ class HashJoinProbeLocalState final std::make_unique(); RuntimeProfile::Counter* _probe_expr_call_timer = nullptr; - RuntimeProfile::Counter* _probe_next_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; - RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _probe_arena_memory_usage = nullptr; RuntimeProfile::Counter* _search_hashtable_timer = nullptr; RuntimeProfile::Counter* _init_probe_side_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; - RuntimeProfile::Counter* _process_other_join_conjunct_timer = nullptr; + RuntimeProfile::Counter* _non_equal_join_conjuncts_timer = nullptr; }; class HashJoinProbeOperatorX final : public JoinProbeOperatorX { @@ -178,8 +176,6 @@ class HashJoinProbeOperatorX final : public JoinProbeOperatorX _should_convert_to_nullable; - vectorized::DataTypes _right_table_data_types; vectorized::DataTypes _left_table_data_types; std::vector _hash_output_slot_ids; diff --git a/be/src/pipeline/exec/jdbc_table_sink_operator.cpp b/be/src/pipeline/exec/jdbc_table_sink_operator.cpp index 10fd0d8e40bf253..29c881d1c281000 100644 --- a/be/src/pipeline/exec/jdbc_table_sink_operator.cpp +++ b/be/src/pipeline/exec/jdbc_table_sink_operator.cpp @@ -47,6 +47,7 @@ Status JdbcTableSinkOperatorX::open(RuntimeState* state) { Status JdbcTableSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); + COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); RETURN_IF_ERROR(local_state.sink(state, block, eos)); return Status::OK(); } diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h b/be/src/pipeline/exec/join/process_hash_table_probe.h index 692b91f6a0120a8..14e0edd977f57bc 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe.h @@ -55,7 +55,7 @@ struct ProcessHashTableProbe { 
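// [Editor's note] Illustrative sketch, not part of the patch: the hunk below
// drops the separate `need_null_map_for_probe` / `ignore_null` template
// booleans and replaces them with a single `need_judge_null` parameter; the
// caller now passes the conjunction of the two runtime flags, reducing the
// number of template instantiations while keeping the per-row null check a
// compile-time branch. Simplified stand-in below.
#include <cstdint>
template <bool NeedJudgeNull>
inline bool probe_key_is_usable(uint32_t row, const uint8_t* null_map) {
    if constexpr (NeedJudgeNull) {
        // A null probe key cannot match any build key here, so skip the row.
        if (null_map != nullptr && null_map[row] != 0) {
            return false;
        }
    }
    return true;
}
// Dispatch sketch: bool need_judge_null = need_null_map_for_probe && probe_ignore_null;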
int last_probe_index, bool all_match_one, bool have_other_join_conjunct); - template + template Status process(HashTableType& hash_table_ctx, ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, vectorized::Block* output_block, uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); @@ -64,8 +64,8 @@ struct ProcessHashTableProbe { // the output block struct is same with mutable block. we can do more opt on it and simplify // the logic of probe // TODO: opt the visited here to reduce the size of hash table - template + template Status do_process(HashTableType& hash_table_ctx, ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, vectorized::Block* output_block, uint32_t probe_rows); @@ -87,9 +87,8 @@ struct ProcessHashTableProbe { // Process full outer join/ right join / right semi/anti join to output the join result // in hash table template - Status process_data_in_hashtable(HashTableType& hash_table_ctx, - vectorized::MutableBlock& mutable_block, - vectorized::Block* output_block, bool* eos, bool is_mark_join); + Status finish_probing(HashTableType& hash_table_ctx, vectorized::MutableBlock& mutable_block, + vectorized::Block* output_block, bool* eos, bool is_mark_join); /// For null aware join with other conjuncts, if the probe key of one row on left side is null, /// we should make this row match with all rows in build side. @@ -136,7 +135,7 @@ struct ProcessHashTableProbe { RuntimeProfile::Counter* _init_probe_side_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; - RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; + RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; size_t _right_col_idx; size_t _right_col_len; diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 7fc639b47a4d015..05cd3d7d9e0590f 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -56,7 +56,7 @@ ProcessHashTableProbe::ProcessHashTableProbe(HashJoinProbeLocalState _init_probe_side_timer(parent->_init_probe_side_timer), _build_side_output_timer(parent->_build_side_output_timer), _probe_side_output_timer(parent->_probe_side_output_timer), - _probe_process_hashtable_timer(parent->_probe_process_hashtable_timer), + _finish_probe_phase_timer(parent->_finish_probe_phase_timer), _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct) ? 0 : _parent->left_table_data_types().size()), @@ -187,8 +187,8 @@ typename HashTableType::State ProcessHashTableProbe::_init_probe_sid } template -template +template Status ProcessHashTableProbe::do_process(HashTableType& hash_table_ctx, vectorized::ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, @@ -206,8 +206,8 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c SCOPED_TIMER(_init_probe_side_timer); _init_probe_side( hash_table_ctx, probe_rows, with_other_conjuncts, - need_null_map_for_probe ? null_map->data() : nullptr, - need_null_map_for_probe && ignore_null && + null_map ? 
null_map->data() : nullptr, + need_judge_null && (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || @@ -255,14 +255,12 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c } } else { SCOPED_TIMER(_search_hashtable_timer); - auto [new_probe_idx, new_build_idx, - new_current_offset] = hash_table_ctx.hash_table->template find_batch < JoinOpType, - with_other_conjuncts, is_mark_join, - need_null_map_for_probe && - ignore_null > (hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), - probe_index, build_index, cast_set(probe_rows), - _probe_indexs.data(), _probe_visited, _build_indexs.data(), - has_mark_join_conjunct); + auto [new_probe_idx, new_build_idx, new_current_offset] = + hash_table_ctx.hash_table->template find_batch( + hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), probe_index, + build_index, cast_set(probe_rows), _probe_indexs.data(), + _probe_visited, _build_indexs.data(), has_mark_join_conjunct); probe_index = new_probe_idx; build_index = new_build_idx; current_offset = new_current_offset; @@ -504,7 +502,7 @@ Status ProcessHashTableProbe::do_other_join_conjuncts(vectorized::Bl return Status::OK(); } - SCOPED_TIMER(_parent->_process_other_join_conjunct_timer); + SCOPED_TIMER(_parent->_non_equal_join_conjuncts_timer); size_t orig_columns = output_block->columns(); vectorized::IColumn::Filter other_conjunct_filter(row_count, 1); { @@ -619,10 +617,11 @@ Status ProcessHashTableProbe::do_other_join_conjuncts(vectorized::Bl template template -Status ProcessHashTableProbe::process_data_in_hashtable( - HashTableType& hash_table_ctx, vectorized::MutableBlock& mutable_block, - vectorized::Block* output_block, bool* eos, bool is_mark_join) { - SCOPED_TIMER(_probe_process_hashtable_timer); +Status ProcessHashTableProbe::finish_probing(HashTableType& hash_table_ctx, + vectorized::MutableBlock& mutable_block, + vectorized::Block* output_block, bool* eos, + bool is_mark_join) { + SCOPED_TIMER(_finish_probe_phase_timer); auto& mcol = mutable_block.mutable_columns(); if (is_mark_join) { std::unique_ptr mark_column = @@ -675,7 +674,7 @@ Status ProcessHashTableProbe::process_data_in_hashtable( } template -template +template Status ProcessHashTableProbe::process(HashTableType& hash_table_ctx, vectorized::ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, @@ -685,9 +684,9 @@ Status ProcessHashTableProbe::process(HashTableType& hash_table_ctx, Status res; std::visit( [&](auto is_mark_join, auto have_other_join_conjunct) { - res = do_process( - hash_table_ctx, null_map, mutable_block, output_block, probe_rows); + res = do_process(hash_table_ctx, null_map, mutable_block, + output_block, probe_rows); }, vectorized::make_bool_variant(is_mark_join), vectorized::make_bool_variant(have_other_join_conjunct)); @@ -703,50 +702,32 @@ struct ExtractType { }; #define INSTANTIATION(JoinOpType, T) \ - template Status \ - ProcessHashTableProbe::process::Type>( \ + template Status ProcessHashTableProbe::process::Type>( \ ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - template Status \ - ProcessHashTableProbe::process::Type>( \ + template Status ProcessHashTableProbe::process::Type>( \ ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ vectorized::MutableBlock & 
mutable_block, vectorized::Block * output_block, \ uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - template Status \ - ProcessHashTableProbe::process::Type>( \ - ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ - vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ - uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - template Status \ - ProcessHashTableProbe::process::Type>( \ - ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ - vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ - uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - \ - template Status \ - ProcessHashTableProbe::process_data_in_hashtable::Type>( \ + template Status ProcessHashTableProbe::finish_probing::Type>( \ ExtractType::Type & hash_table_ctx, vectorized::MutableBlock & mutable_block, \ vectorized::Block * output_block, bool* eos, bool is_mark_join); -#define INSTANTIATION_FOR(JoinOpType) \ - template struct ProcessHashTableProbe; \ - \ - INSTANTIATION(JoinOpType, (SerializedHashTableContext)); \ - INSTANTIATION(JoinOpType, (I8HashTableContext)); \ - INSTANTIATION(JoinOpType, (I16HashTableContext)); \ - INSTANTIATION(JoinOpType, (I32HashTableContext)); \ - INSTANTIATION(JoinOpType, (I64HashTableContext)); \ - INSTANTIATION(JoinOpType, (I128HashTableContext)); \ - INSTANTIATION(JoinOpType, (I256HashTableContext)); \ - INSTANTIATION(JoinOpType, (I64FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I64FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I128FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I128FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (MethodOneString)); \ - INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext)); +#define INSTANTIATION_FOR(JoinOpType) \ + template struct ProcessHashTableProbe; \ + \ + INSTANTIATION(JoinOpType, (SerializedHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (MethodOneString)); #include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/join_build_sink_operator.cpp b/be/src/pipeline/exec/join_build_sink_operator.cpp index fc0d3b8746077ba..8b3f5cd98ff7c06 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.cpp +++ b/be/src/pipeline/exec/join_build_sink_operator.cpp @@ -33,15 +33,11 @@ Status JoinBuildSinkLocalState::init(RuntimeState* stat PipelineXSinkLocalState::profile()->add_info_string("JoinType", to_string(p._join_op)); - _build_rows_counter = ADD_COUNTER(PipelineXSinkLocalState::profile(), - "BuildRows", TUnit::UNIT); _publish_runtime_filter_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), 
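The reworked INSTANTIATION / INSTANTIATION_FOR macros above shrink the list of explicitly instantiated hash table contexts down to the generic PrimaryTypeHashTableContext / FixedKeyHashTableContext forms. A minimal sketch of macro-driven explicit instantiation of a member template, using hypothetical type names rather than the Doris ones:

#include <cstdint>

template <int JoinOp>
struct Probe {
    template <typename Ctx>
    void process(Ctx&) {}
};

struct SerializedCtx {};
template <typename Key>
struct PrimaryTypeCtx {};

// One explicit instantiation per (join op, context) pair, mirroring INSTANTIATION_FOR.
#define INSTANTIATE(JoinOp, Ctx) template void Probe<JoinOp>::process<Ctx>(Ctx&);

INSTANTIATE(0, SerializedCtx)
INSTANTIATE(0, PrimaryTypeCtx<std::uint64_t>)
INSTANTIATE(0, PrimaryTypeCtx<std::uint32_t>)

int main() { return 0; }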
"PublishRuntimeFilterTime"); - _runtime_filter_compute_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), - "RuntimeFilterComputeTime"); - _runtime_filter_init_timer = - ADD_TIMER(PipelineXSinkLocalState::profile(), "RuntimeFilterInitTime"); + _runtime_filter_compute_timer = + ADD_TIMER(PipelineXSinkLocalState::profile(), "BuildRuntimeFilterTime"); return Status::OK(); } diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index 714e0c341906781..9d79a97397ff776 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -39,10 +39,8 @@ class JoinBuildSinkLocalState : public PipelineXSinkLocalState template friend class JoinBuildSinkOperatorX; - RuntimeProfile::Counter* _build_rows_counter = nullptr; RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; - RuntimeProfile::Counter* _runtime_filter_init_timer = nullptr; std::vector> _runtime_filters; }; diff --git a/be/src/pipeline/exec/join_probe_operator.cpp b/be/src/pipeline/exec/join_probe_operator.cpp index 76dc75a90d8f3c4..11b5b29c8b556b1 100644 --- a/be/src/pipeline/exec/join_probe_operator.cpp +++ b/be/src/pipeline/exec/join_probe_operator.cpp @@ -29,11 +29,10 @@ Status JoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - _probe_timer = ADD_TIMER(Base::profile(), "ProbeTime"); _join_filter_timer = ADD_TIMER(Base::profile(), "JoinFilterTimer"); _build_output_block_timer = ADD_TIMER(Base::profile(), "BuildOutputBlock"); _probe_rows_counter = ADD_COUNTER_WITH_LEVEL(Base::profile(), "ProbeRows", TUnit::UNIT, 1); - + _finish_probe_phase_timer = ADD_TIMER(Base::profile(), "FinishProbePhaseTime"); return Status::OK(); } diff --git a/be/src/pipeline/exec/join_probe_operator.h b/be/src/pipeline/exec/join_probe_operator.h index 3f68c73d04b1612..078806cea4fc5ac 100644 --- a/be/src/pipeline/exec/join_probe_operator.h +++ b/be/src/pipeline/exec/join_probe_operator.h @@ -49,10 +49,10 @@ class JoinProbeLocalState : public PipelineXLocalState { size_t _mark_column_id = -1; - RuntimeProfile::Counter* _probe_timer = nullptr; RuntimeProfile::Counter* _probe_rows_counter = nullptr; RuntimeProfile::Counter* _join_filter_timer = nullptr; RuntimeProfile::Counter* _build_output_block_timer = nullptr; + RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; std::unique_ptr _child_block = nullptr; bool _child_eos = false; diff --git a/be/src/pipeline/exec/memory_scratch_sink_operator.cpp b/be/src/pipeline/exec/memory_scratch_sink_operator.cpp index 1d022f9304fd0de..2c69c0e2b2ba9fc 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.cpp +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.cpp @@ -33,6 +33,9 @@ Status MemoryScratchSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _get_arrow_schema_timer = ADD_TIMER(_profile, "GetArrowSchemaTime"); + _convert_block_to_arrow_batch_timer = ADD_TIMER(_profile, "ConvertBlockToArrowBatchTime"); + _evaluation_timer = ADD_TIMER(_profile, "EvaluationTime"); // create queue state->exec_env()->result_queue_mgr()->create_queue(state->fragment_instance_id(), &_queue); @@ -92,13 +95,22 @@ Status MemoryScratchSinkOperatorX::sink(RuntimeState* state, vectorized::Block* // Exec vectorized expr here to speed up, block.rows() == 0 means expr 
exec // failed, just return the error status vectorized::Block block; - RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( - local_state._output_vexpr_ctxs, *input_block, &block)); + { + SCOPED_TIMER(local_state._evaluation_timer); + RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( + local_state._output_vexpr_ctxs, *input_block, &block)); + } std::shared_ptr block_arrow_schema; - // After expr executed, use recaculated schema as final schema - RETURN_IF_ERROR(convert_block_arrow_schema(block, &block_arrow_schema, state->timezone())); - RETURN_IF_ERROR(convert_to_arrow_batch(block, block_arrow_schema, arrow::default_memory_pool(), - &result, _timezone_obj)); + { + SCOPED_TIMER(local_state._get_arrow_schema_timer); + // After expr executed, use recaculated schema as final schema + RETURN_IF_ERROR(get_arrow_schema(block, &block_arrow_schema, state->timezone())); + } + { + SCOPED_TIMER(local_state._convert_block_to_arrow_batch_timer); + RETURN_IF_ERROR(convert_to_arrow_batch( + block, block_arrow_schema, arrow::default_memory_pool(), &result, _timezone_obj)); + } local_state._queue->blocking_put(result); if (local_state._queue->size() > config::max_memory_sink_batch_count) { local_state._queue_dependency->block(); diff --git a/be/src/pipeline/exec/memory_scratch_sink_operator.h b/be/src/pipeline/exec/memory_scratch_sink_operator.h index 69c0fa14042ef28..c74659d15b96f29 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.h +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.h @@ -45,6 +45,9 @@ class MemoryScratchSinkLocalState final : public PipelineXSinkLocalState _queue_dependency = nullptr; + RuntimeProfile::Counter* _get_arrow_schema_timer = nullptr; + RuntimeProfile::Counter* _convert_block_to_arrow_batch_timer = nullptr; + RuntimeProfile::Counter* _evaluation_timer = nullptr; }; class MemoryScratchSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp index 71204f1285ce7b1..e45e59d17e27b37 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp @@ -40,6 +40,9 @@ Status MultiCastDataStreamSourceLocalState::init(RuntimeState* state, LocalState auto& p = _parent->cast(); _shared_state->multi_cast_data_streamer->set_dep_by_sender_idx(p._consumer_id, _dependency); _wait_for_rf_timer = ADD_TIMER(_runtime_profile, "WaitForRuntimeFilter"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); + _get_data_timer = ADD_TIMER(_runtime_profile, "GetDataTime"); + _materialize_data_timer = ADD_TIMER(_runtime_profile, "MaterializeDataTime"); // init profile for runtime filter RuntimeFilterConsumer::_init_profile(profile()); init_runtime_filter_dependency(_filter_dependencies, p.operator_id(), p.node_id(), @@ -86,15 +89,19 @@ Status MultiCastDataStreamerSourceOperatorX::get_block(RuntimeState* state, if (!local_state._output_expr_contexts.empty()) { output_block = &tmp_block; } - RETURN_IF_ERROR(local_state._shared_state->multi_cast_data_streamer->pull(_consumer_id, - output_block, eos)); - + { + SCOPED_TIMER(local_state._get_data_timer); + RETURN_IF_ERROR(local_state._shared_state->multi_cast_data_streamer->pull( + _consumer_id, output_block, eos)); + } if (!local_state._conjuncts.empty()) { + SCOPED_TIMER(local_state._filter_timer); RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, 
output_block->columns())); } if (!local_state._output_expr_contexts.empty() && output_block->rows() > 0) { + SCOPED_TIMER(local_state._materialize_data_timer); RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( local_state._output_expr_contexts, *output_block, block, true)); vectorized::materialize_block_inplace(*block); diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.h b/be/src/pipeline/exec/multi_cast_data_stream_source.h index 2059f706cad3f50..57410bf8d9568a5 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.h +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.h @@ -68,6 +68,9 @@ class MultiCastDataStreamSourceLocalState final : public PipelineXLocalState> _filter_dependencies; RuntimeProfile::Counter* _wait_for_rf_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; + RuntimeProfile::Counter* _get_data_timer = nullptr; + RuntimeProfile::Counter* _materialize_data_timer = nullptr; }; class MultiCastDataStreamerSourceOperatorX final diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 59020a5df437bde..83b378e792c3fa3 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -139,7 +139,6 @@ Status NestedLoopJoinBuildSinkOperatorX::sink(doris::RuntimeState* state, vector } if (eos) { - COUNTER_UPDATE(local_state._build_rows_counter, local_state._build_rows); RuntimeFilterBuild rf_ctx(&local_state); RETURN_IF_ERROR(rf_ctx(state)); diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index d0fb4ee19a58249..afa1a2e59b798ce 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -43,6 +43,10 @@ Status NestedLoopJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _loop_join_timer = ADD_TIMER(profile(), "LoopGenerateJoin"); + _output_temp_blocks_timer = ADD_TIMER(profile(), "OutputTempBlocksTime"); + _update_visited_flags_timer = ADD_TIMER(profile(), "UpdateVisitedFlagsTime"); + _join_conjuncts_evaluation_timer = ADD_TIMER(profile(), "JoinConjunctsEvaluationTime"); + _filtered_by_join_conjuncts_timer = ADD_TIMER(profile(), "FilteredByJoinConjunctsTime"); return Status::OK(); } @@ -168,23 +172,26 @@ Status NestedLoopJoinProbeLocalState::generate_join_block_data(RuntimeState* sta _process_left_child_block(_join_block, now_process_build_block); } - if constexpr (set_probe_side_flag) { - RETURN_IF_ERROR( - (_do_filtering_and_update_visited_flags( - &_join_block, !p._is_left_semi_anti))); - _update_additional_flags(&_join_block); - // If this join operation is left outer join or full outer join, when - // `_left_side_process_count`, means all rows from build - // side have been joined with _left_side_process_count, we should output current - // probe row with null from build side. 
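Most hunks in this PR follow one instrumentation pattern: register a named RuntimeProfile timer once in init() via ADD_TIMER, then wrap each phase of the operator (get data, filter, materialize, and so on) in a SCOPED_TIMER block so the elapsed time is charged to that counter. A self-contained sketch of the idea using only std::chrono; Doris's actual Counter type and macros are assumed to behave analogously.

#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>

struct Counter {
    std::atomic<int64_t> nanos {0};
};

class ScopedTimer {
public:
    explicit ScopedTimer(Counter* counter)
            : _counter(counter), _start(std::chrono::steady_clock::now()) {}
    // On scope exit, add the elapsed wall time to the counter.
    ~ScopedTimer() {
        auto elapsed = std::chrono::steady_clock::now() - _start;
        _counter->nanos +=
                std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
    }

private:
    Counter* _counter;
    std::chrono::steady_clock::time_point _start;
};

int main() {
    Counter get_data_timer; // registered once, e.g. in a local state's init()
    {
        ScopedTimer t(&get_data_timer); // stand-in for SCOPED_TIMER(_get_data_timer)
        // ... pull / filter / materialize work would run here ...
    }
    std::printf("GetDataTime: %lld ns\n",
                static_cast<long long>(get_data_timer.nanos.load()));
    return 0;
}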
- if (_left_side_process_count) { - _finalize_current_phase( - _join_block, state->batch_size()); + { + SCOPED_TIMER(_finish_probe_phase_timer); + if constexpr (set_probe_side_flag) { + RETURN_IF_ERROR( + (_do_filtering_and_update_visited_flags( + &_join_block, !p._is_left_semi_anti))); + _update_additional_flags(&_join_block); + // If this join operation is left outer join or full outer join, when + // `_left_side_process_count`, means all rows from build + // side have been joined with _left_side_process_count, we should output current + // probe row with null from build side. + if (_left_side_process_count) { + _finalize_current_phase( + _join_block, state->batch_size()); + } + } else if (_left_side_process_count && p._is_mark_join && + _shared_state->build_blocks.empty()) { + _append_left_data_with_null(_join_block); } - } else if (_left_side_process_count && p._is_mark_join && - _shared_state->build_blocks.empty()) { - _append_left_data_with_null(_join_block); } } @@ -377,6 +384,7 @@ void NestedLoopJoinProbeLocalState::_append_left_data_with_null(vectorized::Bloc void NestedLoopJoinProbeLocalState::_process_left_child_block( vectorized::Block& block, const vectorized::Block& now_process_build_block) const { + SCOPED_TIMER(_output_temp_blocks_timer); auto& p = _parent->cast(); auto dst_columns = block.mutate_columns(); const size_t max_added_rows = now_process_build_block.rows(); @@ -485,6 +493,7 @@ Status NestedLoopJoinProbeOperatorX::push(doris::RuntimeState* state, vectorized set_build_side_flag, set_probe_side_flag>( state, join_op_variants); }; + SCOPED_TIMER(local_state._loop_join_timer); RETURN_IF_ERROR( std::visit(func, local_state._shared_state->join_op_variants, vectorized::make_bool_variant(_match_all_build || _is_right_semi_anti), diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.h b/be/src/pipeline/exec/nested_loop_join_probe_operator.h index 5b0fec159e28bf1..c744e6acdc507e3 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.h @@ -68,42 +68,48 @@ class NestedLoopJoinProbeLocalState final size_t build_block_idx, size_t processed_blocks_num, bool materialize, Filter& filter) { - if constexpr (SetBuildSideFlag) { - for (size_t i = 0; i < processed_blocks_num; i++) { - auto& build_side_flag = - assert_cast( - _shared_state->build_side_visited_flags[build_block_idx].get()) - ->get_data(); - auto* __restrict build_side_flag_data = build_side_flag.data(); - auto cur_sz = build_side_flag.size(); - const size_t offset = _build_offset_stack.top(); - _build_offset_stack.pop(); - for (size_t j = 0; j < cur_sz; j++) { - build_side_flag_data[j] |= filter[offset + j]; + { + SCOPED_TIMER(_update_visited_flags_timer); + if constexpr (SetBuildSideFlag) { + for (size_t i = 0; i < processed_blocks_num; i++) { + auto& build_side_flag = + assert_cast( + _shared_state->build_side_visited_flags[build_block_idx].get()) + ->get_data(); + auto* __restrict build_side_flag_data = build_side_flag.data(); + auto cur_sz = build_side_flag.size(); + const size_t offset = _build_offset_stack.top(); + _build_offset_stack.pop(); + for (size_t j = 0; j < cur_sz; j++) { + build_side_flag_data[j] |= filter[offset + j]; + } + build_block_idx = build_block_idx == 0 ? _shared_state->build_blocks.size() - 1 + : build_block_idx - 1; } - build_block_idx = build_block_idx == 0 ? 
_shared_state->build_blocks.size() - 1 - : build_block_idx - 1; } - } - if constexpr (SetProbeSideFlag) { - int64_t end = filter.size(); - for (int i = _left_block_pos == _child_block->rows() ? _left_block_pos - 1 - : _left_block_pos; - i >= _left_block_start_pos; i--) { - int64_t offset = 0; - if (!_probe_offset_stack.empty()) { - offset = _probe_offset_stack.top(); - _probe_offset_stack.pop(); - } - if (!_cur_probe_row_visited_flags[i]) { - _cur_probe_row_visited_flags[i] = - simd::contain_byte(filter.data() + offset, end - offset, 1) ? 1 - : 0; + if constexpr (SetProbeSideFlag) { + int64_t end = filter.size(); + for (int i = _left_block_pos == _child_block->rows() ? _left_block_pos - 1 + : _left_block_pos; + i >= _left_block_start_pos; i--) { + int64_t offset = 0; + if (!_probe_offset_stack.empty()) { + offset = _probe_offset_stack.top(); + _probe_offset_stack.pop(); + } + if (!_cur_probe_row_visited_flags[i]) { + _cur_probe_row_visited_flags[i] = + simd::contain_byte(filter.data() + offset, end - offset, 1) + ? 1 + : 0; + } + end = offset; } - end = offset; } } + if (materialize) { + SCOPED_TIMER(_filtered_by_join_conjuncts_timer); vectorized::Block::filter_block_internal(block, filter, column_to_keep); } else { CLEAR_BLOCK @@ -125,8 +131,11 @@ class NestedLoopJoinProbeLocalState final if (LIKELY(!_join_conjuncts.empty() && block->rows() > 0)) { vectorized::IColumn::Filter filter(block->rows(), 1); bool can_filter_all = false; - RETURN_IF_ERROR(vectorized::VExprContext::execute_conjuncts( - _join_conjuncts, nullptr, IgnoreNull, block, &filter, &can_filter_all)); + { + SCOPED_TIMER(_join_conjuncts_evaluation_timer); + RETURN_IF_ERROR(vectorized::VExprContext::execute_conjuncts( + _join_conjuncts, nullptr, IgnoreNull, block, &filter, &can_filter_all)); + } if (can_filter_all) { CLEAR_BLOCK @@ -185,6 +194,10 @@ class NestedLoopJoinProbeLocalState final vectorized::VExprContextSPtrs _join_conjuncts; RuntimeProfile::Counter* _loop_join_timer = nullptr; + RuntimeProfile::Counter* _output_temp_blocks_timer = nullptr; + RuntimeProfile::Counter* _update_visited_flags_timer = nullptr; + RuntimeProfile::Counter* _join_conjuncts_evaluation_timer = nullptr; + RuntimeProfile::Counter* _filtered_by_join_conjuncts_timer = nullptr; }; class NestedLoopJoinProbeOperatorX final diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index 2a2b3fdd3b95b2e..e1b81d8dae3b071 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -730,6 +730,9 @@ class OperatorXBase : public OperatorBase { void set_parallel_tasks(int parallel_tasks) { _parallel_tasks = parallel_tasks; } int parallel_tasks() const { return _parallel_tasks; } + // To keep compatibility with older FE + void set_serial_operator() { _is_serial_operator = true; } + protected: template friend class PipelineXLocalState; diff --git a/be/src/pipeline/exec/repeat_operator.cpp b/be/src/pipeline/exec/repeat_operator.cpp index dba4f27af7c385d..5c94d43f0d1e05d 100644 --- a/be/src/pipeline/exec/repeat_operator.cpp +++ b/be/src/pipeline/exec/repeat_operator.cpp @@ -46,6 +46,16 @@ Status RepeatLocalState::open(RuntimeState* state) { return Status::OK(); } +Status RepeatLocalState::init(RuntimeState* state, LocalStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _evaluate_input_timer = ADD_TIMER(profile(), "EvaluateInputDataTime"); + _get_repeat_data_timer = ADD_TIMER(profile(), "GetRepeatDataTime"); + _filter_timer = 
ADD_TIMER(profile(), "FilterTime"); + return Status::OK(); +} + Status RepeatOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(OperatorXBase::init(tnode, state)); RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(tnode.repeat_node.exprs, _expr_ctxs)); @@ -166,23 +176,24 @@ Status RepeatLocalState::add_grouping_id_column(std::size_t rows, std::size_t& c Status RepeatOperatorX::push(RuntimeState* state, vectorized::Block* input_block, bool eos) const { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state._evaluate_input_timer); local_state._child_eos = eos; - auto& _intermediate_block = local_state._intermediate_block; - auto& _expr_ctxs = local_state._expr_ctxs; - DCHECK(!_intermediate_block || _intermediate_block->rows() == 0); + auto& intermediate_block = local_state._intermediate_block; + auto& expr_ctxs = local_state._expr_ctxs; + DCHECK(!intermediate_block || intermediate_block->rows() == 0); if (input_block->rows() > 0) { - _intermediate_block = vectorized::Block::create_unique(); + intermediate_block = vectorized::Block::create_unique(); - for (auto& expr : _expr_ctxs) { + for (auto& expr : expr_ctxs) { int result_column_id = -1; RETURN_IF_ERROR(expr->execute(input_block, &result_column_id)); DCHECK(result_column_id != -1); input_block->get_by_position(result_column_id).column = input_block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); - _intermediate_block->insert(input_block->get_by_position(result_column_id)); + intermediate_block->insert(input_block->get_by_position(result_column_id)); } - DCHECK_EQ(_expr_ctxs.size(), _intermediate_block->columns()); + DCHECK_EQ(expr_ctxs.size(), intermediate_block->columns()); } return Status::OK(); @@ -202,33 +213,39 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp } DCHECK(output_block->rows() == 0); - if (_intermediate_block && _intermediate_block->rows() > 0) { - RETURN_IF_ERROR(local_state.get_repeated_block(_intermediate_block.get(), _repeat_id_idx, - output_block)); + { + SCOPED_TIMER(local_state._get_repeat_data_timer); + if (_intermediate_block && _intermediate_block->rows() > 0) { + RETURN_IF_ERROR(local_state.get_repeated_block(_intermediate_block.get(), + _repeat_id_idx, output_block)); - _repeat_id_idx++; + _repeat_id_idx++; - int size = _repeat_id_list.size(); - if (_repeat_id_idx >= size) { - _intermediate_block->clear(); + int size = _repeat_id_list.size(); + if (_repeat_id_idx >= size) { + _intermediate_block->clear(); + _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); + _repeat_id_idx = 0; + } + } else if (local_state._expr_ctxs.empty()) { + auto m_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( + output_block, _output_slots); + auto rows = _child_block.rows(); + auto& columns = m_block.mutable_columns(); + + for (int repeat_id_idx = 0; repeat_id_idx < _repeat_id_list.size(); repeat_id_idx++) { + std::size_t cur_col = 0; + RETURN_IF_ERROR( + local_state.add_grouping_id_column(rows, cur_col, columns, repeat_id_idx)); + } _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); - _repeat_id_idx = 0; } - } else if (local_state._expr_ctxs.empty()) { - auto m_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block(output_block, - _output_slots); - auto rows = _child_block.rows(); - auto& columns = m_block.mutable_columns(); - - for (int repeat_id_idx = 0; repeat_id_idx < _repeat_id_list.size(); repeat_id_idx++) { - std::size_t cur_col = 0; 
- RETURN_IF_ERROR( - local_state.add_grouping_id_column(rows, cur_col, columns, repeat_id_idx)); - } - _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); } - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, - output_block->columns())); + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, + output_block->columns())); + } *eos = _child_eos && _child_block.rows() == 0; local_state.reached_limit(output_block, eos); return Status::OK(); diff --git a/be/src/pipeline/exec/repeat_operator.h b/be/src/pipeline/exec/repeat_operator.h index 22398df372ae654..31f88f37231aaae 100644 --- a/be/src/pipeline/exec/repeat_operator.h +++ b/be/src/pipeline/exec/repeat_operator.h @@ -36,6 +36,7 @@ class RepeatLocalState final : public PipelineXLocalState { using Base = PipelineXLocalState; RepeatLocalState(RuntimeState* state, OperatorXBase* parent); + Status init(RuntimeState* state, LocalStateInfo& info) override; Status open(RuntimeState* state) override; Status get_repeated_block(vectorized::Block* child_block, int repeat_id_idx, @@ -53,6 +54,10 @@ class RepeatLocalState final : public PipelineXLocalState { int _repeat_id_idx; std::unique_ptr _intermediate_block; vectorized::VExprContextSPtrs _expr_ctxs; + + RuntimeProfile::Counter* _evaluate_input_timer = nullptr; + RuntimeProfile::Counter* _get_repeat_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; class RepeatOperatorX final : public StatefulOperatorX { diff --git a/be/src/pipeline/exec/result_file_sink_operator.cpp b/be/src/pipeline/exec/result_file_sink_operator.cpp index 93026427b86d56d..7c9c38ece5c4e94 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.cpp +++ b/be/src/pipeline/exec/result_file_sink_operator.cpp @@ -85,12 +85,6 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i SCOPED_TIMER(_init_timer); _sender_id = info.sender_id; - _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); - _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); - _split_block_distribute_by_channel_timer = - ADD_TIMER(_profile, "SplitBlockDistributeByChannelTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); auto& p = _parent->cast(); CHECK(p._file_opts.get() != nullptr); // create sender diff --git a/be/src/pipeline/exec/result_file_sink_operator.h b/be/src/pipeline/exec/result_file_sink_operator.h index 7268efe4de40654..e9f2b8eeb9c6700 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.h +++ b/be/src/pipeline/exec/result_file_sink_operator.h @@ -40,26 +40,12 @@ class ResultFileSinkLocalState final [[nodiscard]] int sender_id() const { return _sender_id; } - RuntimeProfile::Counter* brpc_wait_timer() { return _brpc_wait_timer; } - RuntimeProfile::Counter* local_send_timer() { return _local_send_timer; } - RuntimeProfile::Counter* brpc_send_timer() { return _brpc_send_timer; } - RuntimeProfile::Counter* merge_block_timer() { return _merge_block_timer; } - RuntimeProfile::Counter* split_block_distribute_by_channel_timer() { - return _split_block_distribute_by_channel_timer; - } - private: friend class ResultFileSinkOperatorX; std::shared_ptr _sender; std::shared_ptr _block_holder; - RuntimeProfile::Counter* _brpc_wait_timer = nullptr; - RuntimeProfile::Counter* _local_send_timer = nullptr; - RuntimeProfile::Counter* _brpc_send_timer = nullptr; - 
RuntimeProfile::Counter* _merge_block_timer = nullptr; - RuntimeProfile::Counter* _split_block_distribute_by_channel_timer = nullptr; - int _sender_id; }; diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index f04ace2e292595c..53a517f859c4e3c 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -41,13 +41,12 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _fetch_row_id_timer = ADD_TIMER(profile(), "FetchRowIdTime"); + _write_data_timer = ADD_TIMER(profile(), "WriteDataTime"); static const std::string timer_name = "WaitForDependencyTime"; _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL(_profile, timer_name, 1); auto fragment_instance_id = state->fragment_instance_id(); - _blocks_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); - _rows_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "RowsProduced", TUnit::UNIT, 1); - if (state->query_options().enable_parallel_result_sink) { _sender = _parent->cast()._sender; } else { @@ -146,12 +145,15 @@ Status ResultSinkOperatorX::open(RuntimeState* state) { Status ResultSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - COUNTER_UPDATE(local_state.rows_sent_counter(), (int64_t)block->rows()); - COUNTER_UPDATE(local_state.blocks_sent_counter(), 1); + COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); if (_fetch_option.use_two_phase_fetch && block->rows() > 0) { + SCOPED_TIMER(local_state._fetch_row_id_timer); RETURN_IF_ERROR(_second_phase_fetch_data(state, block)); } - RETURN_IF_ERROR(local_state._writer->write(state, *block)); + { + SCOPED_TIMER(local_state._write_data_timer); + RETURN_IF_ERROR(local_state._writer->write(state, *block)); + } if (_fetch_option.use_two_phase_fetch) { // Block structure may be changed by calling _second_phase_fetch_data(). // So we should clear block in case of unmatched columns diff --git a/be/src/pipeline/exec/result_sink_operator.h b/be/src/pipeline/exec/result_sink_operator.h index 3c503096ecb51e8..339c167825643bd 100644 --- a/be/src/pipeline/exec/result_sink_operator.h +++ b/be/src/pipeline/exec/result_sink_operator.h @@ -128,8 +128,6 @@ class ResultSinkLocalState final : public PipelineXSinkLocalState _sender = nullptr; std::shared_ptr _writer = nullptr; - RuntimeProfile::Counter* _blocks_sent_counter = nullptr; - RuntimeProfile::Counter* _rows_sent_counter = nullptr; + + RuntimeProfile::Counter* _fetch_row_id_timer = nullptr; + RuntimeProfile::Counter* _write_data_timer = nullptr; }; class ResultSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index 024a737e1579a32..6200f3b12ce5a05 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -1281,6 +1281,7 @@ Status ScanOperatorX::get_block(RuntimeState* state, vectorized: if (*eos) { // reach limit, stop the scanners. 
local_state._scanner_ctx->stop_scanners(state); + local_state._scanner_profile->add_info_string("EOS", "True"); } return Status::OK(); diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index 37db9afacfcacd4..813dad3ad79de64 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -71,12 +71,16 @@ Status SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized auto probe_rows = in_block->rows(); if (probe_rows > 0) { - RETURN_IF_ERROR(_extract_probe_column(local_state, *in_block, local_state._probe_columns, - _cur_child_id)); + { + SCOPED_TIMER(local_state._extract_probe_data_timer); + RETURN_IF_ERROR(_extract_probe_column(local_state, *in_block, + local_state._probe_columns, _cur_child_id)); + } RETURN_IF_ERROR(std::visit( [&](auto&& arg) -> Status { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { + SCOPED_TIMER(local_state._probe_timer); vectorized::HashTableProbe process_hashtable_ctx(&local_state, probe_rows); return process_hashtable_ctx.mark_data_in_hashtable(arg); @@ -99,6 +103,9 @@ Status SetProbeSinkLocalState::init(RuntimeState* state, LocalSink RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + + _probe_timer = ADD_TIMER(Base::profile(), "ProbeTime"); + _extract_probe_data_timer = ADD_TIMER(Base::profile(), "ExtractProbeDataTime"); Parent& parent = _parent->cast(); _shared_state->probe_finished_children_dependency[parent._cur_child_id] = _dependency; _dependency->block(); diff --git a/be/src/pipeline/exec/set_probe_sink_operator.h b/be/src/pipeline/exec/set_probe_sink_operator.h index f320c8e89cdcaf4..368ea812cdfe013 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.h +++ b/be/src/pipeline/exec/set_probe_sink_operator.h @@ -60,6 +60,9 @@ class SetProbeSinkLocalState final : public PipelineXSinkLocalState diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 9a81333efaed89d..539134e53e7fe21 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -40,8 +40,10 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo auto& valid_element_in_hash_tbl = local_state._shared_state->valid_element_in_hash_tbl; if (in_block->rows() != 0) { - RETURN_IF_ERROR(local_state._mutable_block.merge(*in_block)); - + { + SCOPED_TIMER(local_state._merge_block_timer); + RETURN_IF_ERROR(local_state._mutable_block.merge(*in_block)); + } if (local_state._mutable_block.rows() > std::numeric_limits::max()) { return Status::NotSupported("set operator do not support build table rows over:" + std::to_string(std::numeric_limits::max())); @@ -49,6 +51,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo } if (eos || local_state._mutable_block.allocated_bytes() >= BUILD_BLOCK_MAX_SIZE) { + SCOPED_TIMER(local_state._build_timer); build_block = local_state._mutable_block.to_block(); RETURN_IF_ERROR(_process_build_block(local_state, build_block, state)); local_state._mutable_block.clear(); @@ -152,6 +155,7 @@ Status SetSinkLocalState::init(RuntimeState* state, LocalSinkState RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _merge_block_timer = ADD_TIMER(_profile, "MergeBlocksTime"); _build_timer = ADD_TIMER(_profile, "BuildTime"); auto& parent = _parent->cast(); 
_shared_state->probe_finished_children_dependency[parent._cur_child_id] = _dependency; diff --git a/be/src/pipeline/exec/set_sink_operator.h b/be/src/pipeline/exec/set_sink_operator.h index 8e3c264f267f0b5..ba387d97b413600 100644 --- a/be/src/pipeline/exec/set_sink_operator.h +++ b/be/src/pipeline/exec/set_sink_operator.h @@ -49,14 +49,14 @@ class SetSinkLocalState final : public PipelineXSinkLocalState { private: friend class SetSinkOperatorX; - template - friend struct vectorized::HashTableBuild; - RuntimeProfile::Counter* _build_timer; // time to build hash table vectorized::MutableBlock _mutable_block; // every child has its result expr list vectorized::VExprContextSPtrs _child_exprs; vectorized::Arena _arena; + + RuntimeProfile::Counter* _merge_block_timer = nullptr; + RuntimeProfile::Counter* _build_timer = nullptr; }; template diff --git a/be/src/pipeline/exec/set_source_operator.cpp b/be/src/pipeline/exec/set_source_operator.cpp index 58958462c2f021a..ebcd13ddf14ce40 100644 --- a/be/src/pipeline/exec/set_source_operator.cpp +++ b/be/src/pipeline/exec/set_source_operator.cpp @@ -29,6 +29,8 @@ Status SetSourceLocalState::init(RuntimeState* state, LocalStateIn RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _get_data_timer = ADD_TIMER(_runtime_profile, "GetDataTime"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); _shared_state->probe_finished_children_dependency.resize( _parent->cast>()._child_quantity, nullptr); return Status::OK(); @@ -75,21 +77,26 @@ Status SetSourceOperatorX::get_block(RuntimeState* state, vectoriz auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); _create_mutable_cols(local_state, block); - auto st = std::visit( - [&](auto&& arg) -> Status { - using HashTableCtxType = std::decay_t; - if constexpr (!std::is_same_v) { - return _get_data_in_hashtable(local_state, arg, block, - state->batch_size(), eos); - } else { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - } - }, - local_state._shared_state->hash_table_variants->method_variant); - RETURN_IF_ERROR(st); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, - block->columns())); + { + SCOPED_TIMER(local_state._get_data_timer); + RETURN_IF_ERROR(std::visit( + [&](auto&& arg) -> Status { + using HashTableCtxType = std::decay_t; + if constexpr (!std::is_same_v) { + return _get_data_in_hashtable(local_state, arg, block, + state->batch_size(), eos); + } else { + LOG(FATAL) << "FATAL: uninited hash table"; + __builtin_unreachable(); + } + }, + local_state._shared_state->hash_table_variants->method_variant)); + } + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, + block->columns())); + } local_state.reached_limit(block, eos); return Status::OK(); } diff --git a/be/src/pipeline/exec/set_source_operator.h b/be/src/pipeline/exec/set_source_operator.h index ce3d0c52edf1d52..976ffde3bf23eae 100644 --- a/be/src/pipeline/exec/set_source_operator.h +++ b/be/src/pipeline/exec/set_source_operator.h @@ -46,6 +46,9 @@ class SetSourceLocalState final : public PipelineXLocalState { std::vector _mutable_cols; //record build column type vectorized::DataTypes _left_table_data_types; + + RuntimeProfile::Counter* _get_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; template diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp 
b/be/src/pipeline/exec/sort_sink_operator.cpp index 6d6684437b81249..faec4961af93b7f 100644 --- a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -32,6 +32,8 @@ Status SortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { SCOPED_TIMER(_init_timer); _sort_blocks_memory_usage = ADD_COUNTER_WITH_LEVEL(_profile, "MemoryUsageSortBlocks", TUnit::BYTES, 1); + _append_blocks_timer = ADD_TIMER(profile(), "AppendBlockTime"); + _update_runtime_predicate_timer = ADD_TIMER(profile(), "UpdateRuntimePredicateTime"); return Status::OK(); } @@ -119,7 +121,10 @@ Status SortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* in SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); if (in_block->rows() > 0) { - RETURN_IF_ERROR(local_state._shared_state->sorter->append_block(in_block)); + { + SCOPED_TIMER(local_state._append_blocks_timer); + RETURN_IF_ERROR(local_state._shared_state->sorter->append_block(in_block)); + } int64_t data_size = local_state._shared_state->sorter->data_size(); COUNTER_SET(local_state._sort_blocks_memory_usage, data_size); COUNTER_SET(local_state._memory_used_counter, data_size); @@ -128,6 +133,7 @@ Status SortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* in RETURN_IF_CANCELLED(state); if (state->get_query_ctx()->has_runtime_predicate(_node_id)) { + SCOPED_TIMER(local_state._update_runtime_predicate_timer); auto& predicate = state->get_query_ctx()->get_runtime_predicate(_node_id); if (predicate.enable()) { vectorized::Field new_top = local_state._shared_state->sorter->get_top_value(); diff --git a/be/src/pipeline/exec/sort_sink_operator.h b/be/src/pipeline/exec/sort_sink_operator.h index a5a24e371635fe7..6bf87164e71026e 100644 --- a/be/src/pipeline/exec/sort_sink_operator.h +++ b/be/src/pipeline/exec/sort_sink_operator.h @@ -46,6 +46,8 @@ class SortSinkLocalState : public PipelineXSinkLocalState { // topn top value vectorized::Field old_top {vectorized::Field::Types::Null}; + RuntimeProfile::Counter* _append_blocks_timer = nullptr; + RuntimeProfile::Counter* _update_runtime_predicate_timer = nullptr; }; class SortSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index 96de1f32be5a019..bc1e7fa8ae451f9 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -1144,7 +1144,7 @@ Status StreamingAggOperatorX::init(const TPlanNode& tnode, RuntimeState* state) RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( _pool, tnode.agg_node.aggregate_functions[i], tnode.agg_node.__isset.agg_sort_infos ? 
tnode.agg_node.agg_sort_infos[i] : dummy, - &evaluator)); + tnode.agg_node.grouping_exprs.empty(), &evaluator)); _aggregate_evaluators.push_back(evaluator); } diff --git a/be/src/pipeline/exec/table_function_operator.cpp b/be/src/pipeline/exec/table_function_operator.cpp index 38e69f7cb0e897e..c1621470f435b4f 100644 --- a/be/src/pipeline/exec/table_function_operator.cpp +++ b/be/src/pipeline/exec/table_function_operator.cpp @@ -32,6 +32,18 @@ namespace doris::pipeline { TableFunctionLocalState::TableFunctionLocalState(RuntimeState* state, OperatorXBase* parent) : PipelineXLocalState<>(state, parent), _child_block(vectorized::Block::create_unique()) {} +Status TableFunctionLocalState::init(RuntimeState* state, LocalStateInfo& info) { + RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _init_function_timer = ADD_TIMER(_runtime_profile, "InitTableFunctionTime"); + _process_rows_timer = ADD_TIMER(_runtime_profile, "ProcessRowsTime"); + _copy_data_timer = ADD_TIMER(_runtime_profile, "CopyDataTime"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); + _repeat_data_timer = ADD_TIMER(_runtime_profile, "RepeatDataTime"); + return Status::OK(); +} + Status TableFunctionLocalState::open(RuntimeState* state) { SCOPED_TIMER(PipelineXLocalState<>::exec_time_counter()); SCOPED_TIMER(PipelineXLocalState<>::_open_timer); @@ -59,6 +71,7 @@ void TableFunctionLocalState::_copy_output_slots( if (!_current_row_insert_times) { return; } + SCOPED_TIMER(_copy_data_timer); auto& p = _parent->cast(); for (auto index : p._output_slot_indexs) { auto src_column = _child_block->get_by_position(index).column; @@ -197,15 +210,18 @@ Status TableFunctionLocalState::get_expanded_block(RuntimeState* state, columns[index]->insert_many_defaults(row_size - columns[index]->size()); } - // 3. eval conjuncts - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(_conjuncts, output_block, - output_block->columns())); + { + SCOPED_TIMER(_filter_timer); // 3. 
eval conjuncts + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(_conjuncts, output_block, + output_block->columns())); + } *eos = _child_eos && _cur_child_offset == -1; return Status::OK(); } void TableFunctionLocalState::process_next_child_row() { + SCOPED_TIMER(_process_rows_timer); _cur_child_offset++; if (_cur_child_offset >= _child_block->rows()) { diff --git a/be/src/pipeline/exec/table_function_operator.h b/be/src/pipeline/exec/table_function_operator.h index 75b1608fad7112a..81160acb7f7611c 100644 --- a/be/src/pipeline/exec/table_function_operator.h +++ b/be/src/pipeline/exec/table_function_operator.h @@ -37,6 +37,7 @@ class TableFunctionLocalState final : public PipelineXLocalState<> { TableFunctionLocalState(RuntimeState* state, OperatorXBase* parent); ~TableFunctionLocalState() override = default; + Status init(RuntimeState* state, LocalStateInfo& infos) override; Status open(RuntimeState* state) override; Status close(RuntimeState* state) override { for (auto* fn : _fns) { @@ -67,6 +68,12 @@ class TableFunctionLocalState final : public PipelineXLocalState<> { std::unique_ptr _child_block; int _current_row_insert_times = 0; bool _child_eos = false; + + RuntimeProfile::Counter* _init_function_timer = nullptr; + RuntimeProfile::Counter* _process_rows_timer = nullptr; + RuntimeProfile::Counter* _copy_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; + RuntimeProfile::Counter* _repeat_data_timer = nullptr; }; class TableFunctionOperatorX final : public StatefulOperatorX { @@ -93,6 +100,7 @@ class TableFunctionOperatorX final : public StatefulOperatorXprocess_init(input_block, state)); } local_state.process_next_child_row(); diff --git a/be/src/pipeline/exec/union_sink_operator.cpp b/be/src/pipeline/exec/union_sink_operator.cpp index 288fc131037fabb..8467eeb1d5467a6 100644 --- a/be/src/pipeline/exec/union_sink_operator.cpp +++ b/be/src/pipeline/exec/union_sink_operator.cpp @@ -32,6 +32,7 @@ Status UnionSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _expr_timer = ADD_TIMER(_profile, "ExprTime"); auto& p = _parent->cast(); _shared_state->data_queue.set_sink_dependency(_dependency, p._cur_child_id); return Status::OK(); diff --git a/be/src/pipeline/exec/union_sink_operator.h b/be/src/pipeline/exec/union_sink_operator.h index f939950143ae920..aa94ed9a73038fb 100644 --- a/be/src/pipeline/exec/union_sink_operator.h +++ b/be/src/pipeline/exec/union_sink_operator.h @@ -55,6 +55,7 @@ class UnionSinkLocalState final : public PipelineXSinkLocalState { @@ -136,6 +137,7 @@ class UnionSinkOperatorX final : public DataSinkOperatorX { Status materialize_block(RuntimeState* state, vectorized::Block* src_block, int child_idx, vectorized::Block* res_block) { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state._expr_timer); const auto& child_exprs = local_state._child_expr; vectorized::ColumnsWithTypeAndName colunms; for (size_t i = 0; i < child_exprs.size(); ++i) { diff --git a/be/src/pipeline/pipeline.h b/be/src/pipeline/pipeline.h index 98e52ec5271613b..b969186b178bf7e 100644 --- a/be/src/pipeline/pipeline.h +++ b/be/src/pipeline/pipeline.h @@ -44,8 +44,7 @@ class Pipeline : public std::enable_shared_from_this { friend class PipelineFragmentContext; public: - explicit Pipeline(PipelineId pipeline_id, int num_tasks, - std::weak_ptr context, int num_tasks_of_parent) + explicit Pipeline(PipelineId pipeline_id, int num_tasks, int 
num_tasks_of_parent) : _pipeline_id(pipeline_id), _num_tasks(num_tasks), _num_tasks_of_parent(num_tasks_of_parent) { @@ -86,7 +85,9 @@ class Pipeline : public std::enable_shared_from_this { std::vector>& children() { return _children; } void set_children(std::shared_ptr child) { _children.push_back(child); } - void set_children(std::vector> children) { _children = children; } + void set_children(std::vector> children) { + _children = std::move(children); + } void incr_created_tasks(int i, PipelineTask* task) { _num_tasks_created++; diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index bd45016adf51e64..f05a8469ce5a180 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -215,7 +215,6 @@ PipelinePtr PipelineFragmentContext::add_pipeline(PipelinePtr parent, int idx) { PipelineId id = _next_pipeline_id++; auto pipeline = std::make_shared( id, parent ? std::min(parent->num_tasks(), _num_instances) : _num_instances, - std::dynamic_pointer_cast(shared_from_this()), parent ? parent->num_tasks() : _num_instances); if (idx >= 0) { _pipelines.insert(_pipelines.begin() + idx, pipeline); @@ -1198,6 +1197,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo std::stringstream error_msg; bool enable_query_cache = request.fragment.__isset.query_cache_param; + bool fe_with_old_version = false; switch (tnode.node_type) { case TPlanNodeType::OLAP_SCAN_NODE: { op.reset(new OlapScanOperatorX( @@ -1205,6 +1205,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo enable_query_cache ? request.fragment.query_cache_param : TQueryCacheParam {})); RETURN_IF_ERROR(cur_pipe->add_operator( op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::GROUP_COMMIT_SCAN_NODE: { @@ -1215,6 +1216,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op.reset(new GroupCommitOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); RETURN_IF_ERROR(cur_pipe->add_operator( op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case doris::TPlanNodeType::JDBC_SCAN_NODE: { @@ -1227,12 +1229,14 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo "Jdbc scan node is disabled, you can change be config enable_java_support " "to true and restart be."); } + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case doris::TPlanNodeType::FILE_SCAN_NODE: { op.reset(new FileScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); RETURN_IF_ERROR(cur_pipe->add_operator( op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::ES_SCAN_NODE: @@ -1240,6 +1244,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op.reset(new EsScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); RETURN_IF_ERROR(cur_pipe->add_operator( op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::EXCHANGE_NODE: { @@ -1248,6 +1253,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op.reset(new ExchangeSourceOperatorX(pool, tnode, next_operator_id(), descs, num_senders)); RETURN_IF_ERROR(cur_pipe->add_operator( op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::AGGREGATION_NODE: { @@ -1609,6 +1615,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op.reset(new DataGenSourceOperatorX(pool, tnode, next_operator_id(), descs)); RETURN_IF_ERROR(cur_pipe->add_operator( op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::SCHEMA_SCAN_NODE: { @@ -1633,6 +1640,10 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo return Status::InternalError("Unsupported exec type in pipeline: {}", print_plan_node_type(tnode.node_type)); } + if (request.__isset.parallel_instances && fe_with_old_version) { + cur_pipe->set_num_tasks(request.parallel_instances); + op->set_serial_operator(); + } return Status::OK(); } diff --git a/be/src/pipeline/pipeline_task.cpp b/be/src/pipeline/pipeline_task.cpp index e06b8028c9c7308..a8213b31ba8f47c 100644 --- a/be/src/pipeline/pipeline_task.cpp +++ b/be/src/pipeline/pipeline_task.cpp @@ -247,6 +247,12 @@ bool PipelineTask::_wait_to_start() { } bool PipelineTask::_is_blocked() { + Defer defer([this] { + if (_blocked_dep != nullptr) { + _task_profile->add_info_string("TaskState", "Blocked"); + _task_profile->add_info_string("BlockedByDependency", _blocked_dep->name()); + } + }); // `_dry_run = true` means we do not need data from source operator. if (!_dry_run) { for (int i = _read_dependencies.size() - 1; i >= 0; i--) { @@ -328,6 +334,8 @@ Status PipelineTask::execute(bool* eos) { RETURN_IF_ERROR(_open()); } + _task_profile->add_info_string("TaskState", "Runnable"); + _task_profile->add_info_string("BlockedByDependency", ""); while (!_fragment_context->is_canceled()) { if (_is_blocked()) { return Status::OK(); @@ -391,6 +399,7 @@ Status PipelineTask::execute(bool* eos) { *eos = status.is() ? 
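The pipeline_task.cpp hunk above relies on Doris's Defer helper so the TaskState / BlockedByDependency info strings are recorded on every early return from _is_blocked(). A minimal stand-alone sketch of such a scope guard, assuming Defer simply runs its lambda at scope exit; the function and names below are illustrative, not the real ones.

#include <cstdio>
#include <utility>

// Generic scope guard: runs the stored callable when the guard is destroyed,
// regardless of which return path is taken.
template <typename F>
class Defer {
public:
    explicit Defer(F f) : _f(std::move(f)) {}
    ~Defer() { _f(); }
    Defer(const Defer&) = delete;
    Defer& operator=(const Defer&) = delete;

private:
    F _f;
};

bool is_blocked(bool has_blocked_dep) {
    const char* state = "Runnable";
    Defer defer([&] { std::printf("TaskState: %s\n", state); }); // runs on every return
    if (has_blocked_dep) {
        state = "Blocked";
        return true;
    }
    return false;
}

int main() {
    is_blocked(true);
    is_blocked(false);
    return 0;
}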
true : *eos; if (*eos) { // just return, the scheduler will do finish work _eos = true; + _task_profile->add_info_string("TaskState", "Finished"); return Status::OK(); } } diff --git a/be/src/pipeline/task_scheduler.h b/be/src/pipeline/task_scheduler.h index 6fc6ad8d6f2e484..4caceca20d4a449 100644 --- a/be/src/pipeline/task_scheduler.h +++ b/be/src/pipeline/task_scheduler.h @@ -36,17 +36,14 @@ namespace doris { class ExecEnv; class ThreadPool; - -namespace pipeline { -class TaskQueue; -} // namespace pipeline } // namespace doris namespace doris::pipeline { +class TaskQueue; class TaskScheduler { public: - TaskScheduler(ExecEnv* exec_env, std::shared_ptr task_queue, std::string name, + TaskScheduler(std::shared_ptr task_queue, std::string name, CgroupCpuCtl* cgroup_cpu_ctl) : _task_queue(std::move(task_queue)), _shutdown(false), diff --git a/be/src/runtime/descriptors.cpp b/be/src/runtime/descriptors.cpp index cc6f9050ac39153..bea11feb916f108 100644 --- a/be/src/runtime/descriptors.cpp +++ b/be/src/runtime/descriptors.cpp @@ -286,8 +286,7 @@ JdbcTableDescriptor::JdbcTableDescriptor(const TTableDescriptor& tdesc) _connection_pool_max_size(tdesc.jdbcTable.connection_pool_max_size), _connection_pool_max_wait_time(tdesc.jdbcTable.connection_pool_max_wait_time), _connection_pool_max_life_time(tdesc.jdbcTable.connection_pool_max_life_time), - _connection_pool_keep_alive(tdesc.jdbcTable.connection_pool_keep_alive), - _enable_connection_pool(tdesc.jdbcTable.enable_connection_pool) {} + _connection_pool_keep_alive(tdesc.jdbcTable.connection_pool_keep_alive) {} std::string JdbcTableDescriptor::debug_string() const { fmt::memory_buffer buf; @@ -295,14 +294,13 @@ std::string JdbcTableDescriptor::debug_string() const { buf, "JDBCTable({} ,_jdbc_catalog_id = {}, _jdbc_resource_name={} ,_jdbc_driver_url={} " ",_jdbc_driver_class={} ,_jdbc_driver_checksum={} ,_jdbc_url={} " - ",_jdbc_table_name={} ,_jdbc_user={} ,_jdbc_passwd={} " - ",_enable_connection_pool={},_connection_pool_min_size={} " + ",_jdbc_table_name={} ,_jdbc_user={} ,_jdbc_passwd={} ,_connection_pool_min_size={} " ",_connection_pool_max_size={} ,_connection_pool_max_wait_time={} " ",_connection_pool_max_life_time={} ,_connection_pool_keep_alive={})", TableDescriptor::debug_string(), _jdbc_catalog_id, _jdbc_resource_name, _jdbc_driver_url, _jdbc_driver_class, _jdbc_driver_checksum, _jdbc_url, - _jdbc_table_name, _jdbc_user, _jdbc_passwd, _enable_connection_pool, - _connection_pool_min_size, _connection_pool_max_size, _connection_pool_max_wait_time, + _jdbc_table_name, _jdbc_user, _jdbc_passwd, _connection_pool_min_size, + _connection_pool_max_size, _connection_pool_max_wait_time, _connection_pool_max_life_time, _connection_pool_keep_alive); return fmt::to_string(buf); } diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h index b5797b0d016d751..b807c5675430383 100644 --- a/be/src/runtime/descriptors.h +++ b/be/src/runtime/descriptors.h @@ -319,7 +319,6 @@ class JdbcTableDescriptor : public TableDescriptor { int32_t connection_pool_max_wait_time() const { return _connection_pool_max_wait_time; } int32_t connection_pool_max_life_time() const { return _connection_pool_max_life_time; } bool connection_pool_keep_alive() const { return _connection_pool_keep_alive; } - bool enable_connection_pool() const { return _enable_connection_pool; } private: int64_t _jdbc_catalog_id; @@ -336,7 +335,6 @@ class JdbcTableDescriptor : public TableDescriptor { int32_t _connection_pool_max_wait_time; int32_t _connection_pool_max_life_time; bool 
_connection_pool_keep_alive; - bool _enable_connection_pool; }; class TupleDescriptor { diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index ff6205bf55e5d03..3cddcd60b8bd8e6 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -383,7 +383,7 @@ Status ExecEnv::init_pipeline_task_scheduler() { // TODO pipeline workload group combie two blocked schedulers. auto t_queue = std::make_shared(executors_size); _without_group_task_scheduler = - new pipeline::TaskScheduler(this, t_queue, "PipeNoGSchePool", nullptr); + new pipeline::TaskScheduler(t_queue, "PipeNoGSchePool", nullptr); RETURN_IF_ERROR(_without_group_task_scheduler->start()); _runtime_filter_timer_queue = new doris::pipeline::RuntimeFilterTimerQueue(); @@ -442,8 +442,11 @@ void ExecEnv::init_file_cache_factory(std::vector& cache_paths } for (const auto& status : cache_status) { if (!status.ok()) { - LOG(FATAL) << "failed to init file cache, err: " << status; - exit(-1); + if (!doris::config::ignore_broken_disk) { + LOG(FATAL) << "failed to init file cache, err: " << status; + exit(-1); + } + LOG(WARNING) << "failed to init file cache, err: " << status; } } } @@ -676,7 +679,7 @@ void ExecEnv::destroy() { SAFE_STOP(_write_cooldown_meta_executors); // StorageEngine must be destoried before _page_no_cache_mem_tracker.reset and _cache_manager destory - // shouldn't use SAFE_STOP. otherwise will lead to twice stop. + SAFE_STOP(_storage_engine); _storage_engine.reset(); SAFE_STOP(_spill_stream_mgr); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 7cbb5e0f4adf6ef..0ecd22769155003 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -668,7 +668,7 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo // This may be a first fragment request of the query. // Create the query fragments context. 
query_ctx = QueryContext::create_shared(query_id, _exec_env, params.query_options, - params.coord, pipeline, params.is_nereids, + params.coord, params.is_nereids, params.current_connect_fe, query_source); SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(query_ctx->query_mem_tracker); RETURN_IF_ERROR(DescriptorTbl::create(&(query_ctx->obj_pool), params.desc_tbl, @@ -1138,7 +1138,6 @@ Status FragmentMgr::exec_external_plan_fragment(const TScanOpenParams& params, Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, butil::IOBufAsZeroCopyInputStream* attach_data) { - bool is_pipeline = request->has_is_pipeline() && request->is_pipeline(); int64_t start_apply = MonotonicMillis(); std::shared_ptr pip_context; @@ -1150,22 +1149,18 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, { std::unique_lock lock(_lock); for (auto fragment_id : fragment_ids) { - if (is_pipeline) { - auto iter = _pipeline_map.find( - {UniqueId(request->query_id()).to_thrift(), fragment_id}); - if (iter == _pipeline_map.end()) { - continue; - } - pip_context = iter->second; - - DCHECK(pip_context != nullptr); - runtime_filter_mgr = pip_context->get_query_ctx()->runtime_filter_mgr(); - query_thread_context = {pip_context->get_query_ctx()->query_id(), - pip_context->get_query_ctx()->query_mem_tracker, - pip_context->get_query_ctx()->workload_group()}; - } else { - return Status::InternalError("Non-pipeline is disabled!"); + auto iter = + _pipeline_map.find({UniqueId(request->query_id()).to_thrift(), fragment_id}); + if (iter == _pipeline_map.end()) { + continue; } + pip_context = iter->second; + + DCHECK(pip_context != nullptr); + runtime_filter_mgr = pip_context->get_query_ctx()->runtime_filter_mgr(); + query_thread_context = {pip_context->get_query_ctx()->query_id(), + pip_context->get_query_ctx()->query_mem_tracker, + pip_context->get_query_ctx()->workload_group()}; break; } } diff --git a/be/src/runtime/load_stream_writer.cpp b/be/src/runtime/load_stream_writer.cpp index 37243fab14bdb35..2e987edc7bd3bad 100644 --- a/be/src/runtime/load_stream_writer.cpp +++ b/be/src/runtime/load_stream_writer.cpp @@ -201,7 +201,7 @@ Status LoadStreamWriter::add_segment(uint32_t segid, const SegmentStatistics& st } DBUG_EXECUTE_IF("LoadStreamWriter.add_segment.size_not_match", { segment_file_size++; }); - if (segment_file_size + inverted_file_size != stat.data_size) { + if (segment_file_size != stat.data_size) { return Status::Corruption( "add_segment failed, segment stat {} does not match, file size={}, inverted file " "size={}, stat.data_size={}, tablet id={}", diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index 80f59d7101d3c70..0f30c0255a2aab5 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -74,12 +74,11 @@ const std::string toString(QuerySource queryType) { QueryContext::QueryContext(TUniqueId query_id, ExecEnv* exec_env, const TQueryOptions& query_options, TNetworkAddress coord_addr, - bool is_pipeline, bool is_nereids, TNetworkAddress current_connect_fe, + bool is_nereids, TNetworkAddress current_connect_fe, QuerySource query_source) : _timeout_second(-1), _query_id(query_id), _exec_env(exec_env), - _is_pipeline(is_pipeline), _is_nereids(is_nereids), _query_options(query_options), _query_source(query_source) { @@ -180,8 +179,7 @@ QueryContext::~QueryContext() { } } - //TODO: check if pipeline and tracing both enabled - if (_is_pipeline && ExecEnv::GetInstance()->pipeline_tracer_context()->enabled()) [[unlikely]] { + if 
(ExecEnv::GetInstance()->pipeline_tracer_context()->enabled()) [[unlikely]] { try { ExecEnv::GetInstance()->pipeline_tracer_context()->end_query(_query_id, group_id); } catch (std::exception& e) { diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h index 1a05b784d5bc5c4..ef753ee62259b45 100644 --- a/be/src/runtime/query_context.h +++ b/be/src/runtime/query_context.h @@ -79,8 +79,8 @@ class QueryContext { public: QueryContext(TUniqueId query_id, ExecEnv* exec_env, const TQueryOptions& query_options, - TNetworkAddress coord_addr, bool is_pipeline, bool is_nereids, - TNetworkAddress current_connect_fe, QuerySource query_type); + TNetworkAddress coord_addr, bool is_nereids, TNetworkAddress current_connect_fe, + QuerySource query_type); ~QueryContext(); @@ -246,7 +246,6 @@ class QueryContext { ExecEnv* _exec_env = nullptr; MonotonicStopWatch _query_watcher; int64_t _bytes_limit = 0; - bool _is_pipeline = false; bool _is_nereids = false; std::atomic _running_big_mem_op_num = 0; diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 08a229c0ecf72b6..1a238787207b173 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -29,6 +29,7 @@ #include #include +#include "common/config.h" #include "common/logging.h" #include "common/status.h" #include "exprs/bloom_filter_func.h" @@ -343,8 +344,10 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz auto* pquery_id = closure->request_->mutable_query_id(); pquery_id->set_hi(_state->query_id.hi()); pquery_id->set_lo(_state->query_id.lo()); - closure->cntl_->set_timeout_ms(std::min(3600, _state->execution_timeout) * 1000); - closure->cntl_->ignore_eovercrowded(); + closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + if (config::execution_ignore_eovercrowded) { + closure->cntl_->ignore_eovercrowded(); + } closure->request_->set_filter_id(filter_id); closure->request_->set_filter_size(cnt_val->global_size); @@ -449,15 +452,17 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ DummyBrpcCallback::create_shared()); closure->request_->set_filter_id(request->filter_id()); - closure->request_->set_is_pipeline(request->has_is_pipeline() && - request->is_pipeline()); closure->request_->set_merge_time(merge_time); *closure->request_->mutable_query_id() = request->query_id(); if (has_attachment) { closure->cntl_->request_attachment().append(request_attachment); } - closure->cntl_->set_timeout_ms(std::min(3600, _state->execution_timeout) * 1000); - closure->cntl_->ignore_eovercrowded(); + + closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + if (config::execution_ignore_eovercrowded) { + closure->cntl_->ignore_eovercrowded(); + } + // set fragment-id if (target.__isset.target_fragment_ids) { for (auto& target_fragment_id : target.target_fragment_ids) { diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 34ce79ec7a749a9..abc823bc25b291d 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -38,6 +38,7 @@ #include "agent/be_exec_version_manager.h" #include "cctz/time_zone.h" #include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" #include "common/factory_creator.h" #include "common/status.h" #include "gutil/integral_types.h" @@ -51,6 +52,10 @@ namespace doris { class IRuntimeFilter; +inline int32_t get_execution_rpc_timeout_ms(int32_t 
execution_timeout_sec) { + return std::min(config::execution_max_rpc_timeout_sec, execution_timeout_sec) * 1000; +} + namespace pipeline { class PipelineXLocalStateBase; class PipelineXSinkLocalStateBase; diff --git a/be/src/runtime/workload_group/workload_group.cpp b/be/src/runtime/workload_group/workload_group.cpp index 0488e9ec83c6c25..84016132da9b5ad 100644 --- a/be/src/runtime/workload_group/workload_group.cpp +++ b/be/src/runtime/workload_group/workload_group.cpp @@ -470,8 +470,8 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e } auto task_queue = std::make_shared(executors_size); std::unique_ptr pipeline_task_scheduler = - std::make_unique(exec_env, std::move(task_queue), - "Pipe_" + tg_name, cg_cpu_ctl_ptr); + std::make_unique(std::move(task_queue), "Pipe_" + tg_name, + cg_cpu_ctl_ptr); Status ret = pipeline_task_scheduler->start(); if (ret.ok()) { _task_sched = std::move(pipeline_task_scheduler); diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index 7704d07b6f94770..e7b920796a1b985 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -387,7 +387,7 @@ void HttpService::register_local_handler(StorageEngine& engine) { _ev_http_server->register_handler(HttpMethod::POST, "/api/pad_rowset", pad_rowset_action); ReportAction* report_tablet_action = _pool.add(new ReportAction( - _env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN, "REPORT_OLAP_TABLE")); + _env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN, "REPORT_OLAP_TABLET")); _ev_http_server->register_handler(HttpMethod::GET, "/api/report/tablet", report_tablet_action); ReportAction* report_disk_action = _pool.add(new ReportAction( diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 8217bd11bb91563..89b43ec52235013 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -886,13 +886,10 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController Status PInternalService::_tablet_fetch_data(const PTabletKeyLookupRequest* request, PTabletKeyLookupResponse* response) { - PointQueryExecutor lookup_util; - RETURN_IF_ERROR(lookup_util.init(request, response)); - RETURN_IF_ERROR(lookup_util.lookup_up()); - if (VLOG_DEBUG_IS_ON) { - VLOG_DEBUG << lookup_util.print_profile(); - } - LOG_EVERY_N(INFO, 500) << lookup_util.print_profile(); + PointQueryExecutor executor; + RETURN_IF_ERROR(executor.init(request, response)); + RETURN_IF_ERROR(executor.lookup_up()); + executor.print_profile(); return Status::OK(); } @@ -1159,7 +1156,10 @@ void PInternalService::fetch_remote_tablet_schema(google::protobuf::RpcControlle LOG(WARNING) << "tablet does not exist, tablet id is " << tablet_id; continue; } - tablet_schemas.push_back(res.value()->merged_tablet_schema()); + auto schema = res.value()->merged_tablet_schema(); + if (schema != nullptr) { + tablet_schemas.push_back(schema); + } } if (!tablet_schemas.empty()) { // merge all diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 9719a672b8dff49..74dab4663403301 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ #include "olap/olap_tuple.h" #include "olap/row_cursor.h" #include "olap/rowset/beta_rowset.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/storage_engine.h" #include "olap/tablet_manager.h" #include 
"olap/tablet_schema.h" @@ -313,34 +315,48 @@ Status PointQueryExecutor::lookup_up() { return Status::OK(); } -std::string PointQueryExecutor::print_profile() { +void PointQueryExecutor::print_profile() { auto init_us = _profile_metrics.init_ns.value() / 1000; auto init_key_us = _profile_metrics.init_key_ns.value() / 1000; auto lookup_key_us = _profile_metrics.lookup_key_ns.value() / 1000; auto lookup_data_us = _profile_metrics.lookup_data_ns.value() / 1000; auto output_data_us = _profile_metrics.output_data_ns.value() / 1000; + auto load_segments_key_us = _profile_metrics.load_segment_key_stage_ns.value() / 1000; + auto load_segments_data_us = _profile_metrics.load_segment_data_stage_ns.value() / 1000; auto total_us = init_us + lookup_key_us + lookup_data_us + output_data_us; auto read_stats = _profile_metrics.read_stats; - return fmt::format( - "" + const std::string stats_str = fmt::format( "[lookup profile:{}us] init:{}us, init_key:{}us," - "" - "" - "lookup_key:{}us, lookup_data:{}us, output_data:{}us, hit_lookup_cache:{}" - "" - "" + " lookup_key:{}us, load_segments_key:{}us, lookup_data:{}us, load_segments_data:{}us," + " output_data:{}us, " + "hit_lookup_cache:{}" ", is_binary_row:{}, output_columns:{}, total_keys:{}, row_cache_hits:{}" ", hit_cached_pages:{}, total_pages_read:{}, compressed_bytes_read:{}, " "io_latency:{}ns, " "uncompressed_bytes_read:{}, result_data_bytes:{}, row_hits:{}" - ", rs_column_uid:{}" - "", - total_us, init_us, init_key_us, lookup_key_us, lookup_data_us, output_data_us, - _profile_metrics.hit_lookup_cache, _binary_row_format, _reusable->output_exprs().size(), - _row_read_ctxs.size(), _profile_metrics.row_cache_hits, read_stats.cached_pages_num, + ", rs_column_uid:{}, bytes_read_from_local:{}, bytes_read_from_remote:{}, " + "local_io_timer:{}, remote_io_timer:{}, local_write_timer:{}", + total_us, init_us, init_key_us, lookup_key_us, load_segments_key_us, lookup_data_us, + load_segments_data_us, output_data_us, _profile_metrics.hit_lookup_cache, + _binary_row_format, _reusable->output_exprs().size(), _row_read_ctxs.size(), + _profile_metrics.row_cache_hits, read_stats.cached_pages_num, read_stats.total_pages_num, read_stats.compressed_bytes_read, read_stats.io_ns, read_stats.uncompressed_bytes_read, _profile_metrics.result_data_bytes, _row_hits, - _reusable->rs_column_uid()); + _reusable->rs_column_uid(), + _profile_metrics.read_stats.file_cache_stats.bytes_read_from_local, + _profile_metrics.read_stats.file_cache_stats.bytes_read_from_remote, + _profile_metrics.read_stats.file_cache_stats.local_io_timer, + _profile_metrics.read_stats.file_cache_stats.remote_io_timer, + _profile_metrics.read_stats.file_cache_stats.write_cache_io_timer); + + constexpr static int kSlowThreholdUs = 50 * 1000; // 50ms + if (total_us > kSlowThreholdUs) { + LOG(WARNING) << "slow query, " << stats_str; + } else if (VLOG_DEBUG_IS_ON) { + VLOG_DEBUG << stats_str; + } else { + LOG_EVERY_N(INFO, 1000) << stats_str; + } } Status PointQueryExecutor::_init_keys(const PTabletKeyLookupRequest* request) { @@ -380,6 +396,17 @@ Status PointQueryExecutor::_lookup_row_key() { specified_rowsets = _tablet->get_rowset_by_ids(nullptr); } std::vector> segment_caches(specified_rowsets.size()); + // init segment_cache + { + SCOPED_TIMER(&_profile_metrics.load_segment_key_stage_ns); + for (size_t i = 0; i < specified_rowsets.size(); i++) { + auto& rs = specified_rowsets[i]; + segment_caches[i] = std::make_unique(); + RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( + 
std::static_pointer_cast(rs), segment_caches[i].get(), true, true, + &_profile_metrics.read_stats)); + } + } for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { RowLocation location; if (!config::disable_storage_row_cache) { @@ -396,7 +423,8 @@ Status PointQueryExecutor::_lookup_row_key() { auto rowset_ptr = std::make_unique(); st = (_tablet->lookup_row_key(_row_read_ctxs[i]._primary_key, nullptr, false, specified_rowsets, &location, INT32_MAX /*rethink?*/, - segment_caches, rowset_ptr.get(), false)); + segment_caches, rowset_ptr.get(), false, nullptr, + &_profile_metrics.read_stats)); if (st.is()) { continue; } @@ -459,7 +487,11 @@ Status PointQueryExecutor::_lookup_row_data() { BetaRowsetSharedPtr rowset = std::static_pointer_cast(_tablet->get_rowset(row_loc.rowset_id)); SegmentCacheHandle segment_cache; - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + { + SCOPED_TIMER(&_profile_metrics.load_segment_data_stage_ns); + RETURN_IF_ERROR( + SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + } // find segment auto it = std::find_if(segment_cache.get_segments().cbegin(), segment_cache.get_segments().cend(), diff --git a/be/src/service/point_query_executor.h b/be/src/service/point_query_executor.h index b22dc5bfd1d73f3..89f4ecff9b137ac 100644 --- a/be/src/service/point_query_executor.h +++ b/be/src/service/point_query_executor.h @@ -276,12 +276,16 @@ struct Metrics { init_key_ns(TUnit::TIME_NS), lookup_key_ns(TUnit::TIME_NS), lookup_data_ns(TUnit::TIME_NS), - output_data_ns(TUnit::TIME_NS) {} + output_data_ns(TUnit::TIME_NS), + load_segment_key_stage_ns(TUnit::TIME_NS), + load_segment_data_stage_ns(TUnit::TIME_NS) {} RuntimeProfile::Counter init_ns; RuntimeProfile::Counter init_key_ns; RuntimeProfile::Counter lookup_key_ns; RuntimeProfile::Counter lookup_data_ns; RuntimeProfile::Counter output_data_ns; + RuntimeProfile::Counter load_segment_key_stage_ns; + RuntimeProfile::Counter load_segment_data_stage_ns; OlapReaderStatistics read_stats; size_t row_cache_hits = 0; bool hit_lookup_cache = false; @@ -297,7 +301,9 @@ class PointQueryExecutor { Status lookup_up(); - std::string print_profile(); + void print_profile(); + + const OlapReaderStatistics& read_stats() const { return _read_stats; } private: Status _init_keys(const PTabletKeyLookupRequest* request); diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp index 2c6ed52ddde65fa..0cbb6bcd0c8916d 100644 --- a/be/src/util/arrow/row_batch.cpp +++ b/be/src/util/arrow/row_batch.cpp @@ -157,17 +157,8 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* field, - const std::string& timezone) { - std::shared_ptr type; - RETURN_IF_ERROR(convert_to_arrow_type(desc->type(), &type, timezone)); - *field = arrow::field(desc->col_name(), type, desc->is_nullable()); - return Status::OK(); -} - -Status convert_block_arrow_schema(const vectorized::Block& block, - std::shared_ptr* result, - const std::string& timezone) { +Status get_arrow_schema(const vectorized::Block& block, std::shared_ptr* result, + const std::string& timezone) { std::vector> fields; for (const auto& type_and_name : block) { std::shared_ptr arrow_type; @@ -180,21 +171,6 @@ Status convert_block_arrow_schema(const vectorized::Block& block, return Status::OK(); } -Status convert_to_arrow_schema(const RowDescriptor& row_desc, - std::shared_ptr* result, - const std::string& timezone) { - std::vector> fields; - for (auto tuple_desc : row_desc.tuple_descriptors()) { - for (auto desc 
: tuple_desc->slots()) { - std::shared_ptr field; - RETURN_IF_ERROR(convert_to_arrow_field(desc, &field, timezone)); - fields.push_back(field); - } - } - *result = arrow::schema(std::move(fields)); - return Status::OK(); -} - Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, std::shared_ptr* result, const std::string& timezone) { diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h index 9a33719a1cfbcce..3993003baf6e95c 100644 --- a/be/src/util/arrow/row_batch.h +++ b/be/src/util/arrow/row_batch.h @@ -45,12 +45,8 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* result, const std::string& timezone); - -Status convert_block_arrow_schema(const vectorized::Block& block, - std::shared_ptr* result, - const std::string& timezone); +Status get_arrow_schema(const vectorized::Block& block, std::shared_ptr* result, + const std::string& timezone); Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, std::shared_ptr* result, diff --git a/be/src/util/mysql_row_buffer.cpp b/be/src/util/mysql_row_buffer.cpp index 4823920508a9404..3e20a2d9de72fec 100644 --- a/be/src/util/mysql_row_buffer.cpp +++ b/be/src/util/mysql_row_buffer.cpp @@ -87,9 +87,9 @@ MysqlRowBuffer::MysqlRowBuffer() _len_pos(0) {} template -void MysqlRowBuffer::start_binary_row(uint32_t num_cols) { +void MysqlRowBuffer::start_binary_row(uint64_t num_cols) { assert(is_binary_format); - int bit_fields = (num_cols + 9) / 8; + auto bit_fields = (num_cols + 9) / 8; reserve(bit_fields + 1); memset(_pos, 0, 1 + bit_fields); _pos += bit_fields + 1; diff --git a/be/src/util/mysql_row_buffer.h b/be/src/util/mysql_row_buffer.h index b740efa7764ed1f..50b17c91c170ca8 100644 --- a/be/src/util/mysql_row_buffer.h +++ b/be/src/util/mysql_row_buffer.h @@ -62,7 +62,7 @@ class MysqlRowBuffer { // Prepare for binary row buffer // init bitmap - void start_binary_row(uint32_t num_cols); + void start_binary_row(uint64_t num_cols); // TODO(zhaochun): add signed/unsigned support int push_tinyint(int8_t data); diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index f67fe14fa426003..39de0324d1415fb 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -20,6 +20,8 @@ #pragma once +#include "common/exception.h" +#include "common/status.h" #include "util/defer_op.h" #include "vec/columns/column_complex.h" #include "vec/columns/column_string.h" @@ -30,6 +32,7 @@ #include "vec/core/column_numbers.h" #include "vec/core/field.h" #include "vec/core/types.h" +#include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_string.h" namespace doris::vectorized { @@ -222,6 +225,10 @@ class IAggregateFunction { virtual AggregateFunctionPtr transmit_to_stable() { return nullptr; } + /// Verify function signature + virtual Status verify_result_type(const bool without_key, const DataTypes& argument_types, + const DataTypePtr result_type) const = 0; + protected: DataTypes argument_types; int version {}; @@ -494,6 +501,43 @@ class IAggregateFunctionHelper : public IAggregateFunction { arena); assert_cast(this)->merge(place, rhs, arena); } + + Status verify_result_type(const bool without_key, const DataTypes& argument_types_with_nullable, + const DataTypePtr result_type_with_nullable) const override { + DataTypePtr function_result_type = assert_cast(this)->get_return_type(); + + if 
(function_result_type->equals(*result_type_with_nullable)) { + return Status::OK(); + } + + if (!remove_nullable(function_result_type) + ->equals(*remove_nullable(result_type_with_nullable))) { + return Status::InternalError( + "Result type of {} is not matched, planner expect {}, but get {}, with group " + "by: " + "{}", + get_name(), result_type_with_nullable->get_name(), + function_result_type->get_name(), !without_key); + } + + if (without_key == true) { + if (result_type_with_nullable->is_nullable()) { + // This branch is decicated for NullableAggregateFunction. + // When they are executed without group by key, the result from planner will be AlwaysNullable + // since Planer does not know whether there are any invalid input at runtime, if so, the result + // should be Null, so the result type must be nullable. + // Backend will wrap a ColumnNullable in this situation. For example: AggLocalState::_get_without_key_result + return Status::OK(); + } + } + + // Executed with group by key, result type must be exactly same with the return type from Planner. + return Status::InternalError( + "Result type of {} is not matched, planner expect {}, but get {}, with group by: " + "{}", + get_name(), result_type_with_nullable->get_name(), function_result_type->get_name(), + !without_key); + } }; /// Implements several methods for manipulation with data. T - type of structure with data for aggregation. diff --git a/be/src/vec/common/columns_hashing.h b/be/src/vec/common/columns_hashing.h index 4bdbf51444fbbf8..6a59c5964e47a4e 100644 --- a/be/src/vec/common/columns_hashing.h +++ b/be/src/vec/common/columns_hashing.h @@ -38,11 +38,6 @@ namespace doris::vectorized { using Sizes = std::vector; -inline bool has_nullable_key(const std::vector& data_types) { - return std::ranges::any_of(data_types.begin(), data_types.end(), - [](auto t) { return t->is_nullable(); }); -} - inline Sizes get_key_sizes(const std::vector& data_types) { Sizes key_sizes; for (const auto& data_type : data_types) { @@ -101,17 +96,14 @@ struct HashMethodSerialized }; /// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits. -template +template struct HashMethodKeysFixed - : private columns_hashing_impl::BaseStateKeysFixed, - public columns_hashing_impl::HashMethodBase< - HashMethodKeysFixed, Value, Mapped, - false> { - using Self = HashMethodKeysFixed; + : public columns_hashing_impl::HashMethodBase, + Value, Mapped, false> { + using Self = HashMethodKeysFixed; using BaseHashed = columns_hashing_impl::HashMethodBase; - using Base = columns_hashing_impl::BaseStateKeysFixed; - HashMethodKeysFixed(const ColumnRawPtrs& key_columns) : Base(key_columns) {} + HashMethodKeysFixed(const ColumnRawPtrs& key_columns) {} }; template diff --git a/be/src/vec/common/columns_hashing_impl.h b/be/src/vec/common/columns_hashing_impl.h index 2665d9b797903a5..a11ec17ec705276 100644 --- a/be/src/vec/common/columns_hashing_impl.h +++ b/be/src/vec/common/columns_hashing_impl.h @@ -149,64 +149,6 @@ class HashMethodBase { } }; -template -struct MappedCache : public PaddedPODArray {}; - -template <> -struct MappedCache {}; - -/// This class is designed to provide the functionality that is required for -/// supporting nullable keys in HashMethodKeysFixed. If there are -/// no nullable keys, this class is merely implemented as an empty shell. -template -class BaseStateKeysFixed; - -/// Case where nullable keys are supported. 
-template -class BaseStateKeysFixed { -protected: - BaseStateKeysFixed(const ColumnRawPtrs& key_columns) { - null_maps.reserve(key_columns.size()); - actual_columns.reserve(key_columns.size()); - - for (const auto& col : key_columns) { - if (auto* nullable_col = check_and_get_column(col)) { - actual_columns.push_back(&nullable_col->get_nested_column()); - null_maps.push_back(&nullable_col->get_null_map_column()); - } else { - actual_columns.push_back(col); - null_maps.push_back(nullptr); - } - } - } - - /// Return the columns which actually contain the values of the keys. - /// For a given key column, if it is nullable, we return its nested - /// column. Otherwise we return the key column itself. - const ColumnRawPtrs& get_actual_columns() const { return actual_columns; } - - const ColumnRawPtrs& get_nullmap_columns() const { return null_maps; } - -private: - ColumnRawPtrs actual_columns; - ColumnRawPtrs null_maps; -}; - -/// Case where nullable keys are not supported. -template -class BaseStateKeysFixed { -protected: - BaseStateKeysFixed(const ColumnRawPtrs& columns) : actual_columns(columns) {} - - const ColumnRawPtrs& get_actual_columns() const { return actual_columns; } - - const ColumnRawPtrs& get_nullmap_columns() const { return null_maps; } - -private: - ColumnRawPtrs actual_columns; - ColumnRawPtrs null_maps; -}; - } // namespace columns_hashing_impl } // namespace ColumnsHashing diff --git a/be/src/vec/common/hash_table/hash_key_type.h b/be/src/vec/common/hash_table/hash_key_type.h index 38802fe716711f4..2c14e4ab687f87b 100644 --- a/be/src/vec/common/hash_table/hash_key_type.h +++ b/be/src/vec/common/hash_table/hash_key_type.h @@ -97,16 +97,16 @@ inline HashKeyType get_hash_key_type(const std::vector& return HashKeyType::without_key; } - if (!data_types[0]->have_maximum_size_of_value()) { - if (is_string(data_types[0])) { + auto t = remove_nullable(data_types[0]); + // serialized cannot be used in the case of single column, because the join operator will have some processing of column nullable, resulting in incorrect serialized results. 
+ if (!t->have_maximum_size_of_value()) { + if (is_string(t)) { return HashKeyType::string_key; - } else { - return HashKeyType::serialized; } + throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid type, type={}", t->get_name()); } - size_t size = - data_types[0]->get_maximum_size_of_value_in_memory() - data_types[0]->is_nullable(); + size_t size = t->get_maximum_size_of_value_in_memory(); if (size == sizeof(vectorized::UInt8)) { return HashKeyType::int8_key; } else if (size == sizeof(vectorized::UInt16)) { @@ -121,7 +121,7 @@ inline HashKeyType get_hash_key_type(const std::vector& return HashKeyType::int256_key; } else { throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid type size, size={}, type={}", size, - data_types[0]->get_name()); + t->get_name()); } } diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h index 018d134d875ca86..448ddd5b7c5dbe1 100644 --- a/be/src/vec/common/hash_table/hash_map.h +++ b/be/src/vec/common/hash_table/hash_map.h @@ -198,9 +198,6 @@ template , typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> using HashMap = HashMapTable, Hash, Grower, Allocator>; -template > -using NormalHashMap = HashMapTable, Hash>; - template > using JoinHashMap = JoinHashTable; diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 973f04f064fea34..16a793d75008c80 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -27,7 +27,6 @@ #include "vec/common/arena.h" #include "vec/common/assert_cast.h" #include "vec/common/columns_hashing.h" -#include "vec/common/hash_table/partitioned_hash_map.h" #include "vec/common/hash_table/string_hash_map.h" #include "vec/common/string_ref.h" #include "vec/common/typeid_cast.h" @@ -375,7 +374,7 @@ struct MethodOneNumber : public MethodBase { } }; -template +template struct MethodKeysFixed : public MethodBase { using Base = MethodBase; using typename Base::Key; @@ -384,8 +383,7 @@ struct MethodKeysFixed : public MethodBase { using Base::hash_table; using Base::iterator; - using State = ColumnsHashing::HashMethodKeysFixed; + using State = ColumnsHashing::HashMethodKeysFixed; // need keep until the hash probe end. 
use only in join std::vector build_stored_keys; @@ -469,20 +467,22 @@ struct MethodKeysFixed : public MethodBase { bool is_build = false, uint32_t bucket_size = 0) override { ColumnRawPtrs actual_columns; ColumnRawPtrs null_maps; - if (has_nullable_keys) { - actual_columns.reserve(key_columns.size()); - null_maps.reserve(key_columns.size()); - for (const auto& col : key_columns) { - if (const auto* nullable_col = check_and_get_column(col)) { - actual_columns.push_back(&nullable_col->get_nested_column()); - null_maps.push_back(&nullable_col->get_null_map_column()); - } else { - actual_columns.push_back(col); - null_maps.push_back(nullptr); - } + actual_columns.reserve(key_columns.size()); + null_maps.reserve(key_columns.size()); + bool has_nullable_key = false; + + for (const auto& col : key_columns) { + if (const auto* nullable_col = check_and_get_column(col)) { + actual_columns.push_back(&nullable_col->get_nested_column()); + null_maps.push_back(&nullable_col->get_null_map_column()); + has_nullable_key = true; + } else { + actual_columns.push_back(col); + null_maps.push_back(nullptr); } - } else { - actual_columns = key_columns; + } + if (!has_nullable_key) { + null_maps.clear(); } if (is_build) { @@ -503,7 +503,13 @@ struct MethodKeysFixed : public MethodBase { void insert_keys_into_columns(std::vector& input_keys, MutableColumns& key_columns, const size_t num_rows) override { // In any hash key value, column values to be read start just after the bitmap, if it exists. - size_t pos = has_nullable_keys ? get_bitmap_size(key_columns.size()) : 0; + size_t pos = 0; + for (size_t i = 0; i < key_columns.size(); ++i) { + if (key_columns[i]->is_nullable()) { + pos = get_bitmap_size(key_columns.size()); + break; + } + } for (size_t i = 0; i < key_columns.size(); ++i) { size_t size = key_sizes[i]; @@ -607,10 +613,4 @@ struct MethodSingleNullableColumn : public SingleColumnMethod { } }; -template -using PrimaryTypeHashTableContext = MethodOneNumber>>; - -template -using FixedKeyHashTableContext = MethodKeysFixed>, has_null>; - } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/common/hash_table/hash_map_util.h b/be/src/vec/common/hash_table/hash_map_util.h index 200e6372ea8ac4e..d949fafecf95025 100644 --- a/be/src/vec/common/hash_table/hash_map_util.h +++ b/be/src/vec/common/hash_table/hash_map_util.h @@ -33,13 +33,10 @@ inline std::vector get_data_types( template Status init_hash_method(DataVariants* data, const std::vector& data_types, bool is_first_phase) { - auto type = get_hash_key_type_with_phase(get_hash_key_type(data_types), !is_first_phase); + auto type = HashKeyType::EMPTY; try { - if (has_nullable_key(data_types)) { - data->template init(data_types, type); - } else { - data->template init(data_types, type); - } + type = get_hash_key_type_with_phase(get_hash_key_type(data_types), !is_first_phase); + data->init(data_types, type); } catch (const Exception& e) { // method_variant may meet valueless_by_exception, so we set it to monostate data->method_variant.template emplace(); @@ -48,7 +45,7 @@ Status init_hash_method(DataVariants* data, const std::vectormethod_variant.valueless_by_exception()); - if (type != HashKeyType::without_key && + if (type != HashKeyType::without_key && type != HashKeyType::EMPTY && data->method_variant.index() == 0) { // index is 0 means variant is monostate return Status::InternalError("method_variant init failed"); } @@ -57,15 +54,15 @@ Status init_hash_method(DataVariants* data, const std::vector typename MethodNullable, 
template typename MethodOneNumber, - template typename MethodFixed, template typename DataNullable> + template typename DataNullable> struct DataVariants { DataVariants() = default; DataVariants(const DataVariants&) = delete; DataVariants& operator=(const DataVariants&) = delete; MethodVariants method_variant; - template - void emplace_single() { + template + void emplace_single(bool nullable) { if (nullable) { method_variant.template emplace>>>(); } else { diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h index 490cd5016927c25..809868e2beea86f 100644 --- a/be/src/vec/common/hash_table/hash_table.h +++ b/be/src/vec/common/hash_table/hash_table.h @@ -419,28 +419,12 @@ class HashTable : private boost::noncopyable, Cell* buf = nullptr; /// A piece of memory for all elements except the element with zero key. Grower grower; int64_t _resize_timer_ns; - // the bucket count threshold above which it's converted to partioned hash table - // > 0: enable convert dynamically - // 0: convert is disabled - int _partitioned_threshold = 0; - // if need resize and bucket count after resize will be >= _partitioned_threshold, - // this flag is set to true, and resize does not actually happen, - // PartitionedHashTable will convert this hash table to partitioned hash table - bool _need_partition = false; //factor that will trigger growing the hash table on insert. static constexpr float MAX_BUCKET_OCCUPANCY_FRACTION = 0.5f; mutable size_t collisions = 0; - void set_partitioned_threshold(int threshold) { _partitioned_threshold = threshold; } - - bool check_if_need_partition(size_t bucket_count) { - return _partitioned_threshold > 0 && bucket_count >= _partitioned_threshold; - } - - bool need_partition() { return _need_partition; } - /// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain. size_t ALWAYS_INLINE find_cell(const Key& x, size_t hash_value, size_t place_value) const { while (!buf[place_value].is_zero(*this) && @@ -609,8 +593,6 @@ class HashTable : private boost::noncopyable, std::swap(buf, rhs.buf); std::swap(m_size, rhs.m_size); std::swap(grower, rhs.grower); - std::swap(_need_partition, rhs._need_partition); - std::swap(_partitioned_threshold, rhs._partitioned_threshold); Hash::operator=(std::move(rhs)); // NOLINT(bugprone-use-after-move) Allocator::operator=(std::move(rhs)); // NOLINT(bugprone-use-after-move) @@ -740,12 +722,10 @@ class HashTable : private boost::noncopyable, throw; } - if (LIKELY(!_need_partition)) { - // The hash table was rehashed, so we have to re-find the key. - size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); - assert(!buf[new_place].is_zero(*this)); - it = &buf[new_place]; - } + // The hash table was rehashed, so we have to re-find the key. + size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); + assert(!buf[new_place].is_zero(*this)); + it = &buf[new_place]; } } @@ -776,12 +756,10 @@ class HashTable : private boost::noncopyable, throw; } - if (LIKELY(!_need_partition)) { - // The hash table was rehashed, so we have to re-find the key. - size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); - assert(!buf[new_place].is_zero(*this)); - it = &buf[new_place]; - } + // The hash table was rehashed, so we have to re-find the key. 
+ size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); + assert(!buf[new_place].is_zero(*this)); + it = &buf[new_place]; } } @@ -1060,13 +1038,6 @@ class HashTable : private boost::noncopyable, } else new_grower.increase_size(); - // new bucket count exceed partitioned hash table bucket count threshold, - // don't resize and set need partition flag - if (check_if_need_partition(new_grower.buf_size())) { - _need_partition = true; - return; - } - /// Expand the space. buf = reinterpret_cast(Allocator::realloc(buf, get_buffer_size_in_bytes(), new_grower.buf_size() * sizeof(Cell))); diff --git a/be/src/vec/common/hash_table/join_hash_table.h b/be/src/vec/common/hash_table/join_hash_table.h index 485c5f7b3b22c9a..25ca8844cd280f2 100644 --- a/be/src/vec/common/hash_table/join_hash_table.h +++ b/be/src/vec/common/hash_table/join_hash_table.h @@ -71,20 +71,16 @@ class JoinHashTable { std::vector& get_visited() { return visited; } - template - void build(const Key* __restrict keys, const uint32_t* __restrict bucket_nums, - size_t num_elem) { + void build(const Key* __restrict keys, const uint32_t* __restrict bucket_nums, size_t num_elem, + bool keep_null_key) { build_keys = keys; for (size_t i = 1; i < num_elem; i++) { uint32_t bucket_num = bucket_nums[i]; next[i] = first[bucket_num]; first[bucket_num] = i; } - if constexpr ((JoinOpType != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && - JoinOpType != TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) || - !with_other_conjuncts) { - /// Only null aware join with other conjuncts need to access the null value in hash table - first[bucket_size] = 0; // index = bucket_num means null + if (!keep_null_key) { + first[bucket_size] = 0; // index = bucket_size means null } } diff --git a/be/src/vec/common/hash_table/partitioned_hash_map.h b/be/src/vec/common/hash_table/partitioned_hash_map.h deleted file mode 100644 index a2db6fece35207f..000000000000000 --- a/be/src/vec/common/hash_table/partitioned_hash_map.h +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/TwoLevelHashMap.h -// and modified by Doris -#pragma once - -#include "vec/common/hash_table/hash_map.h" -#include "vec/common/hash_table/partitioned_hash_table.h" -#include "vec/common/hash_table/ph_hash_map.h" -namespace doris { -template -class PartitionedHashMapTable : public PartitionedHashTable { -public: - using Impl = ImplTable; - using Base = PartitionedHashTable; - using Key = typename ImplTable::key_type; - using LookupResult = typename Impl::LookupResult; - - auto& ALWAYS_INLINE operator[](const Key& x) { - LookupResult it; - bool inserted = false; - this->emplace(x, it, inserted); - - if (inserted) { - new (lookup_result_get_mapped(it)) Base::mapped_type(); - } - - return *lookup_result_get_mapped(it); - } - - template - void for_each_mapped(Func&& func) { - for (auto& v : *this) { - func(v.get_second()); - } - } -}; - -template > -using PartitionedHashMap = - PartitionedHashMapTable>>; - -template > -using PHNormalHashMap = PHHashMap; -} // namespace doris \ No newline at end of file diff --git a/be/src/vec/common/hash_table/partitioned_hash_table.h b/be/src/vec/common/hash_table/partitioned_hash_table.h deleted file mode 100644 index c6a19b36d3a0c38..000000000000000 --- a/be/src/vec/common/hash_table/partitioned_hash_table.h +++ /dev/null @@ -1,550 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/TwoLevelHashTable.h -// and modified by Doris -#pragma once - -#include "vec/common/hash_table/hash_table.h" - -/** Partitioned hash table. - * Represents 16 (or 1ULL << BITS_FOR_SUB_TABLE) small hash tables (sub table count of the first level). - * To determine which one to use, one of the bytes of the hash function is taken. - * - * Usually works a little slower than a simple hash table. - * However, it has advantages in some cases: - * - if you need to merge two hash tables together, then you can easily parallelize it by sub tables; - * - delay during resizes is amortized, since the small hash tables will be resized separately; - * - in theory, resizes are cache-local in a larger range of sizes. - */ - -template -struct PartitionedHashTableGrower : public HashTableGrowerWithPrecalculation { - /// Increase the size of the hash table. - void increase_size() { this->increase_size_degree(this->size_degree() >= 15 ? 
1 : 2); } -}; - -template -class PartitionedHashTable : private boost::noncopyable, Impl::Hash { -public: - using key_type = typename Impl::key_type; - using mapped_type = typename Impl::mapped_type; - using value_type = typename Impl::value_type; - using cell_type = typename Impl::cell_type; - using Key = typename Impl::key_type; - - using LookupResult = typename Impl::LookupResult; - using ConstLookupResult = typename Impl::ConstLookupResult; - -protected: - using Self = PartitionedHashTable; - -private: - static constexpr size_t NUM_LEVEL1_SUB_TABLES = 1ULL << BITS_FOR_SUB_TABLE; - static constexpr size_t MAX_SUB_TABLE = NUM_LEVEL1_SUB_TABLES - 1; - - //factor that will trigger growing the hash table on insert. - static constexpr float MAX_SUB_TABLE_OCCUPANCY_FRACTION = 0.5f; - - Impl level0_sub_table; - Impl level1_sub_tables[NUM_LEVEL1_SUB_TABLES]; - - bool _is_partitioned = false; - - int64_t _convert_timer_ns = 0; - -public: - PartitionedHashTable() = default; - - PartitionedHashTable(PartitionedHashTable&& rhs) { *this = std::move(rhs); } - - PartitionedHashTable& operator=(PartitionedHashTable&& rhs) { - std::swap(_is_partitioned, rhs._is_partitioned); - - level0_sub_table = std::move(rhs.level0_sub_table); - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i] = std::move(rhs.level1_sub_tables[i]); - } - - return *this; - } - - size_t hash(const Key& x) const { return level0_sub_table.hash(x); } - - float get_factor() const { return MAX_SUB_TABLE_OCCUPANCY_FRACTION; } - - int64_t get_convert_timer_value() const { return _convert_timer_ns; } - - bool should_be_shrink(int64_t valid_row) const { - if (_is_partitioned) { - return false; - } else { - return level0_sub_table.should_be_shrink(valid_row); - } - } - - size_t size() { - size_t count = 0; - if (_is_partitioned) { - for (auto i = 0u; i < this->NUM_LEVEL1_SUB_TABLES; ++i) { - count += this->level1_sub_tables[i].size(); - } - } else { - count = level0_sub_table.size(); - } - return count; - } - - void init_buf_size(size_t reserve_for_num_elements) { - if (_is_partitioned) { - for (auto& impl : level1_sub_tables) { - impl.init_buf_size(reserve_for_num_elements / NUM_LEVEL1_SUB_TABLES); - } - } else { - if (level0_sub_table.check_if_need_partition(reserve_for_num_elements)) { - level0_sub_table.clear_and_shrink(); - _is_partitioned = true; - - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i].init_buf_size(reserve_for_num_elements / - NUM_LEVEL1_SUB_TABLES); - } - } else { - level0_sub_table.init_buf_size(reserve_for_num_elements); - } - } - } - - void delete_zero_key(Key key) { - if (_is_partitioned) { - const auto key_hash = hash(key); - size_t sub_table_idx = get_sub_table_from_hash(key_hash); - level1_sub_tables[sub_table_idx].delete_zero_key(key); - } else { - level0_sub_table.delete_zero_key(key); - } - } - - int64_t get_collisions() const { - size_t collisions = level0_sub_table.get_collisions(); - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; i++) { - collisions += level1_sub_tables[i].get_collisions(); - } - return collisions; - } - - size_t get_buffer_size_in_bytes() const { - if (_is_partitioned) { - size_t buff_size = 0; - for (const auto& impl : level1_sub_tables) buff_size += impl.get_buffer_size_in_bytes(); - return buff_size; - } else { - return level0_sub_table.get_buffer_size_in_bytes(); - } - } - - size_t get_buffer_size_in_cells() const { - if (_is_partitioned) { - size_t buff_size = 0; - for (const auto& impl : level1_sub_tables) buff_size += 
impl.get_buffer_size_in_cells(); - return buff_size; - } else { - return level0_sub_table.get_buffer_size_in_cells(); - } - } - - std::vector get_buffer_sizes_in_cells() const { - std::vector sizes; - if (_is_partitioned) { - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - sizes.push_back(level1_sub_tables[i].get_buffer_size_in_cells()); - } - } else { - sizes.push_back(level0_sub_table.get_buffer_size_in_cells()); - } - return sizes; - } - - void reset_resize_timer() { - if (_is_partitioned) { - for (auto& impl : level1_sub_tables) { - impl.reset_resize_timer(); - } - } else { - level0_sub_table.reset_resize_timer(); - } - } - int64_t get_resize_timer_value() const { - if (_is_partitioned) { - int64_t resize_timer_ns = 0; - for (const auto& impl : level1_sub_tables) { - resize_timer_ns += impl.get_resize_timer_value(); - } - return resize_timer_ns; - } else { - return level0_sub_table.get_resize_timer_value(); - } - } - - bool has_null_key_data() const { return false; } - template - char* get_null_key_data() { - return nullptr; - } - -protected: - typename Impl::iterator begin_of_next_non_empty_sub_table_idx(size_t& sub_table_idx) { - while (sub_table_idx != NUM_LEVEL1_SUB_TABLES && level1_sub_tables[sub_table_idx].empty()) - ++sub_table_idx; - - if (sub_table_idx != NUM_LEVEL1_SUB_TABLES) return level1_sub_tables[sub_table_idx].begin(); - - --sub_table_idx; - return level1_sub_tables[MAX_SUB_TABLE].end(); - } - - typename Impl::const_iterator begin_of_next_non_empty_sub_table_idx( - size_t& sub_table_idx) const { - while (sub_table_idx != NUM_LEVEL1_SUB_TABLES && level1_sub_tables[sub_table_idx].empty()) - ++sub_table_idx; - - if (sub_table_idx != NUM_LEVEL1_SUB_TABLES) return level1_sub_tables[sub_table_idx].begin(); - - --sub_table_idx; - return level1_sub_tables[MAX_SUB_TABLE].end(); - } - -public: - void set_partitioned_threshold(int threshold) { - level0_sub_table.set_partitioned_threshold(threshold); - } - - class iterator /// NOLINT - { - Self* container {}; - size_t sub_table_idx {}; - typename Impl::iterator current_it {}; - - friend class PartitionedHashTable; - - iterator(Self* container_, size_t sub_table_idx_, typename Impl::iterator current_it_) - : container(container_), sub_table_idx(sub_table_idx_), current_it(current_it_) {} - - public: - iterator() = default; - - bool operator==(const iterator& rhs) const { - return sub_table_idx == rhs.sub_table_idx && current_it == rhs.current_it; - } - bool operator!=(const iterator& rhs) const { return !(*this == rhs); } - - iterator& operator++() { - ++current_it; - if (container->_is_partitioned) { - if (current_it == container->level1_sub_tables[sub_table_idx].end()) { - ++sub_table_idx; - current_it = container->begin_of_next_non_empty_sub_table_idx(sub_table_idx); - } - } - - return *this; - } - - auto& operator*() { return *current_it; } - auto* operator->() { return current_it.get_ptr(); } - - auto* get_ptr() { return current_it.get_ptr(); } - size_t get_hash() { return current_it.get_hash(); } - }; - - class const_iterator /// NOLINT - { - Self* container {}; - size_t sub_table_idx {}; - typename Impl::const_iterator current_it {}; - - friend class PartitionedHashTable; - - const_iterator(Self* container_, size_t sub_table_idx_, - typename Impl::const_iterator current_it_) - : container(container_), sub_table_idx(sub_table_idx_), current_it(current_it_) {} - - public: - const_iterator() = default; - const_iterator(const iterator& rhs) - : container(rhs.container), - sub_table_idx(rhs.sub_table_idx), - 
current_it(rhs.current_it) {} /// NOLINT - - bool operator==(const const_iterator& rhs) const { - return sub_table_idx == rhs.sub_table_idx && current_it == rhs.current_it; - } - bool operator!=(const const_iterator& rhs) const { return !(*this == rhs); } - - const_iterator& operator++() { - ++current_it; - if (container->_is_partitioned) { - if (current_it == container->level1_sub_tables[sub_table_idx].end()) { - ++sub_table_idx; - current_it = container->begin_of_next_non_empty_sub_table_idx(sub_table_idx); - } - } - - return *this; - } - - const auto& operator*() const { return *current_it; } - const auto* operator->() const { return current_it->get_ptr(); } - - const auto* get_ptr() const { return current_it.get_ptr(); } - size_t get_hash() const { return current_it.get_hash(); } - }; - - const_iterator begin() const { - if (_is_partitioned) { - size_t sub_table_idx = 0; - typename Impl::const_iterator impl_it = - begin_of_next_non_empty_sub_table_idx(sub_table_idx); - return {this, sub_table_idx, impl_it}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.begin()}; - } - } - - iterator begin() { - if (_is_partitioned) { - size_t sub_table_idx = 0; - typename Impl::iterator impl_it = begin_of_next_non_empty_sub_table_idx(sub_table_idx); - return {this, sub_table_idx, impl_it}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.begin()}; - } - } - - const_iterator end() const { - if (_is_partitioned) { - return {this, MAX_SUB_TABLE, level1_sub_tables[MAX_SUB_TABLE].end()}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.end()}; - } - } - iterator end() { - if (_is_partitioned) { - return {this, MAX_SUB_TABLE, level1_sub_tables[MAX_SUB_TABLE].end()}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.end()}; - } - } - - /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. - std::pair ALWAYS_INLINE insert(const value_type& x) { - size_t hash_value = hash(cell_type::get_key(x)); - - std::pair res; - emplace(cell_type::get_key(x), res.first, res.second, hash_value); - - if (res.second) insert_set_mapped(lookup_result_get_mapped(res.first), x); - - return res; - } - - void expanse_for_add_elem(size_t num_elem) { - if (_is_partitioned) { - size_t num_elem_per_sub_table = - (num_elem + NUM_LEVEL1_SUB_TABLES - 1) / NUM_LEVEL1_SUB_TABLES; - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i].expanse_for_add_elem(num_elem_per_sub_table); - } - } else { - level0_sub_table.expanse_for_add_elem(num_elem); - if (UNLIKELY(level0_sub_table.need_partition())) { - convert_to_partitioned(); - } - } - } - - template - void ALWAYS_INLINE prefetch(const Key& key, size_t hash_value) { - if (_is_partitioned) { - const auto sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].template prefetch(hash_value); - } else { - level0_sub_table.template prefetch(hash_value); - } - } - - /** Insert the key, - * return an iterator to a position that can be used for `placement new` of value, - * as well as the flag - whether a new key was inserted. - * - * You have to make `placement new` values if you inserted a new key, - * since when destroying a hash table, the destructor will be invoked for it! 
- * - * Example usage: - * - * Map::iterator it; - * bool inserted; - * map.emplace(key, it, inserted); - * if (inserted) - * new(&it->second) Mapped(value); - */ - template - void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, bool& inserted) { - size_t hash_value = hash(key_holder); - emplace(key_holder, it, inserted, hash_value); - } - - /// Same, but with a precalculated values of hash function. - template - void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, bool& inserted, - size_t hash_value) { - if (_is_partitioned) { - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].emplace(key_holder, it, inserted, hash_value); - } else { - level0_sub_table.emplace(key_holder, it, inserted, hash_value); - if (UNLIKELY(level0_sub_table.need_partition())) { - convert_to_partitioned(); - - // The hash table was converted to partitioned, so we have to re-find the key. - size_t sub_table_id = get_sub_table_from_hash(hash_value); - it = level1_sub_tables[sub_table_id].find(key_holder, hash_value); - } - } - } - - template - void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value, - bool& inserted) { - emplace(key_holder, it, inserted, hash_value); - } - - template - void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) { - size_t hash_value = hash(key_holder); - lazy_emplace(key_holder, it, hash_value, std::forward(f)); - } - - template - void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value, - Func&& f) { - if (_is_partitioned) { - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].lazy_emplace(key_holder, it, hash_value, - std::forward(f)); - } else { - level0_sub_table.lazy_emplace(key_holder, it, hash_value, std::forward(f)); - if (UNLIKELY(level0_sub_table.need_partition())) { - convert_to_partitioned(); - - // The hash table was converted to partitioned, so we have to re-find the key. 
- size_t sub_table_id = get_sub_table_from_hash(hash_value); - it = level1_sub_tables[sub_table_id].find(key_holder, hash_value); - } - } - } - - LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) { - if (_is_partitioned) { - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - return level1_sub_tables[sub_table_idx].find(x, hash_value); - } else { - return level0_sub_table.find(x, hash_value); - } - } - - ConstLookupResult ALWAYS_INLINE find(Key x, size_t hash_value) const { - return const_cast*>(this)->find(x, hash_value); - } - - LookupResult ALWAYS_INLINE find(Key x) { return find(x, hash(x)); } - - ConstLookupResult ALWAYS_INLINE find(Key x) const { return find(x, hash(x)); } - - size_t size() const { - if (_is_partitioned) { - size_t res = 0; - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - res += level1_sub_tables[i].size(); - } - return res; - } else { - return level0_sub_table.size(); - } - } - - std::vector sizes() const { - std::vector sizes; - if (_is_partitioned) { - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - sizes.push_back(level1_sub_tables[i].size()); - } - } else { - sizes.push_back(level0_sub_table.size()); - } - return sizes; - } - - bool empty() const { - if (_is_partitioned) { - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) - if (!level1_sub_tables[i].empty()) return false; - return true; - } else { - return level0_sub_table.empty(); - } - } - - bool add_elem_size_overflow(size_t row) const { - return !_is_partitioned && level0_sub_table.add_elem_size_overflow(row); - } - -private: - void convert_to_partitioned() { - SCOPED_RAW_TIMER(&_convert_timer_ns); - - DCHECK(!_is_partitioned); - _is_partitioned = true; - - auto bucket_count = level0_sub_table.get_buffer_size_in_cells(); - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i] = std::move(Impl(bucket_count / NUM_LEVEL1_SUB_TABLES)); - } - - auto it = level0_sub_table.begin(); - - /// It is assumed that the zero key (stored separately) is first in iteration order. - if (it != level0_sub_table.end() && it.get_ptr()->is_zero(level0_sub_table)) { - insert(it->get_value()); - ++it; - } - - for (; it != level0_sub_table.end(); ++it) { - const auto* cell = it.get_ptr(); - size_t hash_value = cell->get_hash(level0_sub_table); - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].insert_unique_non_zero(cell, hash_value); - } - - level0_sub_table.clear_and_shrink(); - } - - /// NOTE Bad for hash tables with more than 2^32 cells. 
- static size_t get_sub_table_from_hash(size_t hash_value) { - return (hash_value >> (32 - BITS_FOR_SUB_TABLE)) & MAX_SUB_TABLE; - } -}; diff --git a/be/src/vec/common/hash_table/ph_hash_map.h b/be/src/vec/common/hash_table/ph_hash_map.h index 50cf218dc87c065..de3204252234278 100644 --- a/be/src/vec/common/hash_table/ph_hash_map.h +++ b/be/src/vec/common/hash_table/ph_hash_map.h @@ -30,8 +30,7 @@ ALWAYS_INLINE inline auto lookup_result_get_mapped(std::pair* return &(it->second); } -template , - bool PartitionedHashTable = false> +template > class PHHashMap : private boost::noncopyable { public: using Self = PHHashMap; @@ -58,9 +57,6 @@ class PHHashMap : private boost::noncopyable { PHHashMap& operator=(PHHashMap&& rhs) { _hash_map.clear(); _hash_map = std::move(rhs._hash_map); - std::swap(_need_partition, rhs._need_partition); - std::swap(_partitioned_threshold, rhs._partitioned_threshold); - return *this; } @@ -130,19 +126,11 @@ class PHHashMap : private boost::noncopyable { inserted = true; ctor(key_holder, nullptr); }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } template void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) { it = &*_hash_map.lazy_emplace(key_holder, [&](const auto& ctor) { f(ctor, key_holder); }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } template @@ -157,10 +145,6 @@ class PHHashMap : private boost::noncopyable { ctor(key, mapped_type()); } }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } template @@ -168,10 +152,6 @@ class PHHashMap : private boost::noncopyable { Func&& f) { it = &*_hash_map.lazy_emplace_with_hash(key, hash_value, [&](const auto& ctor) { f(ctor, key, key); }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } void ALWAYS_INLINE insert(const Key& key, size_t hash_value, const Mapped& value) { @@ -225,18 +205,6 @@ class PHHashMap : private boost::noncopyable { } bool has_null_key_data() const { return false; } - bool need_partition() { return _need_partition; } - - void set_partitioned_threshold(int threshold) { _partitioned_threshold = threshold; } - - bool check_if_need_partition(size_t bucket_count) { - if constexpr (PartitionedHashTable) { - return _partitioned_threshold > 0 && bucket_count >= _partitioned_threshold; - } else { - return false; - } - } - bool empty() const { return _hash_map.empty(); } void clear_and_shrink() { _hash_map.clear(); } @@ -244,19 +212,5 @@ class PHHashMap : private boost::noncopyable { void expanse_for_add_elem(size_t num_elem) { _hash_map.reserve(num_elem); } private: - void _check_if_need_partition() { - if (UNLIKELY(check_if_need_partition(_hash_map.size() + 1))) { - _need_partition = add_elem_size_overflow(1); - } - } - HashMapImpl _hash_map; - // the bucket count threshold above which it's converted to partioned hash table - // > 0: enable convert dynamically - // 0: convert is disabled - int _partitioned_threshold = 0; - // if need resize and bucket count after resize will be >= _partitioned_threshold, - // this flag is set to true, and resize does not actually happen, - // PartitionedHashTable will convert this hash table to partitioned hash table - bool _need_partition; }; diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 4db2d62b9949e77..7dfc3c528cd88e9 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -333,6 +333,11 @@ 
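The removed get_sub_table_from_hash() routes a key to one of the level-1 sub-tables using the upper bits of the low 32 bits of its hash, which is why the old comment warns it is bad for tables with more than 2^32 cells. A standalone sketch, assuming 4 bits for the sub-table index (the real BITS_FOR_SUB_TABLE / MAX_SUB_TABLE constants lived in the removed header):

```cpp
#include <cstddef>
#include <cstdio>

// Illustrative constants; the real values were defined in the removed
// partitioned hash table header.
constexpr size_t BITS_FOR_SUB_TABLE = 4;
constexpr size_t NUM_SUB_TABLES = 1 << BITS_FOR_SUB_TABLE;  // 16 sub-tables
constexpr size_t MAX_SUB_TABLE = NUM_SUB_TABLES - 1;        // 0x0F mask

// Pick a sub-table from the upper bits of the low 32 bits of the hash,
// as in the removed get_sub_table_from_hash().
size_t get_sub_table_from_hash(size_t hash_value) {
    return (hash_value >> (32 - BITS_FOR_SUB_TABLE)) & MAX_SUB_TABLE;
}

int main() {
    size_t hashes[] = {0x00000000u, 0x12345678u, 0xF0000000u, 0xFFFFFFFFu};
    for (size_t h : hashes) {
        std::printf("hash=0x%08zx -> sub table %zu\n", h, get_sub_table_from_hash(h));
    }
    return 0;
}
```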
Status NewJsonReader::get_parsed_schema(std::vector* col_names, objectValue = _json_doc; } + if (!objectValue->IsObject()) { + return Status::DataQualityError("JSON data is not an object. but: {}", + objectValue->GetType()); + } + // use jsonpaths to col_names if (!_parsed_jsonpaths.empty()) { for (auto& _parsed_jsonpath : _parsed_jsonpaths) { diff --git a/be/src/vec/exec/scan/new_jdbc_scanner.cpp b/be/src/vec/exec/scan/new_jdbc_scanner.cpp index a23e83e2426c078..7eaa9ab3eab7881 100644 --- a/be/src/vec/exec/scan/new_jdbc_scanner.cpp +++ b/be/src/vec/exec/scan/new_jdbc_scanner.cpp @@ -89,7 +89,6 @@ Status NewJdbcScanner::prepare(RuntimeState* state, const VExprContextSPtrs& con _jdbc_param.connection_pool_max_life_time = jdbc_table->connection_pool_max_life_time(); _jdbc_param.connection_pool_max_wait_time = jdbc_table->connection_pool_max_wait_time(); _jdbc_param.connection_pool_keep_alive = jdbc_table->connection_pool_keep_alive(); - _jdbc_param.enable_connection_pool = jdbc_table->enable_connection_pool(); _local_state->scanner_profile()->add_info_string("JdbcDriverClass", _jdbc_param.driver_class); _local_state->scanner_profile()->add_info_string("JdbcDriverUrl", _jdbc_param.driver_path); diff --git a/be/src/vec/exec/vjdbc_connector.cpp b/be/src/vec/exec/vjdbc_connector.cpp index 14263cf4c08b5b9..0fa33bfaad917d8 100644 --- a/be/src/vec/exec/vjdbc_connector.cpp +++ b/be/src/vec/exec/vjdbc_connector.cpp @@ -152,7 +152,6 @@ Status JdbcConnector::open(RuntimeState* state, bool read) { } ctor_params.__set_op(read ? TJdbcOperation::READ : TJdbcOperation::WRITE); ctor_params.__set_table_type(_conn_param.table_type); - ctor_params.__set_enable_connection_pool(_conn_param.enable_connection_pool); ctor_params.__set_connection_pool_min_size(_conn_param.connection_pool_min_size); ctor_params.__set_connection_pool_max_size(_conn_param.connection_pool_max_size); ctor_params.__set_connection_pool_max_wait_time(_conn_param.connection_pool_max_wait_time); diff --git a/be/src/vec/exec/vjdbc_connector.h b/be/src/vec/exec/vjdbc_connector.h index 066a95de554444b..954b0abfa78f0ce 100644 --- a/be/src/vec/exec/vjdbc_connector.h +++ b/be/src/vec/exec/vjdbc_connector.h @@ -61,7 +61,6 @@ struct JdbcConnectorParam { int32_t connection_pool_max_wait_time = -1; int32_t connection_pool_max_life_time = -1; bool connection_pool_keep_alive = false; - bool enable_connection_pool; const TupleDescriptor* tuple_desc = nullptr; }; diff --git a/be/src/vec/exprs/vectorized_agg_fn.cpp b/be/src/vec/exprs/vectorized_agg_fn.cpp index 45ad573cb5d9df0..ef2bbcb29964aaf 100644 --- a/be/src/vec/exprs/vectorized_agg_fn.cpp +++ b/be/src/vec/exprs/vectorized_agg_fn.cpp @@ -44,6 +44,8 @@ #include "vec/exprs/vexpr_context.h" #include "vec/utils/util.hpp" +static constexpr int64_t BE_VERSION_THAT_SUPPORT_NULLABLE_CHECK = 8; + namespace doris { class RowDescriptor; namespace vectorized { @@ -63,9 +65,10 @@ AggregateFunctionPtr get_agg_state_function(const DataTypes& argument_types, argument_types, return_type); } -AggFnEvaluator::AggFnEvaluator(const TExprNode& desc) +AggFnEvaluator::AggFnEvaluator(const TExprNode& desc, const bool without_key) : _fn(desc.fn), _is_merge(desc.agg_expr.is_merge_agg), + _without_key(without_key), _return_type(TypeDescriptor::from_thrift(desc.fn.ret_type)) { bool nullable = true; if (desc.__isset.is_nullable) { @@ -83,8 +86,8 @@ AggFnEvaluator::AggFnEvaluator(const TExprNode& desc) } Status AggFnEvaluator::create(ObjectPool* pool, const TExpr& desc, const TSortInfo& sort_info, - AggFnEvaluator** result) { - 
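The added guard in NewJsonReader::get_parsed_schema() rejects payloads whose root is not a JSON object before any column names are derived from it. A minimal standalone sketch of the same check with rapidjson (the function and variable names here are illustrative, not the reader's API):

```cpp
#include <rapidjson/document.h>

#include <cstdio>

// Returns true only when the payload parses and its root is a JSON object,
// mirroring the guard added before schema parsing.
bool is_json_object(const char* text) {
    rapidjson::Document doc;
    doc.Parse(text);
    if (doc.HasParseError()) {
        return false;
    }
    if (!doc.IsObject()) {
        // GetType() is the numeric rapidjson::Type the new error message reports.
        std::printf("not an object, type=%d\n", static_cast<int>(doc.GetType()));
        return false;
    }
    return true;
}

int main() {
    std::printf("%d\n", is_json_object(R"({"k1": 1})"));  // 1: object root
    std::printf("%d\n", is_json_object(R"([1, 2, 3])"));  // 0: array root rejected
    return 0;
}
```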
*result = pool->add(AggFnEvaluator::create_unique(desc.nodes[0]).release()); + const bool without_key, AggFnEvaluator** result) { + *result = pool->add(AggFnEvaluator::create_unique(desc.nodes[0], without_key).release()); auto& agg_fn_evaluator = *result; int node_idx = 0; for (int i = 0; i < desc.nodes[0].num_children; ++i) { @@ -213,6 +216,13 @@ Status AggFnEvaluator::prepare(RuntimeState* state, const RowDescriptor& desc, _function = transform_to_sort_agg_function(_function, _argument_types_with_sort, _sort_description, state); } + + if (!AggregateFunctionSimpleFactory::is_foreach(_fn.name.function_name)) { + if (state->be_exec_version() >= BE_VERSION_THAT_SUPPORT_NULLABLE_CHECK) { + RETURN_IF_ERROR( + _function->verify_result_type(_without_key, argument_types, _data_type)); + } + } _expr_name = fmt::format("{}({})", _fn.name.function_name, child_expr_name); return Status::OK(); } @@ -320,6 +330,7 @@ AggFnEvaluator* AggFnEvaluator::clone(RuntimeState* state, ObjectPool* pool) { AggFnEvaluator::AggFnEvaluator(AggFnEvaluator& evaluator, RuntimeState* state) : _fn(evaluator._fn), _is_merge(evaluator._is_merge), + _without_key(evaluator._without_key), _argument_types_with_sort(evaluator._argument_types_with_sort), _real_argument_types(evaluator._real_argument_types), _return_type(evaluator._return_type), diff --git a/be/src/vec/exprs/vectorized_agg_fn.h b/be/src/vec/exprs/vectorized_agg_fn.h index 8e4f864c474058e..a3f7030d958870c 100644 --- a/be/src/vec/exprs/vectorized_agg_fn.h +++ b/be/src/vec/exprs/vectorized_agg_fn.h @@ -50,7 +50,7 @@ class AggFnEvaluator { public: static Status create(ObjectPool* pool, const TExpr& desc, const TSortInfo& sort_info, - AggFnEvaluator** result); + const bool without_key, AggFnEvaluator** result); Status prepare(RuntimeState* state, const RowDescriptor& desc, const SlotDescriptor* intermediate_slot_desc, @@ -109,8 +109,12 @@ class AggFnEvaluator { const TFunction _fn; const bool _is_merge; + // We need this flag to distinguish between the two types of aggregation functions: + // 1. executed without group by key (agg function used with window function is also regarded as this type) + // 2. 
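The new verify_result_type() call is applied only when the BE exec version is at least BE_VERSION_THAT_SUPPORT_NULLABLE_CHECK, so plans produced by older frontends keep their previous behavior during rolling upgrades. A hedged sketch of this version-gating pattern; all types and names below are hypothetical stand-ins, not the real AggFnEvaluator interfaces:

```cpp
#include <cstdio>
#include <string>

// Hypothetical stand-ins for the runtime state and the verification hook.
constexpr int kVersionThatSupportsNullableCheck = 8;

struct FakeState {
    int be_exec_version;
};

bool verify_result_type(bool without_key) {
    // Pretend the planner/BE nullable agreement only holds for the keyed case.
    return !without_key;
}

std::string prepare(const FakeState& state, bool without_key) {
    // Older exec versions skip the stricter check entirely.
    if (state.be_exec_version >= kVersionThatSupportsNullableCheck) {
        if (!verify_result_type(without_key)) {
            return "error: result type mismatch";
        }
    }
    return "ok";
}

int main() {
    std::printf("%s\n", prepare({7}, true).c_str());  // old version: check skipped
    std::printf("%s\n", prepare({8}, true).c_str());  // new version: check applied
    return 0;
}
```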
executed with group by key + const bool _without_key; - AggFnEvaluator(const TExprNode& desc); + AggFnEvaluator(const TExprNode& desc, const bool without_key); AggFnEvaluator(AggFnEvaluator& evaluator, RuntimeState* state); Status _calc_argument_columns(Block* block); diff --git a/be/src/vec/functions/function_datetime_floor_ceil.cpp b/be/src/vec/functions/function_datetime_floor_ceil.cpp index bf74deaed1daeb5..43611e68a17a89f 100644 --- a/be/src/vec/functions/function_datetime_floor_ceil.cpp +++ b/be/src/vec/functions/function_datetime_floor_ceil.cpp @@ -51,6 +51,7 @@ #include "vec/runtime/vdatetime_value.h" namespace doris { +#include "common/compile_check_begin.h" class FunctionContext; namespace vectorized { @@ -73,7 +74,7 @@ struct YearFloor; namespace doris::vectorized { -template +template class FunctionDateTimeFloorCeil : public IFunction { public: using ReturnDataType = std::conditional_t< @@ -84,7 +85,7 @@ class FunctionDateTimeFloorCeil : public IFunction { std::is_same_v, Int64, std::conditional_t>, UInt32, UInt64>>; - using DeltaDataType = DataTypeNumber; // int32/64 + using DeltaDataType = DataTypeNumber; // int32/64 static constexpr auto name = Impl::name; static FunctionPtr create() { return std::make_shared(); } @@ -166,7 +167,7 @@ class FunctionDateTimeFloorCeil : public IFunction { col_to->get_data(), null_map->get_data()); } else { // time_round(datetime,const(period)) - Impl::template vector_constant_delta( + Impl::template vector_constant_delta( sources->get_data(), delta_const_column->get_field().get(), col_to->get_data(), null_map->get_data()); } @@ -178,7 +179,7 @@ class FunctionDateTimeFloorCeil : public IFunction { col_to->get_data(), null_map->get_data()); } else { const auto* delta_vec_column1 = - check_and_get_column>(delta_column); + check_and_get_column>(delta_column); DCHECK(delta_vec_column1 != nullptr); // time_round(datetime, period) Impl::vector_vector(sources->get_data(), delta_vec_column1->get_data(), @@ -197,7 +198,7 @@ class FunctionDateTimeFloorCeil : public IFunction { arg1_col->get(0, arg1); arg2_col->get(0, arg2); // time_round(datetime,const(period) , const(origin)) - Impl::template vector_const_const( + Impl::template vector_const_const( sources->get_data(), arg1.get(), arg2.get(), col_to->get_data(), null_map->get_data()); @@ -207,27 +208,25 @@ class FunctionDateTimeFloorCeil : public IFunction { const auto arg2_column = check_and_get_column>(*arg2_col); // time_round(datetime,const(period) , origin) - Impl::template vector_const_vector( + Impl::template vector_const_vector( sources->get_data(), arg1.get(), arg2_column->get_data(), col_to->get_data(), null_map->get_data()); } else if (!arg1_const && arg2_const) { Field arg2; arg2_col->get(0, arg2); - const auto arg1_column = - check_and_get_column>(*arg1_col); + const auto arg1_column = check_and_get_column>(*arg1_col); // time_round(datetime, period , const(origin)) - Impl::template vector_vector_const( + Impl::template vector_vector_const( sources->get_data(), arg1_column->get_data(), arg2.get(), col_to->get_data(), null_map->get_data()); } else { - const auto arg1_column = - check_and_get_column>(*arg1_col); + const auto arg1_column = check_and_get_column>(*arg1_col); const auto arg2_column = check_and_get_column>(*arg2_col); DCHECK(arg1_column != nullptr); DCHECK(arg2_column != nullptr); // time_round(datetime, period, origin) - Impl::template vector_vector( + Impl::template vector_vector( sources->get_data(), arg1_column->get_data(), arg2_column->get_data(), col_to->get_data(), 
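FunctionDateTimeFloorCeil selects its physical column types at compile time from the date value type via std::conditional_t, and after this change the delta type is fixed to Int32. A simplified two-way sketch of that selection; the real alias additionally distinguishes DateV2Value specializations (UInt32 vs. UInt64):

```cpp
#include <cstdint>
#include <type_traits>

// Simplified stand-ins for the Doris date value types.
struct VecDateTimeValue {};
struct DateV2Value {};

// Pick the physical column type at compile time from the date type,
// in the spirit of FunctionDateTimeFloorCeil's ReturnDataType alias.
template <typename DateValueType>
using NativeType = std::conditional_t<std::is_same_v<DateValueType, VecDateTimeValue>,
                                      int64_t,    // legacy datetime stored as Int64
                                      uint32_t>;  // date v2 stored as UInt32

static_assert(std::is_same_v<NativeType<VecDateTimeValue>, int64_t>);
static_assert(std::is_same_v<NativeType<DateV2Value>, uint32_t>);

int main() {
    return 0;
}
```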
null_map->get_data()); } @@ -289,8 +288,8 @@ struct FloorCeilImpl { } } - template - static void vector_constant_delta(const PaddedPODArray& dates, DeltaType period, + template + static void vector_constant_delta(const PaddedPODArray& dates, Int32 period, PaddedPODArray& res, NullMap& null_map) { // time_round(datetime,const(period)) if (period < 1) { @@ -313,7 +312,7 @@ struct FloorCeilImpl { } } - template + template static void vector_const_const_with_constant_optimization( const PaddedPODArray& dates, NativeType origin_date, PaddedPODArray& res, NullMap& null_map) { @@ -333,8 +332,8 @@ struct FloorCeilImpl { } } } - template - static void vector_const_const(const PaddedPODArray& dates, const DeltaType period, + template + static void vector_const_const(const PaddedPODArray& dates, const Int32 period, NativeType origin_date, PaddedPODArray& res, NullMap& null_map) { if (period < 1) { @@ -343,63 +342,63 @@ struct FloorCeilImpl { } switch (period) { case 1: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 2: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 3: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 4: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 5: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 6: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 7: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 8: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 9: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 10: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 11: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } case 12: { - vector_const_const_with_constant_optimization( - dates, origin_date, res, null_map); + vector_const_const_with_constant_optimization(dates, origin_date, res, + null_map); break; } default: @@ -420,8 +419,8 @@ struct FloorCeilImpl { } } - template - static void vector_const_vector(const PaddedPODArray& dates, const DeltaType period, + template + static void vector_const_vector(const PaddedPODArray& dates, const Int32 period, const PaddedPODArray& 
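The switch over periods 1 through 12 in vector_const_const() turns a small runtime period into a compile-time template argument, letting the compiler fold the constant into the inner loop. A minimal sketch of this dispatch-to-template pattern with a toy flooring operation (the arithmetic here is illustrative, not the actual time-rounding logic):

```cpp
#include <cstdio>
#include <vector>

// Inner loop with the period as a compile-time constant, so the compiler can
// strength-reduce the modulo, in the spirit of
// vector_const_const_with_constant_optimization<period>.
template <int Period>
void floor_to_period_const(const std::vector<int>& in, std::vector<int>& out) {
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); ++i) {
        out[i] = in[i] - in[i] % Period;
    }
}

// Dispatch the common small periods to the templated version; fall back to a
// plain runtime loop otherwise.
void floor_to_period(const std::vector<int>& in, int period, std::vector<int>& out) {
    switch (period) {
    case 1:  return floor_to_period_const<1>(in, out);
    case 2:  return floor_to_period_const<2>(in, out);
    case 3:  return floor_to_period_const<3>(in, out);
    case 12: return floor_to_period_const<12>(in, out);
    default:
        out.resize(in.size());
        for (size_t i = 0; i < in.size(); ++i) {
            out[i] = in[i] - in[i] % period;
        }
    }
}

int main() {
    std::vector<int> in = {5, 13, 27}, out;
    floor_to_period(in, 12, out);
    std::printf("%d %d %d\n", out[0], out[1], out[2]);  // 0 12 24
    return 0;
}
```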
origin_dates, PaddedPODArray& res, NullMap& null_map) { if (period < 1) { @@ -492,10 +491,10 @@ struct FloorCeilImpl { } } - template + template static void vector_vector(const PaddedPODArray& dates, - const PaddedPODArray& periods, - PaddedPODArray& res, NullMap& null_map) { + const PaddedPODArray& periods, PaddedPODArray& res, + NullMap& null_map) { // time_round(datetime, period) for (int i = 0; i < dates.size(); ++i) { if (periods[i] < 1) { @@ -517,9 +516,9 @@ struct FloorCeilImpl { } } - template + template static void vector_vector(const PaddedPODArray& dates, - const PaddedPODArray& periods, + const PaddedPODArray& periods, const PaddedPODArray& origin_dates, PaddedPODArray& res, NullMap& null_map) { // time_round(datetime, period, origin) @@ -931,53 +930,51 @@ struct TimeRound { } }; -#define TIME_ROUND_WITH_DELTA_TYPE(CLASS, NAME, UNIT, TYPE, DELTA) \ - using FunctionOneArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, VecDateTimeValue, DELTA, 1, \ - false>; \ - using FunctionTwoArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, VecDateTimeValue, DELTA, 2, \ - false>; \ - using FunctionThreeArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, VecDateTimeValue, DELTA, 3, \ - false>; \ - using FunctionDateV2OneArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, DELTA, 1, false>; \ - using FunctionDateV2TwoArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, DELTA, 2, false>; \ - using FunctionDateV2ThreeArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, DELTA, 3, false>; \ - using FunctionDateTimeV2OneArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, DELTA, 1, false>; \ - using FunctionDateTimeV2TwoArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, DELTA, 2, false>; \ - using FunctionDateTimeV2ThreeArg##CLASS##DELTA = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, DELTA, 3, false>; +#define TIME_ROUND_WITH_DELTA_TYPE(CLASS, NAME, UNIT, TYPE, DELTA) \ + using FunctionOneArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, VecDateTimeValue, 1, \ + false>; \ + using FunctionTwoArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, VecDateTimeValue, 2, \ + false>; \ + using FunctionThreeArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, VecDateTimeValue, 3, \ + false>; \ + using FunctionDateV2OneArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 1, false>; \ + using FunctionDateV2TwoArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 2, false>; \ + using FunctionDateV2ThreeArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 3, false>; \ + using FunctionDateTimeV2OneArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 1, false>; \ + using FunctionDateTimeV2TwoArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 2, false>; \ + using FunctionDateTimeV2ThreeArg##CLASS##DELTA = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 3, false>; -#define TIME_ROUND(CLASS, NAME, UNIT, TYPE) \ - struct CLASS { \ - static constexpr auto name = #NAME; \ - static constexpr TimeUnit Unit = UNIT; \ - static constexpr auto Type = TYPE; \ - }; \ - \ - TIME_ROUND_WITH_DELTA_TYPE(CLASS, NAME, UNIT, TYPE, Int32) \ - TIME_ROUND_WITH_DELTA_TYPE(CLASS, NAME, UNIT, TYPE, Int64) \ - using FunctionDateTimeV2TwoArg##CLASS = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, Int32, 2, true>; \ - using FunctionDateV2TwoArg##CLASS = \ - FunctionDateTimeFloorCeil>, \ - DateV2Value, Int32, 2, true>; \ - using FunctionDateTimeTwoArg##CLASS = \ - 
FunctionDateTimeFloorCeil>, VecDateTimeValue, Int32, 2, \ - true>; +#define TIME_ROUND(CLASS, NAME, UNIT, TYPE) \ + struct CLASS { \ + static constexpr auto name = #NAME; \ + static constexpr TimeUnit Unit = UNIT; \ + static constexpr auto Type = TYPE; \ + }; \ + \ + TIME_ROUND_WITH_DELTA_TYPE(CLASS, NAME, UNIT, TYPE, Int32) \ + using FunctionDateTimeV2TwoArg##CLASS = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 2, true>; \ + using FunctionDateV2TwoArg##CLASS = \ + FunctionDateTimeFloorCeil>, \ + DateV2Value, 2, true>; \ + using FunctionDateTimeTwoArg##CLASS = \ + FunctionDateTimeFloorCeil>, VecDateTimeValue, 2, true>; TIME_ROUND(YearFloor, year_floor, YEAR, FLOOR); TIME_ROUND(MonthFloor, month_floor, MONTH, FLOOR); @@ -1010,9 +1007,7 @@ void register_function_datetime_floor_ceil(SimpleFunctionFactory& factory) { factory.register_function(); \ factory.register_function(); -#define REGISTER_FUNC(CLASS) \ - REGISTER_FUNC_WITH_DELTA_TYPE(CLASS, Int32) \ - REGISTER_FUNC_WITH_DELTA_TYPE(CLASS, Int64) +#define REGISTER_FUNC(CLASS) REGISTER_FUNC_WITH_DELTA_TYPE(CLASS, Int32) REGISTER_FUNC(YearFloor); REGISTER_FUNC(MonthFloor); diff --git a/be/src/vec/functions/url/find_symbols.h b/be/src/vec/functions/url/find_symbols.h index 0fa0588e65689ec..4eafea893f878f1 100644 --- a/be/src/vec/functions/url/find_symbols.h +++ b/be/src/vec/functions/url/find_symbols.h @@ -362,8 +362,8 @@ inline const char* find_first_symbols_dispatch(const std::string_view haystack, template inline const char* find_first_symbols(const char* begin, const char* end) { - return detail::find_first_symbols_dispatch(begin, - end); + return ::detail::find_first_symbols_dispatch(begin, + end); } /// Returning non const result for non const arguments. @@ -371,93 +371,95 @@ inline const char* find_first_symbols(const char* begin, const char* end) { template inline char* find_first_symbols(char* begin, char* end) { return const_cast( - detail::find_first_symbols_dispatch(begin, - end)); + ::detail::find_first_symbols_dispatch( + begin, end)); } inline const char* find_first_symbols(std::string_view haystack, const SearchSymbols& symbols) { - return detail::find_first_symbols_dispatch(haystack, symbols); + return ::detail::find_first_symbols_dispatch(haystack, + symbols); } template inline const char* find_first_not_symbols(const char* begin, const char* end) { - return detail::find_first_symbols_dispatch(begin, - end); + return ::detail::find_first_symbols_dispatch( + begin, end); } template inline char* find_first_not_symbols(char* begin, char* end) { return const_cast( - detail::find_first_symbols_dispatch(begin, - end)); + ::detail::find_first_symbols_dispatch( + begin, end)); } inline const char* find_first_not_symbols(std::string_view haystack, const SearchSymbols& symbols) { - return detail::find_first_symbols_dispatch(haystack, symbols); + return ::detail::find_first_symbols_dispatch(haystack, + symbols); } template inline const char* find_first_symbols_or_null(const char* begin, const char* end) { - return detail::find_first_symbols_dispatch(begin, - end); + return ::detail::find_first_symbols_dispatch( + begin, end); } template inline char* find_first_symbols_or_null(char* begin, char* end) { return const_cast( - detail::find_first_symbols_dispatch( + ::detail::find_first_symbols_dispatch( begin, end)); } inline const char* find_first_symbols_or_null(std::string_view haystack, const SearchSymbols& symbols) { - return detail::find_first_symbols_dispatch(haystack, - symbols); + return 
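The TIME_ROUND / REGISTER_FUNC macros instantiate and register one function class per rounding unit, and after this change only the Int32 delta variants remain. A toy sketch of macro-driven registration into a factory map; the factory type and macro here are invented for illustration only:

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <string>

// A toy factory standing in for SimpleFunctionFactory.
std::map<std::string, std::function<int(int)>> factory;

// One macro invocation per rounding unit, loosely mirroring how the
// TIME_ROUND / REGISTER_FUNC macros expand into register_function calls.
#define REGISTER_FLOOR(NAME, UNIT) \
    factory[#NAME] = [](int v) { return v - v % (UNIT); };

int main() {
    REGISTER_FLOOR(hour_floor, 3600)
    REGISTER_FLOOR(minute_floor, 60)
    std::printf("%d\n", factory["hour_floor"](7300));   // 7200
    std::printf("%d\n", factory["minute_floor"](130));  // 120
    return 0;
}
```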
::detail::find_first_symbols_dispatch(haystack, + symbols); } template inline const char* find_first_not_symbols_or_null(const char* begin, const char* end) { - return detail::find_first_symbols_dispatch( + return ::detail::find_first_symbols_dispatch( begin, end); } template inline char* find_first_not_symbols_or_null(char* begin, char* end) { return const_cast( - detail::find_first_symbols_dispatch( + ::detail::find_first_symbols_dispatch( begin, end)); } inline const char* find_first_not_symbols_or_null(std::string_view haystack, const SearchSymbols& symbols) { - return detail::find_first_symbols_dispatch(haystack, - symbols); + return ::detail::find_first_symbols_dispatch(haystack, + symbols); } template inline const char* find_last_symbols_or_null(const char* begin, const char* end) { - return detail::find_last_symbols_sse2(begin, - end); + return ::detail::find_last_symbols_sse2(begin, + end); } template inline char* find_last_symbols_or_null(char* begin, char* end) { return const_cast( - detail::find_last_symbols_sse2(begin, - end)); + ::detail::find_last_symbols_sse2(begin, + end)); } template inline const char* find_last_not_symbols_or_null(const char* begin, const char* end) { - return detail::find_last_symbols_sse2(begin, - end); + return ::detail::find_last_symbols_sse2(begin, + end); } template inline char* find_last_not_symbols_or_null(char* begin, char* end) { return const_cast( - detail::find_last_symbols_sse2(begin, - end)); + ::detail::find_last_symbols_sse2( + begin, end)); } /// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer. diff --git a/be/src/vec/sink/autoinc_buffer.cpp b/be/src/vec/sink/autoinc_buffer.cpp index 4bc87dff48958c4..4c45b7bc6313d7b 100644 --- a/be/src/vec/sink/autoinc_buffer.cpp +++ b/be/src/vec/sink/autoinc_buffer.cpp @@ -30,6 +30,7 @@ #include "util/thrift_rpc_helper.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" AutoIncIDBuffer::AutoIncIDBuffer(int64_t db_id, int64_t table_id, int64_t column_id) : _db_id(db_id), diff --git a/be/src/vec/sink/autoinc_buffer.h b/be/src/vec/sink/autoinc_buffer.h index 032ac18981f4dae..82be3d9faad00f7 100644 --- a/be/src/vec/sink/autoinc_buffer.h +++ b/be/src/vec/sink/autoinc_buffer.h @@ -18,12 +18,14 @@ #pragma once #include +#include "common/cast_set.h" #include "common/config.h" #include "common/factory_creator.h" #include "common/status.h" #include "util/threadpool.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class VOlapTableSink; class OlapTableBlockConvertor; @@ -118,8 +120,8 @@ class GlobalAutoIncBuffers { GlobalAutoIncBuffers() { static_cast(ThreadPoolBuilder("AsyncFetchAutoIncIDExecutor") - .set_min_threads(config::auto_inc_fetch_thread_num) - .set_max_threads(config::auto_inc_fetch_thread_num) + .set_min_threads(cast_set(config::auto_inc_fetch_thread_num)) + .set_max_threads(cast_set(config::auto_inc_fetch_thread_num)) .set_max_queue_size(std::numeric_limits::max()) .build(&_fetch_autoinc_id_executor)); } @@ -146,4 +148,5 @@ class GlobalAutoIncBuffers { std::mutex _mutex; }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/delta_writer_v2_pool.cpp b/be/src/vec/sink/delta_writer_v2_pool.cpp index bc5233ac30796ea..c9ae424d4d4fe42 100644 --- a/be/src/vec/sink/delta_writer_v2_pool.cpp +++ b/be/src/vec/sink/delta_writer_v2_pool.cpp @@ -21,6 +21,7 @@ #include 
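The find_symbols.h changes qualify every helper call as ::detail:: so the global helper namespace is found even when the caller sits inside a Doris namespace that also contains a nested `detail`. A minimal demonstration of the lookup ambiguity that the leading :: resolves:

```cpp
#include <cstdio>

namespace detail {
const char* where() { return "global ::detail"; }
} // namespace detail

namespace doris {
namespace detail {
const char* where() { return "doris::detail"; }
} // namespace detail

void call() {
    // Unqualified lookup finds the nearest enclosing `detail` namespace.
    std::printf("%s\n", detail::where());    // prints "doris::detail"
    // The leading :: forces the global helper namespace, as in find_symbols.h.
    std::printf("%s\n", ::detail::where());  // prints "global ::detail"
}
} // namespace doris

int main() {
    doris::call();
    return 0;
}
```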
"util/runtime_profile.h" namespace doris { +#include "common/compile_check_begin.h" class TExpr; namespace vectorized { diff --git a/be/src/vec/sink/delta_writer_v2_pool.h b/be/src/vec/sink/delta_writer_v2_pool.h index 7e58eea31498f64..677d062f6b33630 100644 --- a/be/src/vec/sink/delta_writer_v2_pool.h +++ b/be/src/vec/sink/delta_writer_v2_pool.h @@ -51,6 +51,7 @@ #include "util/uid_util.h" namespace doris { +#include "common/compile_check_begin.h" class DeltaWriterV2; class RuntimeProfile; @@ -108,3 +109,5 @@ class DeltaWriterV2Pool { } // namespace vectorized } // namespace doris + +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/load_stream_map_pool.cpp b/be/src/vec/sink/load_stream_map_pool.cpp index dc78d306e70666d..24a1cb77a489cc3 100644 --- a/be/src/vec/sink/load_stream_map_pool.cpp +++ b/be/src/vec/sink/load_stream_map_pool.cpp @@ -20,6 +20,7 @@ #include "util/debug_points.h" namespace doris { +#include "common/compile_check_begin.h" class TExpr; LoadStreamMap::LoadStreamMap(UniqueId load_id, int64_t src_id, int num_streams, int num_use, diff --git a/be/src/vec/sink/load_stream_map_pool.h b/be/src/vec/sink/load_stream_map_pool.h index 4ecae2f16bea7f5..f1ed7b0da16ba3f 100644 --- a/be/src/vec/sink/load_stream_map_pool.h +++ b/be/src/vec/sink/load_stream_map_pool.h @@ -65,6 +65,7 @@ #include "vec/sink/load_stream_stub.h" namespace doris { +#include "common/compile_check_begin.h" class LoadStreamStub; @@ -136,3 +137,5 @@ class LoadStreamMapPool { }; } // namespace doris + +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/load_stream_stub.cpp b/be/src/vec/sink/load_stream_stub.cpp index 2a38b179b95531f..979daf6a85e6821 100644 --- a/be/src/vec/sink/load_stream_stub.cpp +++ b/be/src/vec/sink/load_stream_stub.cpp @@ -19,6 +19,7 @@ #include +#include "common/cast_set.h" #include "olap/rowset/rowset_writer.h" #include "runtime/query_context.h" #include "util/brpc_client_cache.h" @@ -28,6 +29,7 @@ #include "util/uid_util.h" namespace doris { +#include "common/compile_check_begin.h" int LoadStreamReplyHandler::on_received_messages(brpc::StreamId id, butil::IOBuf* const messages[], size_t size) { @@ -92,7 +94,7 @@ int LoadStreamReplyHandler::on_received_messages(brpc::StreamId id, butil::IOBuf TRuntimeProfileTree tprofile; const uint8_t* buf = reinterpret_cast(response.load_stream_profile().data()); - uint32_t len = response.load_stream_profile().size(); + uint32_t len = cast_set(response.load_stream_profile().size()); auto status = deserialize_thrift_msg(buf, &len, false, &tprofile); if (status.ok()) { // TODO @@ -154,7 +156,7 @@ Status LoadStreamStub::open(BrpcClientCache* client_cache, _is_init.store(true); _dst_id = node_info.id; brpc::StreamOptions opt; - opt.max_buf_size = config::load_stream_max_buf_size; + opt.max_buf_size = cast_set(config::load_stream_max_buf_size); opt.idle_timeout_ms = idle_timeout_ms; opt.messages_in_batch = config::load_stream_messages_in_batch; opt.handler = new LoadStreamReplyHandler(_load_id, _dst_id, shared_from_this()); @@ -213,7 +215,7 @@ Status LoadStreamStub::open(BrpcClientCache* client_cache, // APPEND_DATA Status LoadStreamStub::append_data(int64_t partition_id, int64_t index_id, int64_t tablet_id, - int64_t segment_id, uint64_t offset, std::span data, + int32_t segment_id, uint64_t offset, std::span data, bool segment_eos, FileType file_type) { if (!_is_open.load()) { add_failed_tablet(tablet_id, _status); @@ -240,7 +242,7 @@ Status LoadStreamStub::append_data(int64_t partition_id, int64_t index_id, int64 // 
ADD_SEGMENT Status LoadStreamStub::add_segment(int64_t partition_id, int64_t index_id, int64_t tablet_id, - int64_t segment_id, const SegmentStatistics& segment_stat, + int32_t segment_id, const SegmentStatistics& segment_stat, TabletSchemaSPtr flush_schema) { if (!_is_open.load()) { add_failed_tablet(tablet_id, _status); diff --git a/be/src/vec/sink/load_stream_stub.h b/be/src/vec/sink/load_stream_stub.h index 9816770c82e6725..cad7a90492ad324 100644 --- a/be/src/vec/sink/load_stream_stub.h +++ b/be/src/vec/sink/load_stream_stub.h @@ -69,6 +69,7 @@ #include "vec/exprs/vexpr_fwd.h" namespace doris { +#include "common/compile_check_begin.h" class TabletSchema; class LoadStreamStub; @@ -133,15 +134,18 @@ class LoadStreamStub : public std::enable_shared_from_this { #ifdef BE_TEST virtual #endif + // segment_id is limited by max_segment_num_per_rowset (default value of 1000), + // so in practice it will not exceed the range of i16. + // APPEND_DATA Status append_data(int64_t partition_id, int64_t index_id, int64_t tablet_id, - int64_t segment_id, uint64_t offset, std::span data, + int32_t segment_id, uint64_t offset, std::span data, bool segment_eos = false, FileType file_type = FileType::SEGMENT_FILE); // ADD_SEGMENT Status add_segment(int64_t partition_id, int64_t index_id, int64_t tablet_id, - int64_t segment_id, const SegmentStatistics& segment_stat, + int32_t segment_id, const SegmentStatistics& segment_stat, TabletSchemaSPtr flush_schema); // CLOSE_LOAD @@ -335,3 +339,5 @@ class LoadStreamStubs { }; } // namespace doris + +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/varrow_flight_result_writer.cpp b/be/src/vec/sink/varrow_flight_result_writer.cpp index b23d1668465bbd0..77788c52ef39fcd 100644 --- a/be/src/vec/sink/varrow_flight_result_writer.cpp +++ b/be/src/vec/sink/varrow_flight_result_writer.cpp @@ -25,6 +25,7 @@ #include "vec/exprs/vexpr_context.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { VArrowFlightResultWriter::VArrowFlightResultWriter( diff --git a/be/src/vec/sink/varrow_flight_result_writer.h b/be/src/vec/sink/varrow_flight_result_writer.h index ab2578421c80bcf..fcae0350b822c9b 100644 --- a/be/src/vec/sink/varrow_flight_result_writer.h +++ b/be/src/vec/sink/varrow_flight_result_writer.h @@ -30,6 +30,7 @@ #include "vec/exprs/vexpr_fwd.h" namespace doris { +#include "common/compile_check_begin.h" class BufferControlBlock; class RuntimeState; @@ -77,3 +78,5 @@ class VArrowFlightResultWriter final : public ResultWriter { }; } // namespace vectorized } // namespace doris + +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/vdata_stream_sender.cpp b/be/src/vec/sink/vdata_stream_sender.cpp index ac820bcab2929a9..2cace094e4d1c85 100644 --- a/be/src/vec/sink/vdata_stream_sender.cpp +++ b/be/src/vec/sink/vdata_stream_sender.cpp @@ -53,6 +53,7 @@ #include "vec/sink/writer/vtablet_writer_v2.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" Status Channel::init(RuntimeState* state) { if (_brpc_dest_addr.hostname.empty()) { @@ -95,7 +96,7 @@ Status Channel::open(RuntimeState* state) { } _be_number = state->be_number(); - _brpc_timeout_ms = std::min(3600, state->execution_timeout()) * 1000; + _brpc_timeout_ms = get_execution_rpc_timeout_ms(state->execution_timeout()); _serializer.set_is_local(_is_local); @@ -238,14 +239,13 @@ Status BlockSerializer::next_serialized_block(Block* block, PBlock* dest, size_t } { + SCOPED_TIMER(_parent->merge_block_timer()); if (rows) { if (!rows->empty()) { - 
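The inline expression `std::min(3600, state->execution_timeout()) * 1000` is replaced below by a shared get_execution_rpc_timeout_ms() helper. A sketch of what the removed expression computed, capping the query timeout at an hour and converting seconds to milliseconds; the actual helper in the BE runtime may apply a different cap:

```cpp
#include <algorithm>
#include <cstdio>

// Sketch of the behavior the removed inline expression had: cap the execution
// timeout at 3600 seconds and convert it to milliseconds for the brpc channel.
int get_execution_rpc_timeout_ms(int execution_timeout_sec) {
    return std::min(3600, execution_timeout_sec) * 1000;
}

int main() {
    std::printf("%d\n", get_execution_rpc_timeout_ms(300));    // 300000
    std::printf("%d\n", get_execution_rpc_timeout_ms(86400));  // capped at 3600000
    return 0;
}
```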
SCOPED_TIMER(_parent->split_block_distribute_by_channel_timer()); const auto* begin = rows->data(); RETURN_IF_ERROR(_mutable_block->add_rows(block, begin, begin + rows->size())); } } else if (!block->empty()) { - SCOPED_TIMER(_parent->merge_block_timer()); RETURN_IF_ERROR(_mutable_block->merge(*block)); } } diff --git a/be/src/vec/sink/vdata_stream_sender.h b/be/src/vec/sink/vdata_stream_sender.h index 88bb804fd8004f8..5fe35e4da119d03 100644 --- a/be/src/vec/sink/vdata_stream_sender.h +++ b/be/src/vec/sink/vdata_stream_sender.h @@ -53,6 +53,7 @@ #include "vec/sink/vtablet_finder.h" namespace doris { +#include "common/compile_check_begin.h" class ObjectPool; class RuntimeState; class MemTracker; @@ -86,6 +87,7 @@ class BlockSerializer { void reset_block() { _mutable_block.reset(); } void set_is_local(bool is_local) { _is_local = is_local; } + bool is_local() const { return _is_local; } private: pipeline::ExchangeSinkLocalState* _parent; @@ -232,3 +234,5 @@ class Channel { } // namespace vectorized } // namespace doris + +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/vmysql_result_writer.cpp b/be/src/vec/sink/vmysql_result_writer.cpp index 932ee9555907655..d1e5baa860a054c 100644 --- a/be/src/vec/sink/vmysql_result_writer.cpp +++ b/be/src/vec/sink/vmysql_result_writer.cpp @@ -29,6 +29,7 @@ #include #include +#include "common/cast_set.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "gutil/integral_types.h" @@ -70,6 +71,7 @@ #include "vec/runtime/vdatetime_value.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { template @@ -143,7 +145,7 @@ Status VMysqlResultWriter::_set_options( template Status VMysqlResultWriter::_write_one_block(RuntimeState* state, Block& block) { Status status = Status::OK(); - auto num_rows = block.rows(); + int num_rows = cast_set(block.rows()); // convert one batch auto result = std::make_unique(); result->result_batch.rows.resize(num_rows); @@ -200,7 +202,7 @@ Status VMysqlResultWriter::_write_one_block(RuntimeState* stat } } - for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) { + for (int row_idx = 0; row_idx < num_rows; ++row_idx) { for (size_t col_idx = 0; col_idx < num_cols; ++col_idx) { RETURN_IF_ERROR(arguments[col_idx].serde->write_column_to_mysql( *(arguments[col_idx].column), row_buffer, row_idx, diff --git a/be/src/vec/sink/vmysql_result_writer.h b/be/src/vec/sink/vmysql_result_writer.h index b89b8cf1b9086af..1c7da4f4b377800 100644 --- a/be/src/vec/sink/vmysql_result_writer.h +++ b/be/src/vec/sink/vmysql_result_writer.h @@ -31,6 +31,7 @@ #include "vec/exprs/vexpr_fwd.h" namespace doris { +#include "common/compile_check_begin.h" class BufferControlBlock; class RuntimeState; @@ -96,3 +97,5 @@ class VMysqlResultWriter final : public ResultWriter { }; } // namespace vectorized } // namespace doris + +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/vrow_distribution.cpp b/be/src/vec/sink/vrow_distribution.cpp index b79df49f0626d6e..3c15dbc8f0f81ed 100644 --- a/be/src/vec/sink/vrow_distribution.cpp +++ b/be/src/vec/sink/vrow_distribution.cpp @@ -42,6 +42,7 @@ #include "vec/sink/writer/vtablet_writer.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" std::pair VRowDistribution::_get_partition_function() { @@ -299,7 +300,7 @@ Status VRowDistribution::_filter_block(vectorized::Block* block, Status VRowDistribution::_generate_rows_distribution_for_non_auto_partition( vectorized::Block* block, bool has_filtered_rows, 
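Hoisting SCOPED_TIMER(_parent->merge_block_timer()) above the if/else attributes both the add_rows path and the merge path to the same counter, because the macro is an RAII guard that adds the elapsed time when the scope exits. A simplified sketch of such a scoped timer; the real one is defined in the BE's runtime profile utilities and differs in detail:

```cpp
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

// Simplified RAII timer in the spirit of SCOPED_TIMER / SCOPED_RAW_TIMER:
// elapsed nanoseconds are added to the target counter on scope exit.
class ScopedRawTimer {
public:
    explicit ScopedRawTimer(std::atomic<int64_t>* target)
            : _target(target), _start(std::chrono::steady_clock::now()) {}
    ~ScopedRawTimer() {
        auto elapsed = std::chrono::steady_clock::now() - _start;
        *_target += std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
    }

private:
    std::atomic<int64_t>* _target;
    std::chrono::steady_clock::time_point _start;
};

std::atomic<int64_t> merge_block_timer_ns {0};

int main() {
    {
        ScopedRawTimer timer(&merge_block_timer_ns);  // covers the whole block
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
    }
    std::printf("merge block time: %lld ns\n",
                static_cast<long long>(merge_block_timer_ns.load()));
    return 0;
}
```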
std::vector& row_part_tablet_ids) { - auto num_rows = block->rows(); + int num_rows = cast_set(block->rows()); bool stop_processing = false; RETURN_IF_ERROR(_tablet_finder->find_tablets(_state, block, num_rows, _partitions, @@ -318,7 +319,7 @@ Status VRowDistribution::_deal_missing_map(vectorized::Block* block, int64_t& rows_stat_val) { // for missing partition keys, calc the missing partition and save in _partitions_need_create auto [part_ctxs, part_exprs] = _get_partition_function(); - auto part_col_num = part_exprs.size(); + int part_col_num = cast_set(part_exprs.size()); // the two vectors are in column-first-order std::vector> col_strs; std::vector col_null_maps; @@ -363,7 +364,7 @@ Status VRowDistribution::_generate_rows_distribution_for_auto_partition( vectorized::Block* block, const std::vector& partition_cols_idx, bool has_filtered_rows, std::vector& row_part_tablet_ids, int64_t& rows_stat_val) { - auto num_rows = block->rows(); + int num_rows = cast_set(block->rows()); std::vector partition_keys = _vpartition->get_partition_keys(); auto& partition_col = block->get_by_position(partition_keys[0]); @@ -393,7 +394,7 @@ Status VRowDistribution::_generate_rows_distribution_for_auto_overwrite( vectorized::Block* block, const std::vector& partition_cols_idx, bool has_filtered_rows, std::vector& row_part_tablet_ids, int64_t& rows_stat_val) { - auto num_rows = block->rows(); + int num_rows = cast_set(block->rows()); // for non-auto-partition situation, goes into two 'else' branch. just find the origin partitions, replace them by rpc, // and find the new partitions to use. @@ -504,7 +505,7 @@ Status VRowDistribution::generate_rows_distribution( VLOG_DEBUG << "Partition-calculated block:" << block->dump_data(0, 1); DCHECK(result_idx != -1); - partition_cols_idx.push_back(result_idx); + partition_cols_idx.push_back(cast_set(result_idx)); } // change the column to compare to transformed. 
diff --git a/be/src/vec/sink/vrow_distribution.h b/be/src/vec/sink/vrow_distribution.h index 88002c3c21139d8..87fd801984ad735 100644 --- a/be/src/vec/sink/vrow_distribution.h +++ b/be/src/vec/sink/vrow_distribution.h @@ -40,6 +40,7 @@ #include "vec/sink/vtablet_finder.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class IndexChannel; class VNodeChannel; @@ -230,3 +231,5 @@ class VRowDistribution { }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/vtablet_block_convertor.cpp b/be/src/vec/sink/vtablet_block_convertor.cpp index 820759af2e41b0f..26de6ea6c7e3d18 100644 --- a/be/src/vec/sink/vtablet_block_convertor.cpp +++ b/be/src/vec/sink/vtablet_block_convertor.cpp @@ -55,6 +55,7 @@ #include "vec/exprs/vexpr_context.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" Status OlapTableBlockConvertor::validate_and_convert_block( RuntimeState* state, vectorized::Block* input_block, @@ -186,11 +187,11 @@ DecimalType OlapTableBlockConvertor::_get_decimalv3_min_or_max(const TypeDescrip Status OlapTableBlockConvertor::_internal_validate_column( RuntimeState* state, const TypeDescriptor& type, bool is_nullable, vectorized::ColumnPtr column, size_t slot_index, bool* stop_processing, - fmt::memory_buffer& error_prefix, const uint32_t row_count, + fmt::memory_buffer& error_prefix, const size_t row_count, vectorized::IColumn::Permutation* rows) { DCHECK((rows == nullptr) || (rows->size() == row_count)); fmt::memory_buffer error_msg; - auto set_invalid_and_append_error_msg = [&](int row) { + auto set_invalid_and_append_error_msg = [&](size_t row) { _filter_map[row] = true; auto ret = state->append_error_msg_to_file([]() -> std::string { return ""; }, [&error_prefix, &error_msg]() -> std::string { @@ -218,7 +219,7 @@ Status OlapTableBlockConvertor::_internal_validate_column( auto* __restrict offsets = column_string->get_offsets().data(); int invalid_count = 0; - for (int j = 0; j < row_count; ++j) { + for (int64_t j = 0; j < row_count; ++j) { invalid_count += (offsets[j] - offsets[j - 1]) > limit; } @@ -452,7 +453,7 @@ Status OlapTableBlockConvertor::_internal_validate_column( } Status OlapTableBlockConvertor::_validate_data(RuntimeState* state, vectorized::Block* block, - const uint32_t rows, int& filtered_rows, + const size_t rows, int& filtered_rows, bool* stop_processing) { for (int i = 0; i < _output_tuple_desc->slots().size(); ++i) { SlotDescriptor* desc = _output_tuple_desc->slots()[i]; diff --git a/be/src/vec/sink/vtablet_block_convertor.h b/be/src/vec/sink/vtablet_block_convertor.h index 7f866c380327752..16921e082dcd62e 100644 --- a/be/src/vec/sink/vtablet_block_convertor.h +++ b/be/src/vec/sink/vtablet_block_convertor.h @@ -36,6 +36,7 @@ #include "vec/sink/autoinc_buffer.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class OlapTableBlockConvertor { public: @@ -68,7 +69,7 @@ class OlapTableBlockConvertor { Status _validate_column(RuntimeState* state, const TypeDescriptor& type, bool is_nullable, vectorized::ColumnPtr column, size_t slot_index, bool* stop_processing, - fmt::memory_buffer& error_prefix, const uint32_t row_count, + fmt::memory_buffer& error_prefix, const size_t row_count, vectorized::IColumn::Permutation* rows = nullptr) { RETURN_IF_CATCH_EXCEPTION({ return _internal_validate_column(state, type, is_nullable, column, slot_index, @@ -79,14 +80,14 @@ class OlapTableBlockConvertor { Status _internal_validate_column(RuntimeState* state, const TypeDescriptor& 
type, bool is_nullable, vectorized::ColumnPtr column, size_t slot_index, bool* stop_processing, - fmt::memory_buffer& error_prefix, const uint32_t row_count, + fmt::memory_buffer& error_prefix, const size_t row_count, vectorized::IColumn::Permutation* rows = nullptr); // make input data valid for OLAP table // return number of invalid/filtered rows. // invalid row number is set in Bitmap // set stop_processing if we want to stop the whole process now. - Status _validate_data(RuntimeState* state, vectorized::Block* block, const uint32_t rows, + Status _validate_data(RuntimeState* state, vectorized::Block* block, const size_t rows, int& filtered_rows, bool* stop_processing); // some output column of output expr may have different nullable property with dest slot desc @@ -123,4 +124,5 @@ class OlapTableBlockConvertor { bool _is_partial_update_and_auto_inc = false; }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/vtablet_finder.cpp b/be/src/vec/sink/vtablet_finder.cpp index 3bfd5bb4d22e96c..c72da75d02a29ba 100644 --- a/be/src/vec/sink/vtablet_finder.cpp +++ b/be/src/vec/sink/vtablet_finder.cpp @@ -32,6 +32,7 @@ #include "vec/core/block.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" Status OlapTabletFinder::find_tablets(RuntimeState* state, Block* block, int rows, std::vector& partitions, std::vector& tablet_index, bool& stop_processing, diff --git a/be/src/vec/sink/vtablet_finder.h b/be/src/vec/sink/vtablet_finder.h index 24f8e357e28976c..67eb7f08e3e93de 100644 --- a/be/src/vec/sink/vtablet_finder.h +++ b/be/src/vec/sink/vtablet_finder.h @@ -27,6 +27,7 @@ #include "vec/core/block.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class OlapTabletFinder { public: @@ -75,4 +76,5 @@ class OlapTabletFinder { Bitmap _filter_bitmap; }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/sink/writer/vjdbc_table_writer.cpp b/be/src/vec/sink/writer/vjdbc_table_writer.cpp index d54768e58fe3c6e..8c24f4746adf83d 100644 --- a/be/src/vec/sink/writer/vjdbc_table_writer.cpp +++ b/be/src/vec/sink/writer/vjdbc_table_writer.cpp @@ -52,7 +52,6 @@ JdbcConnectorParam VJdbcTableWriter::create_connect_param(const doris::TDataSink jdbc_param.connection_pool_max_wait_time = t_jdbc_sink.jdbc_table.connection_pool_max_wait_time; jdbc_param.connection_pool_max_life_time = t_jdbc_sink.jdbc_table.connection_pool_max_life_time; jdbc_param.connection_pool_keep_alive = t_jdbc_sink.jdbc_table.connection_pool_keep_alive; - jdbc_param.enable_connection_pool = t_jdbc_sink.jdbc_table.enable_connection_pool; return jdbc_param; } diff --git a/be/test/io/cache/block_file_cache_test.cpp b/be/test/io/cache/block_file_cache_test.cpp index f77dc439e955946..11e99a4805286f7 100644 --- a/be/test/io/cache/block_file_cache_test.cpp +++ b/be/test/io/cache/block_file_cache_test.cpp @@ -81,7 +81,7 @@ constexpr unsigned long long operator"" _kb(unsigned long long m) { void assert_range([[maybe_unused]] size_t assert_n, io::FileBlockSPtr file_block, const io::FileBlock::Range& expected_range, io::FileBlock::State expected_state) { auto range = file_block->range(); - + std::cout << "assert_range num: " << assert_n << std::endl; ASSERT_EQ(range.left, expected_range.left); ASSERT_EQ(range.right, expected_range.right); ASSERT_EQ(file_block->state(), expected_state); @@ -139,7 +139,6 
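The string-length validation in _internal_validate_column() counts oversized rows from the column's cumulative end offsets: the length of row j is offsets[j] - offsets[j - 1]. A standalone sketch of that counting loop; Doris' PaddedPODArray allows reading offsets[j - 1] even at j == 0, so this sketch keeps an explicit leading zero instead:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Count rows whose string exceeds a length limit using cumulative end offsets,
// mirroring the invalid_count loop in the validation code.
int count_oversized_rows(const std::vector<std::string>& rows, size_t limit) {
    std::vector<size_t> offsets;
    offsets.push_back(0);
    for (const auto& s : rows) {
        offsets.push_back(offsets.back() + s.size());
    }

    int invalid_count = 0;
    for (size_t j = 1; j < offsets.size(); ++j) {
        invalid_count += (offsets[j] - offsets[j - 1]) > limit;
    }
    return invalid_count;
}

int main() {
    std::printf("%d\n", count_oversized_rows({"ok", "too long value", "x"}, 5));  // 1
    return 0;
}
```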
@@ class BlockFileCacheTest : public testing::Test { public: static void SetUpTestSuite() { config::file_cache_enter_disk_resource_limit_mode_percent = 99; - config::enable_ttl_cache_evict_using_lru = false; bool exists {false}; ASSERT_TRUE(global_local_filesystem()->exists(caches_dir, &exists).ok()); if (!exists) { @@ -1110,8 +1109,10 @@ TEST_F(BlockFileCacheTest, max_ttl_size) { query_id.hi = 1; query_id.lo = 1; io::FileCacheSettings settings; - settings.query_queue_size = 100000000; - settings.query_queue_elements = 100000; + settings.query_queue_size = 50000000; + settings.query_queue_elements = 50000; + settings.ttl_queue_size = 50000000; + settings.ttl_queue_elements = 50000; settings.capacity = 100000000; settings.max_file_block_size = 100000; settings.max_query_cache_size = 30; @@ -1136,7 +1137,7 @@ TEST_F(BlockFileCacheTest, max_ttl_size) { auto holder = cache.get_or_set(key1, offset, 100000, context); auto blocks = fromHolder(holder); ASSERT_EQ(blocks.size(), 1); - if (offset < 90000000) { + if (offset < 50000000) { assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), io::FileBlock::State::EMPTY); ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); @@ -1145,7 +1146,79 @@ TEST_F(BlockFileCacheTest, max_ttl_size) { io::FileBlock::State::DOWNLOADED); } else { assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), - io::FileBlock::State::SKIP_CACHE); + io::FileBlock::State::EMPTY); + } + blocks.clear(); + } + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, max_ttl_size_with_other_cache_exist) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + settings.query_queue_size = 50000000; + settings.query_queue_elements = 50000; + settings.ttl_queue_size = 50000000; + settings.ttl_queue_elements = 50000; + settings.capacity = 100000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + auto key1 = io::BlockFileCache::hash("key5"); + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + // populate the cache with other cache type + io::CacheContext context; + context.cache_type = io::FileCacheType::NORMAL; + context.query_id = query_id; + int64_t offset = 100000000; + for (; offset < 180000000; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + blocks.clear(); + } + + // then get started with TTL + context.cache_type = io::FileCacheType::TTL; + context.query_id = query_id; + int64_t cur_time = UnixSeconds(); + context.expiration_time = cur_time + 120; + offset = 0; + for (; offset < 100000000; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + if (offset < 50000000) { + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == 
io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + } else { + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); } blocks.clear(); } @@ -1195,7 +1268,7 @@ TEST_F(BlockFileCacheTest, max_ttl_size_memory_storage) { io::FileBlock::State::DOWNLOADED); } else { assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), - io::FileBlock::State::SKIP_CACHE); + io::FileBlock::State::EMPTY); } blocks.clear(); } @@ -2065,7 +2138,9 @@ TEST_F(BlockFileCacheTest, ttl_normal) { io::FileCacheSettings settings; settings.query_queue_size = 50; settings.query_queue_elements = 5; - settings.capacity = 50; + settings.ttl_queue_size = 50; + settings.ttl_queue_elements = 5; + settings.capacity = 100; settings.max_file_block_size = 30; settings.max_query_cache_size = 30; io::CacheContext context; @@ -2160,7 +2235,9 @@ TEST_F(BlockFileCacheTest, ttl_modify) { io::FileCacheSettings settings; settings.query_queue_size = 30; settings.query_queue_elements = 5; - settings.capacity = 30; + settings.ttl_queue_size = 30; + settings.ttl_queue_elements = 5; + settings.capacity = 60; settings.max_file_block_size = 30; settings.max_query_cache_size = 30; io::CacheContext context; @@ -2314,7 +2391,9 @@ TEST_F(BlockFileCacheTest, ttl_change_to_normal) { io::FileCacheSettings settings; settings.query_queue_size = 30; settings.query_queue_elements = 5; - settings.capacity = 30; + settings.ttl_queue_size = 30; + settings.ttl_queue_elements = 5; + settings.capacity = 60; settings.max_file_block_size = 30; settings.max_query_cache_size = 30; io::CacheContext context; @@ -2428,7 +2507,9 @@ TEST_F(BlockFileCacheTest, ttl_change_expiration_time) { io::FileCacheSettings settings; settings.query_queue_size = 30; settings.query_queue_elements = 5; - settings.capacity = 30; + settings.ttl_queue_size = 30; + settings.ttl_queue_elements = 5; + settings.capacity = 60; settings.max_file_block_size = 30; settings.max_query_cache_size = 30; io::CacheContext context; @@ -2450,6 +2531,16 @@ TEST_F(BlockFileCacheTest, ttl_change_expiration_time) { auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] auto blocks = fromHolder(holder); ASSERT_EQ(blocks.size(), 1); + // std::cout << "current cache size:" << cache.get_used_cache_size() << std::endl; + std::cout << "cache capacity:" << cache.capacity() << std::endl; + auto map = cache.get_stats_unsafe(); + for (auto& [key, value] : map) { + std::cout << key << " : " << value << std::endl; + } + auto key1 = io::BlockFileCache::hash("key1"); + std::cout << cache.dump_structure(key1) << std::endl; + std::cout << cache.dump_structure(key2) << std::endl; + assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); download(blocks[0]); @@ -2532,105 +2623,6 @@ TEST_F(BlockFileCacheTest, ttl_change_expiration_time_memory_storage) { } } -TEST_F(BlockFileCacheTest, ttl_reverse) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - test_file_cache(io::FileCacheType::NORMAL); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 36; - settings.query_queue_elements = 5; - settings.capacity = 36; - settings.max_file_block_size = 7; - settings.max_query_cache_size = 
30; - io::CacheContext context; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 180; - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - ASSERT_TRUE(cache.get_async_open_success()); - for (size_t offset = 0; offset < 30; offset += 6) { - auto holder = cache.get_or_set(key2, offset, 6, context); - auto blocks = fromHolder(holder); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(blocks[0]); - } - { - auto holder = cache.get_or_set(key2, 50, 7, context); /// Add range [50, 57] - auto blocks = fromHolder(holder); - assert_range(1, blocks[0], io::FileBlock::Range(50, 56), io::FileBlock::State::SKIP_CACHE); - } - { - context.cache_type = io::FileCacheType::NORMAL; - auto holder = cache.get_or_set(key2, 50, 7, context); /// Add range [50, 57] - auto blocks = fromHolder(holder); - assert_range(1, blocks[0], io::FileBlock::Range(50, 56), io::FileBlock::State::SKIP_CACHE); - } - - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } -} - -TEST_F(BlockFileCacheTest, ttl_reverse_memory_storage) { - test_file_cache_memory_storage(io::FileCacheType::NORMAL); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 36; - settings.query_queue_elements = 5; - settings.capacity = 36; - settings.max_file_block_size = 7; - settings.max_query_cache_size = 30; - settings.storage = "memory"; - io::CacheContext context; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 180; - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - ASSERT_TRUE(cache.get_async_open_success()); - for (size_t offset = 0; offset < 30; offset += 6) { - auto holder = cache.get_or_set(key2, offset, 6, context); - auto blocks = fromHolder(holder); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download_into_memory(blocks[0]); - } - { - auto holder = cache.get_or_set(key2, 50, 7, context); /// Add range [50, 57] - auto blocks = fromHolder(holder); - assert_range(1, blocks[0], io::FileBlock::Range(50, 56), io::FileBlock::State::SKIP_CACHE); - } - { - context.cache_type = io::FileCacheType::NORMAL; - auto holder = cache.get_or_set(key2, 50, 7, context); /// Add range [50, 57] - auto blocks = fromHolder(holder); - assert_range(1, blocks[0], io::FileBlock::Range(50, 56), io::FileBlock::State::SKIP_CACHE); - } -} - TEST_F(BlockFileCacheTest, io_error) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -2906,7 +2898,8 @@ TEST_F(BlockFileCacheTest, recyle_cache_async) { cache.clear_file_cache_async(); while (cache._async_clear_file_cache) ; - EXPECT_EQ(cache._cur_cache_size, 5); + EXPECT_EQ(cache._cur_cache_size, 20); // 0-4 is used again, so all the cache data in DISPOSABLE + // remain unremoved if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } @@ -4838,10 
+4831,7 @@ TEST_F(BlockFileCacheTest, recyle_unvalid_ttl_async) { } } -TEST_F(BlockFileCacheTest, ttl_reserve_wo_evict_using_lru) { - config::file_cache_ttl_valid_check_interval_second = 4; - config::enable_ttl_cache_evict_using_lru = false; - +TEST_F(BlockFileCacheTest, reset_capacity) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } @@ -4854,18 +4844,26 @@ settings.query_queue_elements = 5; settings.index_queue_size = 30; settings.index_queue_elements = 5; - settings.disposable_queue_size = 0; - settings.disposable_queue_elements = 0; - settings.capacity = 60; + settings.disposable_queue_size = 30; + settings.disposable_queue_elements = 5; + settings.capacity = 90; settings.max_file_block_size = 30; settings.max_query_cache_size = 30; io::CacheContext context; context.query_id = query_id; auto key = io::BlockFileCache::hash("key1"); + auto key2 = io::BlockFileCache::hash("key2"); io::BlockFileCache cache(cache_base_path, settings); - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; - + auto sp = SyncPoint::get_instance(); + Defer defer {[sp] { + sp->clear_call_back("BlockFileCache::set_remove_batch"); + sp->clear_call_back("BlockFileCache::set_sleep_time"); + }}; + sp->set_call_back("BlockFileCache::set_sleep_time", + [](auto&& args) { *try_any_cast(args[0]) = 1; }); + sp->set_call_back("BlockFileCache::set_remove_batch", + [](auto&& args) { *try_any_cast(args[0]) = 2; }); + sp->enable_processing(); ASSERT_TRUE(cache.initialize()); for (int i = 0; i < 100; i++) { if (cache.get_async_open_success()) { @@ -4873,7 +4871,8 @@ }; std::this_thread::sleep_for(std::chrono::milliseconds(1)); } - for (int64_t offset = 0; offset < (60 * config::max_ttl_cache_ratio / 100 - 5); offset += 5) { + for (int64_t offset = 0; offset < 45; offset += 5) { + context.cache_type = static_cast<io::FileCacheType>((offset / 5) % 3); auto holder = cache.get_or_set(key, offset, 5, context); auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); @@ -4885,50 +4884,55 @@ io::FileBlock::State::DOWNLOADED); } context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; - for (int64_t offset = 60; offset < 70; offset += 5) { - auto holder = cache.get_or_set(key, offset, 5, context); + int64_t cur_time = UnixSeconds(); + context.expiration_time = cur_time + 120; + for (int64_t offset = 45; offset < 90; offset += 5) { + auto holder = cache.get_or_set(key2, offset, 5, context); auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::SKIP_CACHE); + io::FileBlock::State::EMPTY); + ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(segments[0]); + assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), + io::FileBlock::State::DOWNLOADED); } - - EXPECT_EQ(cache._cur_cache_size, 50); - EXPECT_EQ(cache._ttl_queue.cache_size, 0); + std::cout << cache.reset_capacity(30) << std::endl; + while (cache._async_clear_file_cache) + ; + EXPECT_EQ(cache._cur_cache_size, 30); if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } } -TEST_F(BlockFileCacheTest, ttl_reserve_with_evict_using_lru) { - config::file_cache_ttl_valid_check_interval_second = 4; - 
config::enable_ttl_cache_evict_using_lru = true; - +TEST_F(BlockFileCacheTest, change_cache_type1) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } fs::create_directories(cache_base_path); + auto sp = SyncPoint::get_instance(); + sp->set_call_back("FileBlock::change_cache_type", [](auto&& args) { + *try_any_cast<Status*>(args[0]) = Status::IOError("inject io error"); + }); + sp->enable_processing(); TUniqueId query_id; query_id.hi = 1; query_id.lo = 1; io::FileCacheSettings settings; settings.query_queue_size = 30; settings.query_queue_elements = 5; - settings.index_queue_size = 30; - settings.index_queue_elements = 5; - settings.disposable_queue_size = 0; - settings.disposable_queue_elements = 0; - settings.capacity = 60; + settings.capacity = 30; settings.max_file_block_size = 30; settings.max_query_cache_size = 30; io::CacheContext context; + context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - auto key = io::BlockFileCache::hash("key1"); + int64_t cur_time = UnixSeconds(); + context.expiration_time = cur_time + 120; + int64_t modify_time = cur_time + 5; + auto key1 = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; - ASSERT_TRUE(cache.initialize()); for (int i = 0; i < 100; i++) { if (cache.get_async_open_success()) { @@ -4936,241 +4940,28 @@ }; std::this_thread::sleep_for(std::chrono::milliseconds(1)); } - for (int64_t offset = 0; offset < (60 * config::max_ttl_cache_ratio / 100); offset += 5) { - auto holder = cache.get_or_set(key, offset, 5, context); + { + auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::EMPTY); + assert_range(1, segments[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), + assert_range(1, segments[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); + EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::TTL); + EXPECT_EQ(segments[0]->expiration_time(), context.expiration_time); } - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; - for (int64_t offset = 60; offset < 70; offset += 5) { - auto holder = cache.get_or_set(key, offset, 5, context); + context.cache_type = io::FileCacheType::NORMAL; + context.expiration_time = 0; + { + auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), + assert_range(1, segments[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - } - - EXPECT_EQ(cache._cur_cache_size, 50); - EXPECT_EQ(cache._ttl_queue.cache_size, 50); - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } -} - -TEST_F(BlockFileCacheTest, 
ttl_reserve_with_evict_using_lru_meet_max_ttl_cache_ratio_limit) { - config::file_cache_ttl_valid_check_interval_second = 4; - config::enable_ttl_cache_evict_using_lru = true; - int old = config::max_ttl_cache_ratio; - config::max_ttl_cache_ratio = 50; - - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.index_queue_size = 30; - settings.index_queue_elements = 5; - settings.disposable_queue_size = 0; - settings.disposable_queue_elements = 0; - settings.capacity = 60; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - io::CacheContext context; - context.query_id = query_id; - auto key = io::BlockFileCache::hash("key1"); - io::BlockFileCache cache(cache_base_path, settings); - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; - - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - for (int64_t offset = 0; offset < (60 * config::max_ttl_cache_ratio / 100); offset += 5) { - auto holder = cache.get_or_set(key, offset, 5, context); - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::DOWNLOADED); - } - EXPECT_EQ(cache._cur_cache_size, 30); - EXPECT_EQ(cache._ttl_queue.cache_size, 30); - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; - for (int64_t offset = 60; offset < 70; offset += 5) { - auto holder = cache.get_or_set(key, offset, 5, context); - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::DOWNLOADED); - } - - EXPECT_EQ(cache._cur_cache_size, 30); - EXPECT_EQ(cache._ttl_queue.cache_size, 30); - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - config::max_ttl_cache_ratio = old; -} - -TEST_F(BlockFileCacheTest, reset_capacity) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.index_queue_size = 30; - settings.index_queue_elements = 5; - settings.disposable_queue_size = 30; - settings.disposable_queue_elements = 5; - settings.capacity = 90; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - io::CacheContext context; - context.query_id = query_id; - auto key = io::BlockFileCache::hash("key1"); - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - auto sp = SyncPoint::get_instance(); - Defer defer {[sp] { - 
sp->clear_call_back("BlockFileCache::set_remove_batch"); - sp->clear_call_back("BlockFileCache::set_sleep_time"); - }}; - sp->set_call_back("BlockFileCache::set_sleep_time", - [](auto&& args) { *try_any_cast(args[0]) = 1; }); - sp->set_call_back("BlockFileCache::set_remove_batch", - [](auto&& args) { *try_any_cast(args[0]) = 2; }); - sp->enable_processing(); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - for (int64_t offset = 0; offset < 45; offset += 5) { - context.cache_type = static_cast((offset / 5) % 3); - auto holder = cache.get_or_set(key, offset, 5, context); - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::DOWNLOADED); - } - context.cache_type = io::FileCacheType::TTL; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; - for (int64_t offset = 45; offset < 90; offset += 5) { - auto holder = cache.get_or_set(key2, offset, 5, context); - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::DOWNLOADED); - } - std::cout << cache.reset_capacity(30) << std::endl; - while (cache._async_clear_file_cache) - ; - EXPECT_EQ(cache._cur_cache_size, 30); - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } -} - -TEST_F(BlockFileCacheTest, change_cache_type1) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - auto sp = SyncPoint::get_instance(); - sp->set_call_back("FileBlock::change_cache_type", [](auto&& args) { - *try_any_cast(args[0]) = Status::IOError("inject io error"); - }); - sp->enable_processing(); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.capacity = 30; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - io::CacheContext context; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; - int64_t modify_time = cur_time + 5; - auto key1 = io::BlockFileCache::hash("key1"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(50, 
59), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::TTL); - EXPECT_EQ(segments[0]->expiration_time(), context.expiration_time); - } - context.cache_type = io::FileCacheType::NORMAL; - context.expiration_time = 0; - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(segments[0]->expiration_time(), 0); + EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::NORMAL); + EXPECT_EQ(segments[0]->expiration_time(), 0); } sp->clear_call_back("FileBlock::change_cache_type"); context.cache_type = io::FileCacheType::TTL; @@ -5493,4 +5284,1388 @@ TEST_F(BlockFileCacheTest, file_cache_path_storage_parse) { } } +TEST_F(BlockFileCacheTest, populate_empty_cache_with_disposable) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + size_t limit = 1000000; + size_t cache_max = 10000000; + io::CacheContext context; + context.cache_type = io::FileCacheType::DISPOSABLE; + context.query_id = query_id; + // int64_t cur_time = UnixSeconds(); + // context.expiration_time = cur_time + 120; + auto key1 = io::BlockFileCache::hash("key1"); + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + int64_t offset = 0; + // fill the cache to its limit + for (; offset < limit; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + // grab more exceed the limit to max cache capacity + for (; offset < cache_max; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], cache_max); + 
ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 0); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE]->get_value(), 0); + + // grab more exceed the cache capacity + size_t exceed = 2000000; + for (; offset < (cache_max + exceed); offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(5, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(6, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], cache_max); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 0); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE]->get_value(), + exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, populate_empty_cache_with_normal) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + size_t limit = 3000000; + size_t cache_max = 10000000; + io::CacheContext context; + context.cache_type = io::FileCacheType::NORMAL; + context.query_id = query_id; + // int64_t cur_time = UnixSeconds(); + // context.expiration_time = cur_time + 120; + auto key1 = io::BlockFileCache::hash("key1"); + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + int64_t offset = 0; + // fill the cache to its limit + for (; offset < limit; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + // grab more exceed the limit to max cache capacity + for (; offset < cache_max; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], 
io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], cache_max); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::NORMAL]->get_value(), 0); + + // grab more exceed the cache capacity + size_t exceed = 2000000; + for (; offset < (cache_max + exceed); offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(5, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(6, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], cache_max); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::NORMAL]->get_value(), exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, populate_empty_cache_with_index) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + size_t limit = 1000000; + size_t cache_max = 10000000; + io::CacheContext context; + context.cache_type = io::FileCacheType::INDEX; + context.query_id = query_id; + // int64_t cur_time = UnixSeconds(); + // context.expiration_time = cur_time + 120; + auto key1 = io::BlockFileCache::hash("key1"); + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + int64_t offset = 0; + // fill the cache to its limit + for (; offset < limit; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, 
blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + // grab more exceed the limit to max cache capacity + for (; offset < cache_max; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], cache_max); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 0); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::INDEX]->get_value(), 0); + + // grab more exceed the cache capacity + size_t exceed = 2000000; + for (; offset < (cache_max + exceed); offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(5, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(6, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], cache_max); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 0); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::INDEX]->get_value(), exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, populate_empty_cache_with_ttl) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + size_t limit = 5000000; + size_t cache_max = 10000000; + io::CacheContext context; + context.cache_type = io::FileCacheType::TTL; + context.query_id = query_id; + int64_t cur_time = UnixSeconds(); + context.expiration_time = cur_time + 120; + auto key1 = io::BlockFileCache::hash("key1"); + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + int64_t offset = 0; + // fill the cache to its limit + for (; offset < limit; offset += 100000) { + 
auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + // grab more exceed the limit to max cache capacity + for (; offset < cache_max; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], cache_max); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 0); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::TTL]->get_value(), 0); + + // grab more exceed the cache capacity + size_t exceed = 2000000; + for (; offset < (cache_max + exceed); offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(5, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(6, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], cache_max); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 0); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::TTL]->get_value(), exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, disposable_seize_after_normal) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + size_t limit = 1000000; + 
size_t cache_max = 10000000; + + io::CacheContext context1; + context1.cache_type = io::FileCacheType::NORMAL; + context1.query_id = query_id; + auto key1 = io::BlockFileCache::hash("key1"); + + int64_t offset = 0; + // fill the cache + for (; offset < cache_max; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context1); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], cache_max); + // our hero comes to the stage + io::CacheContext context2; + context2.cache_type = io::FileCacheType::DISPOSABLE; + context2.query_id = query_id; + auto key2 = io::BlockFileCache::hash("key2"); + offset = 0; + for (; offset < limit; offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], limit); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], cache_max - limit); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] + ->get_value(), + limit); + + // grab more exceed the limit + size_t exceed = 2000000; + for (; offset < (limit + exceed); offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(5, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(6, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], limit); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], cache_max - limit); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE]->get_value(), + exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, seize_after_full) { + struct Args { + io::FileCacheType first_type; + io::FileCacheType second_type; + size_t second_limit; + std::string first_metrics; + std::string second_metrics; + }; + + std::vector<Args> args_vec = { + 
{io::FileCacheType::NORMAL, io::FileCacheType::DISPOSABLE, 1000000, + "normal_queue_curr_size", "disposable_queue_curr_size"}, + {io::FileCacheType::NORMAL, io::FileCacheType::INDEX, 1000000, "normal_queue_curr_size", + "index_queue_curr_size"}, + {io::FileCacheType::NORMAL, io::FileCacheType::TTL, 5000000, "normal_queue_curr_size", + "ttl_queue_curr_size"}, + {io::FileCacheType::DISPOSABLE, io::FileCacheType::NORMAL, 3000000, + "disposable_queue_curr_size", "normal_queue_curr_size"}, + {io::FileCacheType::DISPOSABLE, io::FileCacheType::INDEX, 1000000, + "disposable_queue_curr_size", "index_queue_curr_size"}, + {io::FileCacheType::DISPOSABLE, io::FileCacheType::TTL, 5000000, + "disposable_queue_curr_size", "ttl_queue_curr_size"}, + {io::FileCacheType::INDEX, io::FileCacheType::NORMAL, 3000000, "index_queue_curr_size", + "normal_queue_curr_size"}, + {io::FileCacheType::INDEX, io::FileCacheType::DISPOSABLE, 1000000, + "index_queue_curr_size", "disposable_queue_curr_size"}, + {io::FileCacheType::INDEX, io::FileCacheType::TTL, 5000000, "index_queue_curr_size", + "ttl_queue_curr_size"}, + {io::FileCacheType::TTL, io::FileCacheType::NORMAL, 3000000, "ttl_queue_curr_size", + "normal_queue_curr_size"}, + {io::FileCacheType::TTL, io::FileCacheType::DISPOSABLE, 1000000, "ttl_queue_curr_size", + "disposable_queue_curr_size"}, + {io::FileCacheType::TTL, io::FileCacheType::INDEX, 1000000, "ttl_queue_curr_size", + "index_queue_curr_size"}, + }; + + for (auto& args : args_vec) { + std::cout << "filled with " << io::BlockFileCache::cache_type_to_string(args.first_type) + << " and seize with " + << io::BlockFileCache::cache_type_to_string(args.second_type) << std::endl; + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + size_t limit = args.second_limit; + size_t cache_max = 10000000; + + io::CacheContext context1; + context1.cache_type = args.first_type; + context1.query_id = query_id; + if (args.first_type == io::FileCacheType::TTL) { + int64_t cur_time = UnixSeconds(); + context1.expiration_time = cur_time + 120; + } + auto key1 = io::BlockFileCache::hash("key1"); + + int64_t offset = 0; + // fill the cache + for (; offset < cache_max; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context1); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + 
blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()[args.first_metrics], cache_max); + // our hero comes to the stage + io::CacheContext context2; + context2.cache_type = args.second_type; + context2.query_id = query_id; + if (context2.cache_type == io::FileCacheType::TTL) { + int64_t cur_time = UnixSeconds(); + context2.expiration_time = cur_time + 120; + } + auto key2 = io::BlockFileCache::hash("key2"); + offset = 0; + for (; offset < limit; offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()[args.second_metrics], limit); + ASSERT_EQ(cache.get_stats_unsafe()[args.first_metrics], cache_max - limit); + ASSERT_EQ( + cache._evict_by_size_metrics_matrix[args.first_type][args.second_type]->get_value(), + limit); + + // grab more exceed the limit + size_t exceed = 2000000; + for (; offset < (limit + exceed); offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(5, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(6, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()[args.second_metrics], limit); + ASSERT_EQ(cache.get_stats_unsafe()[args.first_metrics], cache_max - limit); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[args.second_type]->get_value(), exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + } +} + +TEST_F(BlockFileCacheTest, evict_privilege_order_for_disposable) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + io::CacheContext context1; + context1.cache_type = io::FileCacheType::NORMAL; + context1.query_id = query_id; + auto key1 = io::BlockFileCache::hash("key1"); + + int64_t offset = 0; + + for (; offset < 3500000; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context1); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, 
blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context2; + context2.cache_type = io::FileCacheType::INDEX; + context2.query_id = query_id; + auto key2 = io::BlockFileCache::hash("key2"); + + offset = 0; + + for (; offset < 1300000; offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context3; + context3.cache_type = io::FileCacheType::TTL; + context3.query_id = query_id; + context3.expiration_time = UnixSeconds() + 120; + auto key3 = io::BlockFileCache::hash("key3"); + + offset = 0; + + for (; offset < 5200000; offset += 100000) { + auto holder = cache.get_or_set(key3, offset, 100000, context3); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5200000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1300000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3500000); + + // our hero comes to the stage + io::CacheContext context4; + context4.cache_type = io::FileCacheType::DISPOSABLE; + context4.query_id = query_id; + auto key4 = io::BlockFileCache::hash("key4"); + + offset = 0; + + for (; offset < 1000000; offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] + ->get_value(), + 200000); + + size_t exceed = 200000; + for (; offset 
< (1000000 + exceed); offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] + ->get_value(), + 200000); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE]->get_value(), + exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, evict_privilege_order_for_normal) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + io::CacheContext context1; + context1.cache_type = io::FileCacheType::DISPOSABLE; + context1.query_id = query_id; + auto key1 = io::BlockFileCache::hash("key1"); + + int64_t offset = 0; + + for (; offset < 1500000; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context1); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context2; + context2.cache_type = io::FileCacheType::INDEX; + context2.query_id = query_id; + auto key2 = io::BlockFileCache::hash("key2"); + + offset = 0; + + for (; offset < 1300000; offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), 
+ io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context3; + context3.cache_type = io::FileCacheType::TTL; + context3.query_id = query_id; + context3.expiration_time = UnixSeconds() + 120; + auto key3 = io::BlockFileCache::hash("key3"); + + offset = 0; + + for (; offset < 7200000; offset += 100000) { + auto holder = cache.get_or_set(key3, offset, 100000, context3); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1500000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 7200000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1300000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 0); + + // our hero comes to the stage + io::CacheContext context4; + context4.cache_type = io::FileCacheType::NORMAL; + context4.query_id = query_id; + auto key4 = io::BlockFileCache::hash("key4"); + + offset = 0; + + for (; offset < 3000000; offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] + ->get_value(), + 2200000); + + size_t exceed = 200000; + for (; offset < (3000000 + exceed); offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + 
ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] + ->get_value(), + 2200000); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::NORMAL]->get_value(), exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, evict_privilege_order_for_index) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + io::CacheContext context1; + context1.cache_type = io::FileCacheType::DISPOSABLE; + context1.query_id = query_id; + auto key1 = io::BlockFileCache::hash("key1"); + + int64_t offset = 0; + + for (; offset < 1500000; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context1); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context2; + context2.cache_type = io::FileCacheType::NORMAL; + context2.query_id = query_id; + auto key2 = io::BlockFileCache::hash("key2"); + + offset = 0; + + for (; offset < 3300000; offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context3; + context3.cache_type = io::FileCacheType::TTL; + context3.query_id = query_id; + context3.expiration_time = UnixSeconds() + 120; + auto key3 = io::BlockFileCache::hash("key3"); + + offset = 0; + + for (; offset < 5200000; offset += 100000) { + auto holder = cache.get_or_set(key3, offset, 100000, context3); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + 
io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1500000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5200000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3300000); + + // our hero comes to the stage + io::CacheContext context4; + context4.cache_type = io::FileCacheType::INDEX; + context4.query_id = query_id; + auto key4 = io::BlockFileCache::hash("key4"); + + offset = 0; + + for (; offset < 1000000; offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] + ->get_value(), + 200000); + + size_t exceed = 200000; + for (; offset < (1000000 + exceed); offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] + ->get_value(), + 200000); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::INDEX]->get_value(), exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +TEST_F(BlockFileCacheTest, evict_privilege_order_for_ttl) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + 
fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 3000000; + settings.query_queue_elements = 30000; + settings.index_queue_size = 1000000; + settings.index_queue_elements = 10000; + settings.disposable_queue_size = 1000000; + settings.disposable_queue_elements = 10000; + settings.capacity = 10000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + int i = 0; + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + io::CacheContext context1; + context1.cache_type = io::FileCacheType::DISPOSABLE; + context1.query_id = query_id; + auto key1 = io::BlockFileCache::hash("key1"); + + int64_t offset = 0; + + for (; offset < 1500000; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context1); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context2; + context2.cache_type = io::FileCacheType::INDEX; + context2.query_id = query_id; + auto key2 = io::BlockFileCache::hash("key2"); + + offset = 0; + + for (; offset < 1300000; offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context3; + context3.cache_type = io::FileCacheType::NORMAL; + context3.query_id = query_id; + auto key3 = io::BlockFileCache::hash("key3"); + + offset = 0; + + for (; offset < 7200000; offset += 100000) { + auto holder = cache.get_or_set(key3, offset, 100000, context3); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1500000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 0); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1300000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 7200000); + + // our hero comes to the stage + io::CacheContext context4; + context4.cache_type = io::FileCacheType::TTL; + context4.query_id = query_id; + context4.expiration_time = UnixSeconds() + 120; + auto key4 = io::BlockFileCache::hash("key4"); + + 
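+    // Writing 5,000,000 bytes of TTL data into the nearly full cache should shrink the
+    // other queues down to their configured limits: 500,000 bytes evicted from
+    // DISPOSABLE (1,500,000 -> 1,000,000), 300,000 from INDEX (1,300,000 -> 1,000,000)
+    // and 4,200,000 from NORMAL (7,200,000 -> 3,000,000), as asserted below.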
offset = 0; + + for (; offset < 5000000; offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] + ->get_value(), + 4200000); + + size_t exceed = 200000; + for (; offset < (5000000 + exceed); offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(3, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(4, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 5000000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 1000000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 3000000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] + ->get_value(), + 500000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] + ->get_value(), + 300000); + ASSERT_EQ(cache._evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] + ->get_value(), + 4200000); + ASSERT_EQ(cache._evict_by_self_lru_metrics_matrix[FileCacheType::TTL]->get_value(), exceed); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + } // namespace doris::io diff --git a/be/test/io/fs/stream_sink_file_writer_test.cpp b/be/test/io/fs/stream_sink_file_writer_test.cpp index 69f286b205bd6fe..ef0a75288eb6032 100644 --- a/be/test/io/fs/stream_sink_file_writer_test.cpp +++ b/be/test/io/fs/stream_sink_file_writer_test.cpp @@ -59,7 +59,7 @@ class StreamSinkFileWriterTest : public testing::Test { // APPEND_DATA virtual Status append_data(int64_t partition_id, int64_t index_id, int64_t tablet_id, - int64_t segment_id, uint64_t offset, std::span data, + int32_t segment_id, uint64_t offset, std::span data, bool segment_eos = false, FileType file_type = FileType::SEGMENT_FILE) override { EXPECT_EQ(PARTITION_ID, partition_id); diff --git a/be/test/olap/segcompaction_test.cpp b/be/test/olap/segcompaction_test.cpp index ba0d23acb02cef0..32d724d246b3b5b 100644 --- a/be/test/olap/segcompaction_test.cpp +++ b/be/test/olap/segcompaction_test.cpp @@ -34,6 +34,7 @@ #include 
"olap/rowset/rowset_reader_context.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" +#include "olap/rowset/segment_v2/segment_writer.h" #include "olap/storage_engine.h" #include "olap/tablet_meta.h" #include "olap/tablet_schema.h" @@ -178,6 +179,24 @@ class SegCompactionTest : public testing::Test { tablet_schema->init_from_pb(tablet_schema_pb); } + void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, + const std::string& index_name, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, + bool parser = false) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_key(false); + column_pb->set_is_nullable(true); + tablet_index->set_index_id(index_id); + tablet_index->set_index_name(index_name); + tablet_index->set_index_type(IndexType::INVERTED); + tablet_index->add_col_unique_id(col_unique_id); + if (parser) { + auto* properties = tablet_index->mutable_properties(); + (*properties)[INVERTED_INDEX_PARSER_KEY] = INVERTED_INDEX_PARSER_UNICODE; + } + } // use different id to avoid conflict void create_rowset_writer_context(int64_t id, TabletSchemaSPtr tablet_schema, RowsetWriterContext* rowset_writer_context) { @@ -830,6 +849,51 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { } } +TEST_F(SegCompactionTest, CreateSegCompactionWriter) { + config::enable_segcompaction = true; + Status s; + TabletSchemaSPtr tablet_schema = std::make_shared(); + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); + + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT", + "key"); + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1, "STRING", + "v1"); + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10002, "v2_index", 2, "STRING", + "v2", true); + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3, "INT", + "v3"); + + tablet_schema.reset(new TabletSchema); + tablet_schema->init_from_pb(schema_pb); + RowsetSharedPtr rowset; + config::segcompaction_candidate_max_rows = 6000; // set threshold above + // rows_per_segment + config::segcompaction_batch_size = 3; + std::vector segment_num_rows; + { + RowsetWriterContext writer_context; + create_rowset_writer_context(10052, tablet_schema, &writer_context); + + auto res = RowsetFactory::create_rowset_writer(*l_engine, writer_context, false); + EXPECT_TRUE(res.has_value()) << res.error(); + auto rowset_writer = std::move(res).value(); + EXPECT_EQ(Status::OK(), s); + auto beta_rowset_writer = dynamic_cast(rowset_writer.get()); + EXPECT_TRUE(beta_rowset_writer != nullptr); + std::unique_ptr writer = nullptr; + auto status = beta_rowset_writer->create_segment_writer_for_segcompaction(&writer, 0, 1); + EXPECT_TRUE(beta_rowset_writer != nullptr); + EXPECT_TRUE(status == Status::OK()); + int64_t inverted_index_file_size = 0; + status = writer->close_inverted_index(&inverted_index_file_size); + EXPECT_TRUE(status == Status::OK()); + std::cout << inverted_index_file_size << std::endl; + } +} + TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { config::enable_segcompaction = true; Status s; diff --git a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp deleted file 
mode 100644 index fc692b8f67569e4..000000000000000 --- a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp +++ /dev/null @@ -1,654 +0,0 @@ - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "gtest/gtest_pred_impl.h" -#include "olap/hll.h" -#include "runtime/descriptors.cpp" -#include "runtime/descriptors.h" -#include "util/arrow/block_convertor.h" -#include "util/arrow/row_batch.h" -#include "util/bitmap_value.h" -#include "util/quantile_state.h" -#include "util/string_parser.hpp" -#include "vec/columns/column.h" -#include "vec/columns/column_array.h" -#include "vec/columns/column_complex.h" -#include "vec/columns/column_decimal.h" -#include "vec/columns/column_map.h" -#include "vec/columns/column_nullable.h" -#include "vec/columns/column_string.h" -#include "vec/columns/column_vector.h" -#include "vec/core/block.h" -#include "vec/core/field.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type.h" -#include "vec/data_types/data_type_array.h" -#include "vec/data_types/data_type_bitmap.h" -#include "vec/data_types/data_type_date.h" -#include "vec/data_types/data_type_date_time.h" -#include "vec/data_types/data_type_decimal.h" -#include "vec/data_types/data_type_hll.h" -#include "vec/data_types/data_type_ipv4.h" -#include "vec/data_types/data_type_ipv6.h" -#include "vec/data_types/data_type_map.h" -#include "vec/data_types/data_type_nullable.h" -#include "vec/data_types/data_type_number.h" -#include "vec/data_types/data_type_quantilestate.h" -#include "vec/data_types/data_type_string.h" -#include "vec/data_types/data_type_struct.h" -#include "vec/data_types/data_type_time_v2.h" -#include "vec/io/io_helper.h" -#include "vec/runtime/vdatetime_value.h" -#include "vec/utils/arrow_column_to_doris_column.h" - -namespace doris::vectorized { - -template -void serialize_and_deserialize_arrow_test() { - vectorized::Block block; - std::vector> cols; - if constexpr (is_scalar) { - cols = { - {"k1", FieldType::OLAP_FIELD_TYPE_INT, 1, TYPE_INT, false}, - {"k7", FieldType::OLAP_FIELD_TYPE_INT, 7, TYPE_INT, true}, - {"k2", FieldType::OLAP_FIELD_TYPE_STRING, 2, TYPE_STRING, false}, - {"k3", FieldType::OLAP_FIELD_TYPE_DECIMAL128I, 3, TYPE_DECIMAL128I, false}, - {"k11", FieldType::OLAP_FIELD_TYPE_DATETIME, 11, TYPE_DATETIME, false}, - {"k4", FieldType::OLAP_FIELD_TYPE_BOOL, 4, TYPE_BOOLEAN, false}, - {"k5", FieldType::OLAP_FIELD_TYPE_DECIMAL32, 5, TYPE_DECIMAL32, false}, - {"k6", FieldType::OLAP_FIELD_TYPE_DECIMAL64, 6, TYPE_DECIMAL64, false}, - {"k12", 
FieldType::OLAP_FIELD_TYPE_DATETIMEV2, 12, TYPE_DATETIMEV2, false}, - {"k8", FieldType::OLAP_FIELD_TYPE_IPV4, 8, TYPE_IPV4, false}, - {"k9", FieldType::OLAP_FIELD_TYPE_IPV6, 9, TYPE_IPV6, false}, - }; - } else { - cols = {{"a", FieldType::OLAP_FIELD_TYPE_ARRAY, 6, TYPE_ARRAY, true}, - {"m", FieldType::OLAP_FIELD_TYPE_MAP, 8, TYPE_MAP, true}, - {"s", FieldType::OLAP_FIELD_TYPE_STRUCT, 5, TYPE_STRUCT, true}}; - } - - int row_num = 7; - // make desc and generate block - TupleDescriptor tuple_desc(PTupleDescriptor(), true); - for (auto t : cols) { - TSlotDescriptor tslot; - std::string col_name = std::get<0>(t); - tslot.__set_colName(col_name); - TypeDescriptor type_desc(std::get<3>(t)); - bool is_nullable(std::get<4>(t)); - switch (std::get<3>(t)) { - case TYPE_BOOLEAN: - tslot.__set_slotType(type_desc.to_thrift()); - { - auto vec = vectorized::ColumnVector::create(); - auto& data = vec->get_data(); - for (int i = 0; i < row_num; ++i) { - data.push_back(i % 2); - } - vectorized::DataTypePtr data_type(std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name(vec->get_ptr(), data_type, - col_name); - block.insert(std::move(type_and_name)); - } - break; - case TYPE_INT: - tslot.__set_slotType(type_desc.to_thrift()); - if (is_nullable) { - { - auto column_vector_int32 = vectorized::ColumnVector::create(); - auto column_nullable_vector = - vectorized::make_nullable(std::move(column_vector_int32)); - auto mutable_nullable_vector = std::move(*column_nullable_vector).mutate(); - for (int i = 0; i < row_num; i++) { - if (i % 2 == 0) { - mutable_nullable_vector->insert_default(); - } else { - mutable_nullable_vector->insert(int32(i)); - } - } - auto data_type = vectorized::make_nullable( - std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name( - mutable_nullable_vector->get_ptr(), data_type, col_name); - block.insert(type_and_name); - } - } else { - auto vec = vectorized::ColumnVector::create(); - auto& data = vec->get_data(); - for (int i = 0; i < row_num; ++i) { - data.push_back(i); - } - vectorized::DataTypePtr data_type(std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name(vec->get_ptr(), data_type, - col_name); - block.insert(std::move(type_and_name)); - } - break; - case TYPE_DECIMAL32: - type_desc.precision = 9; - type_desc.scale = 2; - tslot.__set_slotType(type_desc.to_thrift()); - { - vectorized::DataTypePtr decimal_data_type = - std::make_shared>(type_desc.precision, - type_desc.scale); - auto decimal_column = decimal_data_type->create_column(); - auto& data = ((vectorized::ColumnDecimal>*) - decimal_column.get()) - ->get_data(); - for (int i = 0; i < row_num; ++i) { - if (i == 0) { - data.push_back(Int32(0)); - continue; - } - Int32 val; - StringParser::ParseResult result = StringParser::PARSE_SUCCESS; - i % 2 == 0 ? 
val = StringParser::string_to_decimal( - "1234567.56", 11, type_desc.precision, type_desc.scale, - &result) - : val = StringParser::string_to_decimal( - "-1234567.56", 12, type_desc.precision, type_desc.scale, - &result); - EXPECT_TRUE(result == StringParser::PARSE_SUCCESS); - data.push_back(val); - } - - vectorized::ColumnWithTypeAndName type_and_name(decimal_column->get_ptr(), - decimal_data_type, col_name); - block.insert(type_and_name); - } - break; - case TYPE_DECIMAL64: - type_desc.precision = 18; - type_desc.scale = 6; - tslot.__set_slotType(type_desc.to_thrift()); - { - vectorized::DataTypePtr decimal_data_type = - std::make_shared>(type_desc.precision, - type_desc.scale); - auto decimal_column = decimal_data_type->create_column(); - auto& data = ((vectorized::ColumnDecimal>*) - decimal_column.get()) - ->get_data(); - for (int i = 0; i < row_num; ++i) { - if (i == 0) { - data.push_back(Int64(0)); - continue; - } - Int64 val; - StringParser::ParseResult result = StringParser::PARSE_SUCCESS; - std::string decimal_string = - i % 2 == 0 ? "-123456789012.123456" : "123456789012.123456"; - val = StringParser::string_to_decimal( - decimal_string.c_str(), decimal_string.size(), type_desc.precision, - type_desc.scale, &result); - EXPECT_TRUE(result == StringParser::PARSE_SUCCESS); - data.push_back(val); - } - vectorized::ColumnWithTypeAndName type_and_name(decimal_column->get_ptr(), - decimal_data_type, col_name); - block.insert(type_and_name); - } - break; - case TYPE_DECIMAL128I: - type_desc.precision = 27; - type_desc.scale = 9; - tslot.__set_slotType(type_desc.to_thrift()); - { - vectorized::DataTypePtr decimal_data_type( - doris::vectorized::create_decimal(27, 9, true)); - auto decimal_column = decimal_data_type->create_column(); - auto& data = ((vectorized::ColumnDecimal>*) - decimal_column.get()) - ->get_data(); - for (int i = 0; i < row_num; ++i) { - __int128_t value = __int128_t(i * pow(10, 9) + i * pow(10, 8)); - data.push_back(value); - } - vectorized::ColumnWithTypeAndName type_and_name(decimal_column->get_ptr(), - decimal_data_type, col_name); - block.insert(type_and_name); - } - break; - case TYPE_STRING: - tslot.__set_slotType(type_desc.to_thrift()); - { - auto strcol = vectorized::ColumnString::create(); - for (int i = 0; i < row_num; ++i) { - std::string is = std::to_string(i); - strcol->insert_data(is.c_str(), is.size()); - } - vectorized::DataTypePtr data_type(std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name(strcol->get_ptr(), data_type, - col_name); - block.insert(type_and_name); - } - break; - case TYPE_HLL: - tslot.__set_slotType(type_desc.to_thrift()); - { - vectorized::DataTypePtr hll_data_type(std::make_shared()); - auto hll_column = hll_data_type->create_column(); - std::vector& container = - ((vectorized::ColumnHLL*)hll_column.get())->get_data(); - for (int i = 0; i < row_num; ++i) { - HyperLogLog hll; - hll.update(i); - container.push_back(hll); - } - vectorized::ColumnWithTypeAndName type_and_name(hll_column->get_ptr(), - hll_data_type, col_name); - - block.insert(type_and_name); - } - break; - case TYPE_DATEV2: - tslot.__set_slotType(type_desc.to_thrift()); - { - auto column_vector_date_v2 = vectorized::ColumnVector::create(); - auto& date_v2_data = column_vector_date_v2->get_data(); - for (int i = 0; i < row_num; ++i) { - DateV2Value value; - value.from_date((uint32_t)((2022 << 9) | (6 << 5) | 6)); - date_v2_data.push_back(*reinterpret_cast(&value)); - } - vectorized::DataTypePtr date_v2_type( - std::make_shared()); - 
vectorized::ColumnWithTypeAndName test_date_v2(column_vector_date_v2->get_ptr(), - date_v2_type, col_name); - block.insert(test_date_v2); - } - break; - case TYPE_DATE: // int64 - tslot.__set_slotType(type_desc.to_thrift()); - { - auto column_vector_date = vectorized::ColumnVector::create(); - auto& date_data = column_vector_date->get_data(); - for (int i = 0; i < row_num; ++i) { - VecDateTimeValue value; - value.from_date_int64(20210501); - date_data.push_back(*reinterpret_cast(&value)); - } - vectorized::DataTypePtr date_type(std::make_shared()); - vectorized::ColumnWithTypeAndName test_date(column_vector_date->get_ptr(), - date_type, col_name); - block.insert(test_date); - } - break; - case TYPE_DATETIME: // int64 - tslot.__set_slotType(type_desc.to_thrift()); - { - auto column_vector_datetime = vectorized::ColumnVector::create(); - auto& datetime_data = column_vector_datetime->get_data(); - for (int i = 0; i < row_num; ++i) { - VecDateTimeValue value; - value.from_date_int64(20210501080910); - datetime_data.push_back(*reinterpret_cast(&value)); - } - vectorized::DataTypePtr datetime_type( - std::make_shared()); - vectorized::ColumnWithTypeAndName test_datetime(column_vector_datetime->get_ptr(), - datetime_type, col_name); - block.insert(test_datetime); - } - break; - case TYPE_DATETIMEV2: // uint64 - tslot.__set_slotType(type_desc.to_thrift()); - { - // 2022-01-01 11:11:11.111 - auto column_vector_datetimev2 = - vectorized::ColumnVector::create(); - // auto& datetimev2_data = column_vector_datetimev2->get_data(); - DateV2Value value; - string date_literal = "2022-01-01 11:11:11.111"; - value.from_date_str(date_literal.c_str(), date_literal.size()); - char to[64] = {}; - std::cout << "value: " << value.to_string(to) << std::endl; - for (int i = 0; i < row_num; ++i) { - column_vector_datetimev2->insert(value.to_date_int_val()); - } - vectorized::DataTypePtr datetimev2_type( - std::make_shared()); - vectorized::ColumnWithTypeAndName test_datetimev2( - column_vector_datetimev2->get_ptr(), datetimev2_type, col_name); - block.insert(test_datetimev2); - } - break; - case TYPE_ARRAY: // array - type_desc.add_sub_type(TYPE_STRING, true); - tslot.__set_slotType(type_desc.to_thrift()); - { - DataTypePtr s = - std::make_shared(std::make_shared()); - DataTypePtr au = std::make_shared(s); - Array a1, a2; - a1.push_back(String("sss")); - a1.push_back(Null()); - a1.push_back(String("clever amory")); - a2.push_back(String("hello amory")); - a2.push_back(Null()); - a2.push_back(String("cute amory")); - a2.push_back(String("sf")); - MutableColumnPtr array_column = au->create_column(); - array_column->reserve(2); - array_column->insert(a1); - array_column->insert(a2); - vectorized::ColumnWithTypeAndName type_and_name(array_column->get_ptr(), au, - col_name); - block.insert(type_and_name); - } - break; - case TYPE_MAP: - type_desc.add_sub_type(TYPE_STRING, true); - type_desc.add_sub_type(TYPE_STRING, true); - tslot.__set_slotType(type_desc.to_thrift()); - { - DataTypePtr s = - std::make_shared(std::make_shared()); - ; - DataTypePtr d = - std::make_shared(std::make_shared()); - DataTypePtr m = std::make_shared(s, d); - Array k1, k2, v1, v2; - k1.push_back("null"); - k1.push_back("doris"); - k1.push_back("clever amory"); - v1.push_back("ss"); - v1.push_back(Null()); - v1.push_back("NULL"); - k2.push_back("hello amory"); - k2.push_back("NULL"); - k2.push_back("cute amory"); - k2.push_back("doris"); - v2.push_back("s"); - v2.push_back("0"); - v2.push_back("sf"); - v2.push_back(Null()); - Map m1, m2; - 
m1.push_back(k1); - m1.push_back(v1); - m2.push_back(k2); - m2.push_back(v2); - MutableColumnPtr map_column = m->create_column(); - map_column->reserve(2); - map_column->insert(m1); - map_column->insert(m2); - vectorized::ColumnWithTypeAndName type_and_name(map_column->get_ptr(), m, col_name); - block.insert(type_and_name); - } - break; - case TYPE_STRUCT: - type_desc.add_sub_type(TYPE_STRING, "name", true); - type_desc.add_sub_type(TYPE_LARGEINT, "age", true); - type_desc.add_sub_type(TYPE_BOOLEAN, "is", true); - tslot.__set_slotType(type_desc.to_thrift()); - { - DataTypePtr s = - std::make_shared(std::make_shared()); - DataTypePtr d = - std::make_shared(std::make_shared()); - DataTypePtr m = - std::make_shared(std::make_shared()); - DataTypePtr st = - std::make_shared(std::vector {s, d, m}); - Tuple t1, t2; - t1.push_back(String("amory cute")); - t1.push_back(__int128_t(37)); - t1.push_back(true); - t2.push_back("null"); - t2.push_back(__int128_t(26)); - t2.push_back(false); - MutableColumnPtr struct_column = st->create_column(); - struct_column->reserve(2); - struct_column->insert(t1); - struct_column->insert(t2); - vectorized::ColumnWithTypeAndName type_and_name(struct_column->get_ptr(), st, - col_name); - block.insert(type_and_name); - } - break; - case TYPE_IPV4: - tslot.__set_slotType(type_desc.to_thrift()); - { - auto vec = vectorized::ColumnIPv4::create(); - auto& data = vec->get_data(); - for (int i = 0; i < row_num; ++i) { - data.push_back(i); - } - vectorized::DataTypePtr data_type(std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name(vec->get_ptr(), data_type, - col_name); - block.insert(std::move(type_and_name)); - } - break; - case TYPE_IPV6: - tslot.__set_slotType(type_desc.to_thrift()); - { - auto vec = vectorized::ColumnIPv6::create(); - auto& data = vec->get_data(); - for (int i = 0; i < row_num; ++i) { - data.push_back(i); - } - vectorized::DataTypePtr data_type(std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name(vec->get_ptr(), data_type, - col_name); - block.insert(std::move(type_and_name)); - } - break; - default: - break; - } - - tslot.__set_col_unique_id(std::get<2>(t)); - SlotDescriptor* slot = new SlotDescriptor(tslot); - tuple_desc.add_slot(slot); - } - - RowDescriptor row_desc(&tuple_desc, true); - // arrow schema - std::shared_ptr _arrow_schema; - EXPECT_EQ(convert_to_arrow_schema(row_desc, &_arrow_schema, "UTC"), Status::OK()); - - // serialize - std::shared_ptr result; - std::cout << "block data: " << block.dump_data(0, row_num) << std::endl; - std::cout << "_arrow_schema: " << _arrow_schema->ToString(true) << std::endl; - - cctz::time_zone timezone_obj; - TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, timezone_obj); - static_cast(convert_to_arrow_batch(block, _arrow_schema, arrow::default_memory_pool(), - &result, timezone_obj)); - Block new_block = block.clone_empty(); - EXPECT_TRUE(result != nullptr); - std::cout << "result: " << result->ToString() << std::endl; - // deserialize - for (auto t : cols) { - std::string real_column_name = std::get<0>(t); - auto* array = result->GetColumnByName(real_column_name).get(); - auto& column_with_type_and_name = new_block.get_by_name(real_column_name); - if (std::get<3>(t) == PrimitiveType::TYPE_DATE || - std::get<3>(t) == PrimitiveType::TYPE_DATETIME) { - { - auto strcol = vectorized::ColumnString::create(); - vectorized::DataTypePtr data_type(std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name(strcol->get_ptr(), data_type, - 
real_column_name); - static_cast(arrow_column_to_doris_column( - array, 0, type_and_name.column, type_and_name.type, block.rows(), "UTC")); - { - auto& col = column_with_type_and_name.column.get()->assume_mutable_ref(); - auto& date_data = static_cast&>(col).get_data(); - for (int i = 0; i < strcol->size(); ++i) { - StringRef str = strcol->get_data_at(i); - VecDateTimeValue value; - value.from_date_str(str.data, str.size); - date_data.push_back(*reinterpret_cast(&value)); - } - } - } - continue; - } else if (std::get<3>(t) == PrimitiveType::TYPE_DATEV2) { - auto strcol = vectorized::ColumnString::create(); - vectorized::DataTypePtr data_type(std::make_shared()); - vectorized::ColumnWithTypeAndName type_and_name(strcol->get_ptr(), data_type, - real_column_name); - static_cast(arrow_column_to_doris_column( - array, 0, type_and_name.column, type_and_name.type, block.rows(), "UTC")); - { - auto& col = column_with_type_and_name.column.get()->assume_mutable_ref(); - auto& date_data = static_cast&>(col).get_data(); - for (int i = 0; i < strcol->size(); ++i) { - StringRef str = strcol->get_data_at(i); - DateV2Value value; - value.from_date_str(str.data, str.size); - date_data.push_back(*reinterpret_cast(&value)); - } - } - continue; - } else if (std::get<3>(t) == PrimitiveType::TYPE_DATETIMEV2) { - // now we only support read doris datetimev2 to arrow - block.erase(real_column_name); - new_block.erase(real_column_name); - continue; - } - static_cast(arrow_column_to_doris_column(array, 0, column_with_type_and_name.column, - column_with_type_and_name.type, block.rows(), - "UTC")); - } - - std::cout << block.dump_data() << std::endl; - std::cout << new_block.dump_data() << std::endl; - EXPECT_EQ(block.dump_data(), new_block.dump_data()); -} - -TEST(DataTypeSerDeArrowTest, DataTypeScalaSerDeTest) { - serialize_and_deserialize_arrow_test(); -} - -TEST(DataTypeSerDeArrowTest, DataTypeCollectionSerDeTest) { - serialize_and_deserialize_arrow_test(); -} - -TEST(DataTypeSerDeArrowTest, DataTypeMapNullKeySerDeTest) { - TupleDescriptor tuple_desc(PTupleDescriptor(), true); - TSlotDescriptor tslot; - std::string col_name = "map_null_key"; - tslot.__set_colName(col_name); - TypeDescriptor type_desc(TYPE_MAP); - type_desc.add_sub_type(TYPE_STRING, true); - type_desc.add_sub_type(TYPE_INT, true); - tslot.__set_slotType(type_desc.to_thrift()); - vectorized::Block block; - { - DataTypePtr s = std::make_shared(std::make_shared()); - ; - DataTypePtr d = std::make_shared(std::make_shared()); - DataTypePtr m = std::make_shared(s, d); - Array k1, k2, v1, v2, k3, v3; - k1.push_back("doris"); - k1.push_back("clever amory"); - v1.push_back(Null()); - v1.push_back(30); - k2.push_back("hello amory"); - k2.push_back("NULL"); - k2.push_back("cute amory"); - k2.push_back("doris"); - v2.push_back(26); - v2.push_back(Null()); - v2.push_back(6); - v2.push_back(7); - k3.push_back("test"); - v3.push_back(11); - Map m1, m2, m3; - m1.push_back(k1); - m1.push_back(v1); - m2.push_back(k2); - m2.push_back(v2); - m3.push_back(k3); - m3.push_back(v3); - MutableColumnPtr map_column = m->create_column(); - map_column->reserve(3); - map_column->insert(m1); - map_column->insert(m2); - map_column->insert(m3); - vectorized::ColumnWithTypeAndName type_and_name(map_column->get_ptr(), m, col_name); - block.insert(type_and_name); - } - - tslot.__set_col_unique_id(1); - SlotDescriptor* slot = new SlotDescriptor(tslot); - tuple_desc.add_slot(slot); - RowDescriptor row_desc(&tuple_desc, true); - // arrow schema - std::shared_ptr _arrow_schema; - 
EXPECT_EQ(convert_to_arrow_schema(row_desc, &_arrow_schema, "UTC"), Status::OK()); - - // serialize - std::shared_ptr result; - std::cout << "block structure: " << block.dump_structure() << std::endl; - std::cout << "_arrow_schema: " << _arrow_schema->ToString(true) << std::endl; - - cctz::time_zone timezone_obj; - TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, timezone_obj); - static_cast(convert_to_arrow_batch(block, _arrow_schema, arrow::default_memory_pool(), - &result, timezone_obj)); - Block new_block = block.clone_empty(); - EXPECT_TRUE(result != nullptr); - std::cout << "result: " << result->ToString() << std::endl; - // deserialize - auto* array = result->GetColumnByName(col_name).get(); - auto& column_with_type_and_name = new_block.get_by_name(col_name); - static_cast(arrow_column_to_doris_column(array, 0, column_with_type_and_name.column, - column_with_type_and_name.type, block.rows(), - "UTC")); - std::cout << block.dump_data() << std::endl; - std::cout << new_block.dump_data() << std::endl; - // new block row_index 0, 2 which row has key null will be filter - EXPECT_EQ(new_block.dump_one_line(0, 1), "{\"doris\":null, \"clever amory\":30}"); - EXPECT_EQ(new_block.dump_one_line(2, 1), "{\"test\":11}"); - EXPECT_EQ(block.dump_data(1, 1), new_block.dump_data(1, 1)); -} - -} // namespace doris::vectorized diff --git a/cloud/src/common/bvars.cpp b/cloud/src/common/bvars.cpp index f053c1877fb525b..f9b11aa85b4897d 100644 --- a/cloud/src/common/bvars.cpp +++ b/cloud/src/common/bvars.cpp @@ -74,6 +74,8 @@ BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap("ms", "get_delete_bitmap" BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap_update_lock("ms", "get_delete_bitmap_update_lock"); BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap("ms", "remove_delete_bitmap"); +BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap_update_lock( + "ms", "remove_delete_bitmap_update_lock"); BvarLatencyRecorderWithTag g_bvar_ms_get_instance("ms", "get_instance"); BvarLatencyRecorderWithTag g_bvar_ms_get_rl_task_commit_attach("ms", "get_rl_task_commit_attach"); BvarLatencyRecorderWithTag g_bvar_ms_reset_rl_progress("ms", "reset_rl_progress"); diff --git a/cloud/src/common/bvars.h b/cloud/src/common/bvars.h index 2a9efe35302af45..4848ec4b456cefa 100644 --- a/cloud/src/common/bvars.h +++ b/cloud/src/common/bvars.h @@ -173,6 +173,7 @@ extern BvarLatencyRecorderWithTag g_bvar_ms_update_delete_bitmap; extern BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap; extern BvarLatencyRecorderWithTag g_bvar_ms_get_delete_bitmap_update_lock; extern BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap; +extern BvarLatencyRecorderWithTag g_bvar_ms_remove_delete_bitmap_update_lock; extern BvarLatencyRecorderWithTag g_bvar_ms_get_cluster_status; extern BvarLatencyRecorderWithTag g_bvar_ms_set_cluster_status; extern BvarLatencyRecorderWithTag g_bvar_ms_get_instance; diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h index 7caba826520fb3a..da6ae113cd73494 100644 --- a/cloud/src/common/config.h +++ b/cloud/src/common/config.h @@ -135,8 +135,10 @@ CONF_mBool(snapshot_get_tablet_stats, "true"); // Value codec version CONF_mInt16(meta_schema_value_version, "1"); -// Limit kv size of Schema SchemaDictKeyList, default 10MB -CONF_mInt32(schema_dict_kv_size_limit, "10485760"); +// Limit kv size of Schema SchemaDictKeyList, default 5MB +CONF_mInt32(schema_dict_kv_size_limit, "5242880"); +// Limit the count of columns in schema dict value, default 4K 
+CONF_mInt32(schema_dict_key_count_limit, "4096"); // For instance check interval CONF_Int64(reserved_buffer_days, "3"); @@ -217,5 +219,4 @@ CONF_Int32(max_tablet_index_num_per_batch, "1000"); // Max aborted txn num for the same label name CONF_mInt64(max_num_aborted_txn, "100"); - } // namespace doris::cloud::config diff --git a/cloud/src/meta-service/meta_service.cpp b/cloud/src/meta-service/meta_service.cpp index 10969a0cdb68afb..dfa06ca5fa08dcd 100644 --- a/cloud/src/meta-service/meta_service.cpp +++ b/cloud/src/meta-service/meta_service.cpp @@ -1587,8 +1587,8 @@ void MetaServiceImpl::get_rowset(::google::protobuf::RpcController* controller, } if (need_read_schema_dict) { - read_schema_from_dict(code, msg, instance_id, idx.index_id(), txn.get(), - response->mutable_rowset_meta()); + read_schema_dict(code, msg, instance_id, idx.index_id(), txn.get(), response, + request->schema_op()); if (code != MetaServiceCode::OK) return; } TEST_SYNC_POINT_CALLBACK("get_rowset::finish", &response); @@ -2095,6 +2095,58 @@ void MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl } } +void MetaServiceImpl::remove_delete_bitmap_update_lock( + google::protobuf::RpcController* controller, + const RemoveDeleteBitmapUpdateLockRequest* request, + RemoveDeleteBitmapUpdateLockResponse* response, ::google::protobuf::Closure* done) { + RPC_PREPROCESS(remove_delete_bitmap_update_lock); + std::string cloud_unique_id = request->has_cloud_unique_id() ? request->cloud_unique_id() : ""; + if (cloud_unique_id.empty()) { + code = MetaServiceCode::INVALID_ARGUMENT; + msg = "cloud unique id not set"; + return; + } + + instance_id = get_instance_id(resource_mgr_, cloud_unique_id); + if (instance_id.empty()) { + code = MetaServiceCode::INVALID_ARGUMENT; + msg = "empty instance_id"; + LOG(INFO) << msg << ", cloud_unique_id=" << cloud_unique_id; + return; + } + + RPC_RATE_LIMIT(remove_delete_bitmap_update_lock) + std::unique_ptr txn; + TxnErrorCode err = txn_kv_->create_txn(&txn); + if (err != TxnErrorCode::TXN_OK) { + code = cast_as(err); + msg = "failed to init txn"; + return; + } + if (!check_delete_bitmap_lock(code, msg, ss, txn, instance_id, request->table_id(), + request->lock_id(), request->initiator())) { + LOG(WARNING) << "failed to check delete bitmap tablet lock" + << " table_id=" << request->table_id() << " tablet_id=" << request->tablet_id() + << " request lock_id=" << request->lock_id() + << " request initiator=" << request->initiator() << " msg " << msg; + return; + } + std::string lock_key = + meta_delete_bitmap_update_lock_key({instance_id, request->table_id(), -1}); + txn->remove(lock_key); + err = txn->commit(); + if (err != TxnErrorCode::TXN_OK) { + code = cast_as(err); + ss << "failed to remove delete bitmap tablet lock , err=" << err; + msg = ss.str(); + return; + } + + LOG(INFO) << "remove delete bitmap table lock table_id=" << request->table_id() + << " tablet_id=" << request->tablet_id() << " lock_id=" << request->lock_id() + << ", key=" << hex(lock_key) << ", initiator=" << request->initiator(); +} + void MetaServiceImpl::remove_delete_bitmap(google::protobuf::RpcController* controller, const RemoveDeleteBitmapRequest* request, RemoveDeleteBitmapResponse* response, diff --git a/cloud/src/meta-service/meta_service.h b/cloud/src/meta-service/meta_service.h index edc2c97a3eaca8c..7af96cbc14b8ee8 100644 --- a/cloud/src/meta-service/meta_service.h +++ b/cloud/src/meta-service/meta_service.h @@ -278,6 +278,11 @@ class MetaServiceImpl : public cloud::MetaService { 
RemoveDeleteBitmapResponse* response, ::google::protobuf::Closure* done) override; + void remove_delete_bitmap_update_lock(google::protobuf::RpcController* controller, + const RemoveDeleteBitmapUpdateLockRequest* request, + RemoveDeleteBitmapUpdateLockResponse* response, + ::google::protobuf::Closure* done) override; + // cloud control get cluster's status by this api void get_cluster_status(google::protobuf::RpcController* controller, const GetClusterStatusRequest* request, @@ -654,6 +659,14 @@ class MetaServiceProxy final : public MetaService { call_impl(&cloud::MetaService::remove_delete_bitmap, controller, request, response, done); } + void remove_delete_bitmap_update_lock(google::protobuf::RpcController* controller, + const RemoveDeleteBitmapUpdateLockRequest* request, + RemoveDeleteBitmapUpdateLockResponse* response, + ::google::protobuf::Closure* done) override { + call_impl(&cloud::MetaService::remove_delete_bitmap_update_lock, controller, request, + response, done); + } + // cloud control get cluster's status by this api void get_cluster_status(google::protobuf::RpcController* controller, const GetClusterStatusRequest* request, diff --git a/cloud/src/meta-service/meta_service_schema.cpp b/cloud/src/meta-service/meta_service_schema.cpp index d99f026d051612e..ca0a15d8577b31a 100644 --- a/cloud/src/meta-service/meta_service_schema.cpp +++ b/cloud/src/meta-service/meta_service_schema.cpp @@ -292,11 +292,20 @@ void write_schema_dict(MetaServiceCode& code, std::string& msg, const std::strin } // Limit the size of dict value if (dict_val.size() > config::schema_dict_kv_size_limit) { - code = MetaServiceCode::KV_TXN_COMMIT_ERR; + code = MetaServiceCode::INVALID_ARGUMENT; ss << "Failed to write dictionary for saving, txn_id=" << rowset_meta->txn_id() << ", reached the limited size threshold of SchemaDictKeyList " << config::schema_dict_kv_size_limit; msg = ss.str(); + return; + } + // Limit the count of dict keys + if (dict.column_dict_size() > config::schema_dict_key_count_limit) { + code = MetaServiceCode::INVALID_ARGUMENT; + ss << "Reached max column size limit " << config::schema_dict_key_count_limit + << ", txn_id=" << rowset_meta->txn_id(); + msg = ss.str(); + return; } // splitting large values (>90*1000) into multiple KVs cloud::put(txn, dict_key, dict_val, 0); @@ -307,9 +316,9 @@ void write_schema_dict(MetaServiceCode& code, std::string& msg, const std::strin } } -void read_schema_from_dict(MetaServiceCode& code, std::string& msg, const std::string& instance_id, - int64_t index_id, Transaction* txn, - google::protobuf::RepeatedPtrField* rowset_metas) { +void read_schema_dict(MetaServiceCode& code, std::string& msg, const std::string& instance_id, + int64_t index_id, Transaction* txn, GetRowsetResponse* response, + GetRowsetRequest::SchemaOp schema_op) { std::stringstream ss; // read dict if any rowset has dict key list @@ -331,6 +340,12 @@ void read_schema_from_dict(MetaServiceCode& code, std::string& msg, const std::s LOG(INFO) << "Get schema_dict, column size=" << dict.column_dict_size() << ", index size=" << dict.index_dict_size(); + // Return dict, let backend to fill schema with dict info + if (schema_op == GetRowsetRequest::RETURN_DICT) { + response->mutable_schema_dict()->Swap(&dict); + return; + } + auto fill_schema_with_dict = [&](RowsetMetaCloudPB* out) { std::unordered_map unique_id_map; //init map @@ -366,7 +381,7 @@ void read_schema_from_dict(MetaServiceCode& code, std::string& msg, const std::s }; // fill rowsets's schema with dict info - for (auto& rowset_meta : 
*rowset_metas) { + for (auto& rowset_meta : *response->mutable_rowset_meta()) { if (rowset_meta.has_schema_dict_key_list()) { fill_schema_with_dict(&rowset_meta); } diff --git a/cloud/src/meta-service/meta_service_schema.h b/cloud/src/meta-service/meta_service_schema.h index d44f01f9747128c..ec1dcc6731f4586 100644 --- a/cloud/src/meta-service/meta_service_schema.h +++ b/cloud/src/meta-service/meta_service_schema.h @@ -35,8 +35,8 @@ void write_schema_dict(MetaServiceCode& code, std::string& msg, const std::strin Transaction* txn, RowsetMetaCloudPB* rowset_meta); // Read schema from dictionary metadata, modified to rowset_metas -void read_schema_from_dict(MetaServiceCode& code, std::string& msg, const std::string& instance_id, - int64_t index_id, Transaction* txn, - google::protobuf::RepeatedPtrField* rowset_metas); +void read_schema_dict(MetaServiceCode& code, std::string& msg, const std::string& instance_id, + int64_t index_id, Transaction* txn, GetRowsetResponse* response, + GetRowsetRequest::SchemaOp schema_op); } // namespace doris::cloud diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index cc333c428468a42..32f6b56f51af4cd 100644 --- a/cloud/src/meta-service/meta_service_txn.cpp +++ b/cloud/src/meta-service/meta_service_txn.cpp @@ -1167,7 +1167,7 @@ void commit_txn_immediately( // Accumulate affected rows auto& stats = tablet_stats[tablet_id]; - stats.data_size += i.data_disk_size(); + stats.data_size += i.total_disk_size(); stats.num_rows += i.num_rows(); ++stats.num_rowsets; stats.num_segs += i.num_segments(); diff --git a/cloud/test/meta_service_http_test.cpp b/cloud/test/meta_service_http_test.cpp index 20dee957126e4de..e49628fcb3a7839 100644 --- a/cloud/test/meta_service_http_test.cpp +++ b/cloud/test/meta_service_http_test.cpp @@ -320,6 +320,8 @@ static doris::RowsetMetaCloudPB create_rowset(int64_t txn_id, int64_t tablet_id, rowset.set_num_segments(1); rowset.set_num_rows(num_rows); rowset.set_data_disk_size(num_rows * 100); + rowset.set_index_disk_size(num_rows * 10); + rowset.set_total_disk_size(num_rows * 110); rowset.mutable_tablet_schema()->set_schema_version(0); rowset.set_txn_expiration(::time(nullptr)); // Required by DCHECK return rowset; @@ -1285,7 +1287,7 @@ TEST(MetaServiceHttpTest, GetTabletStatsTest) { stats_tablet_data_size_key({mock_instance, table_id, index_id, partition_id, tablet_id}, &data_size_key); ASSERT_EQ(txn->get(data_size_key, &data_size_val), TxnErrorCode::TXN_OK); - EXPECT_EQ(*(int64_t*)data_size_val.data(), 20000); + EXPECT_EQ(*(int64_t*)data_size_val.data(), 22000); std::string num_rows_key, num_rows_val; stats_tablet_num_rows_key({mock_instance, table_id, index_id, partition_id, tablet_id}, &num_rows_key); @@ -1306,7 +1308,7 @@ TEST(MetaServiceHttpTest, GetTabletStatsTest) { get_tablet_stats(meta_service.get(), table_id, index_id, partition_id, tablet_id, res); ASSERT_EQ(res.status().code(), MetaServiceCode::OK); ASSERT_EQ(res.tablet_stats_size(), 1); - EXPECT_EQ(res.tablet_stats(0).data_size(), 40000); + EXPECT_EQ(res.tablet_stats(0).data_size(), 44000); EXPECT_EQ(res.tablet_stats(0).num_rows(), 400); EXPECT_EQ(res.tablet_stats(0).num_rowsets(), 5); EXPECT_EQ(res.tablet_stats(0).num_segments(), 4); diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index 3baec482710bc49..ee90e604e1c5f67 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -178,6 +178,8 @@ static doris::RowsetMetaCloudPB create_rowset(int64_t txn_id, int64_t 
tablet_id, rowset.set_num_segments(1); rowset.set_num_rows(num_rows); rowset.set_data_disk_size(num_rows * 100); + rowset.set_index_disk_size(num_rows * 10); + rowset.set_total_disk_size(num_rows * 110); rowset.mutable_tablet_schema()->set_schema_version(0); rowset.set_txn_expiration(::time(nullptr)); // Required by DCHECK return rowset; @@ -4429,7 +4431,7 @@ TEST(MetaServiceTest, GetTabletStatsTest) { stats_tablet_data_size_key({mock_instance, table_id, index_id, partition_id, tablet_id}, &data_size_key); ASSERT_EQ(txn->get(data_size_key, &data_size_val), TxnErrorCode::TXN_OK); - EXPECT_EQ(*(int64_t*)data_size_val.data(), 20000); + EXPECT_EQ(*(int64_t*)data_size_val.data(), 22000); std::string num_rows_key, num_rows_val; stats_tablet_num_rows_key({mock_instance, table_id, index_id, partition_id, tablet_id}, &num_rows_key); @@ -4450,7 +4452,7 @@ TEST(MetaServiceTest, GetTabletStatsTest) { get_tablet_stats(meta_service.get(), table_id, index_id, partition_id, tablet_id, res); ASSERT_EQ(res.status().code(), MetaServiceCode::OK); ASSERT_EQ(res.tablet_stats_size(), 1); - EXPECT_EQ(res.tablet_stats(0).data_size(), 40000); + EXPECT_EQ(res.tablet_stats(0).data_size(), 44000); EXPECT_EQ(res.tablet_stats(0).num_rows(), 400); EXPECT_EQ(res.tablet_stats(0).num_rowsets(), 5); EXPECT_EQ(res.tablet_stats(0).num_segments(), 4); diff --git a/cloud/test/schema_kv_test.cpp b/cloud/test/schema_kv_test.cpp index 69ee9aba4422098..07f658175c806f1 100644 --- a/cloud/test/schema_kv_test.cpp +++ b/cloud/test/schema_kv_test.cpp @@ -293,6 +293,8 @@ static doris::RowsetMetaCloudPB create_rowset(int64_t txn_id, int64_t tablet_id, rowset.set_num_rows(100); rowset.set_num_segments(1); rowset.set_data_disk_size(10000); + rowset.set_index_disk_size(1000); + rowset.set_total_disk_size(11000); if (version > 0) { rowset.set_start_version(version); rowset.set_end_version(version); @@ -478,7 +480,7 @@ TEST(DetachSchemaKVTest, RowsetTest) { EXPECT_EQ(get_rowset_res.stats().num_rows(), 100); EXPECT_EQ(get_rowset_res.stats().num_rowsets(), 2); EXPECT_EQ(get_rowset_res.stats().num_segments(), 1); - EXPECT_EQ(get_rowset_res.stats().data_size(), 10000); + EXPECT_EQ(get_rowset_res.stats().data_size(), 11000); } // new MS read rowsets committed by both old and new MS @@ -527,7 +529,7 @@ TEST(DetachSchemaKVTest, RowsetTest) { EXPECT_EQ(get_rowset_res->stats().num_rows(), 2500); EXPECT_EQ(get_rowset_res->stats().num_rowsets(), 26); EXPECT_EQ(get_rowset_res->stats().num_segments(), 25); - EXPECT_EQ(get_rowset_res->stats().data_size(), 250000); + EXPECT_EQ(get_rowset_res->stats().data_size(), 275000); if (schema != nullptr) { auto schema_version = get_rowset_res->rowset_meta(10).schema_version(); get_rowset_res->mutable_rowset_meta(10)->mutable_tablet_schema()->set_schema_version(3); diff --git a/docker/runtime/doris-compose/Dockerfile b/docker/runtime/doris-compose/Dockerfile index 48d94d612dff60a..501574e372ee145 100644 --- a/docker/runtime/doris-compose/Dockerfile +++ b/docker/runtime/doris-compose/Dockerfile @@ -38,7 +38,7 @@ RUN sed -i s@/deb.debian.org/@/mirrors.aliyun.com/@g /etc/apt/sources.list RUN apt-get clean RUN apt-get update && \ - apt-get install -y default-mysql-client python lsof tzdata curl unzip patchelf jq procps util-linux && \ + apt-get install -y default-mysql-client python lsof tzdata curl unzip patchelf jq procps util-linux gosu && \ ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ dpkg-reconfigure -f noninteractive tzdata && \ apt-get clean @@ -48,14 +48,14 @@ RUN curl -f 
https://repo1.maven.org/maven2/org/jacoco/jacoco/${JACOCO_VERSION}/j unzip jacoco.zip -d /jacoco # cloud -COPY cloud/CMakeLists.txt cloud/output* output/ms* /opt/apache-doris/cloud/ +COPY --chmod=777 README.md cloud/output* output/ms* /opt/apache-doris/cloud/ RUN mkdir /opt/apache-doris/fdb RUN if [ -d /opt/apache-doris/cloud/bin ]; then \ sed -i 's/\/echo/g' /opt/apache-doris/cloud/bin/start.sh ; \ fi # fe and be -COPY output /opt/apache-doris/ +COPY --chmod=777 output /opt/apache-doris/ # in docker, run 'chmod 755 doris_be' first time cost 1min, remove it. RUN sed -i 's/\/echo/g' /opt/apache-doris/be/bin/start_be.sh diff --git a/docker/runtime/doris-compose/cluster.py b/docker/runtime/doris-compose/cluster.py index 6285e4c615c4c02..ba834167bd1c634 100644 --- a/docker/runtime/doris-compose/cluster.py +++ b/docker/runtime/doris-compose/cluster.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +import configparser import filelock import getpass import hashlib @@ -39,6 +40,7 @@ FE_RPC_PORT = 9020 FE_QUERY_PORT = 9030 FE_EDITLOG_PORT = 9010 +FE_JAVA_DBG_PORT = 5005 BE_PORT = 9060 BE_WEBSVR_PORT = 8040 @@ -250,8 +252,6 @@ def init_conf(self): path = self.get_path() os.makedirs(path, exist_ok=True) - config = self.get_add_init_config() - # copy config to local conf_dir = os.path.join(path, "conf") if not os.path.exists(conf_dir) or utils.is_dir_empty(conf_dir): @@ -259,12 +259,15 @@ def init_conf(self): assert not utils.is_dir_empty(conf_dir), "conf directory {} is empty, " \ "check doris path in image is correct".format(conf_dir) utils.enable_dir_with_rw_perm(conf_dir) + config = self.get_add_init_config() if config: with open(os.path.join(conf_dir, self.conf_file_name()), "a") as f: f.write("\n") + f.write("#### start doris-compose add config ####\n\n") for item in config: f.write(item + "\n") + f.write("\n#### end doris-compose add config ####\n") for sub_dir in self.expose_sub_dirs(): os.makedirs(os.path.join(path, sub_dir), exist_ok=True) @@ -325,32 +328,27 @@ def docker_env(self): enable_coverage = self.cluster.coverage_dir envs = { - "MY_IP": - self.get_ip(), - "MY_ID": - self.id, - "MY_TYPE": - self.node_type(), - "FE_QUERY_PORT": - FE_QUERY_PORT, - "FE_EDITLOG_PORT": - FE_EDITLOG_PORT, - "BE_HEARTBEAT_PORT": - BE_HEARTBEAT_PORT, - "DORIS_HOME": - os.path.join(self.docker_home_dir()), - "STOP_GRACE": - 1 if enable_coverage else 0, - "IS_CLOUD": - 1 if self.cluster.is_cloud else 0, - "SQL_MODE_NODE_MGR": - 1 if hasattr(self.cluster, 'sql_mode_node_mgr') - and self.cluster.sql_mode_node_mgr else 0 + "MY_IP": self.get_ip(), + "MY_ID": self.id, + "MY_TYPE": self.node_type(), + "FE_QUERY_PORT": FE_QUERY_PORT, + "FE_EDITLOG_PORT": FE_EDITLOG_PORT, + "BE_HEARTBEAT_PORT": BE_HEARTBEAT_PORT, + "DORIS_HOME": os.path.join(self.docker_home_dir()), + "STOP_GRACE": 1 if enable_coverage else 0, + "IS_CLOUD": 1 if self.cluster.is_cloud else 0, + "SQL_MODE_NODE_MGR": 1 if self.cluster.sql_mode_node_mgr else 0, } if self.cluster.is_cloud: envs["META_SERVICE_ENDPOINT"] = self.cluster.get_meta_server_addr() + # run as host user + if not getattr(self.cluster, 'is_root_user', True): + envs["HOST_USER"] = getpass.getuser() + envs["HOST_UID"] = os.getuid() + envs["HOST_GID"] = os.getgid() + if enable_coverage: outfile = "{}/coverage/{}-coverage-{}-{}".format( DOCKER_DORIS_PATH, self.node_type(), self.cluster.name, @@ -365,6 +363,15 @@ def docker_env(self): return envs + def entrypoint(self): + if self.start_script(): + return [ + "bash", + 
os.path.join(DOCKER_RESOURCE_PATH, "entrypoint.sh") + ] + self.start_script() + else: + return None + def get_add_init_config(self): return [] @@ -453,6 +460,18 @@ def get_add_init_config(self): cfg += [ "cloud_unique_id = " + self.cloud_unique_id(), ] + + with open("{}/conf/{}".format(self.get_path(), self.conf_file_name()), + "r") as f: + parser = configparser.ConfigParser() + parser.read_string('[dummy_section]\n' + f.read()) + for key in ("JAVA_OPTS", "JAVA_OPTS_FOR_JDK_17"): + value = parser["dummy_section"].get(key) + if value: + cfg.append( + f"{key} = \"{value} -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:{FE_JAVA_DBG_PORT}\"" + ) + return cfg def init_is_follower(self): @@ -474,11 +493,14 @@ def docker_env(self): def cloud_unique_id(self): return "sql_server_{}".format(self.id) - def entrypoint(self): - return ["bash", os.path.join(DOCKER_RESOURCE_PATH, "init_fe.sh")] + def start_script(self): + return ["init_fe.sh"] def docker_ports(self): - return [FE_HTTP_PORT, FE_EDITLOG_PORT, FE_RPC_PORT, FE_QUERY_PORT] + return [ + FE_HTTP_PORT, FE_EDITLOG_PORT, FE_RPC_PORT, FE_QUERY_PORT, + FE_JAVA_DBG_PORT + ] def docker_home_dir(self): return os.path.join(DOCKER_DORIS_PATH, "fe") @@ -572,8 +594,8 @@ def init_disk(self, be_disks): storage_root_path = ";".join(dir_descs) if dir_descs else '""' f.write("\nstorage_root_path = {}\n".format(storage_root_path)) - def entrypoint(self): - return ["bash", os.path.join(DOCKER_RESOURCE_PATH, "init_be.sh")] + def start_script(self): + return ["init_be.sh"] def docker_env(self): envs = super().docker_env() @@ -624,12 +646,8 @@ def get_add_init_config(self): cfg += self.cluster.ms_config return cfg - def entrypoint(self): - return [ - "bash", - os.path.join(DOCKER_RESOURCE_PATH, "init_cloud.sh"), - "--meta-service" - ] + def start_script(self): + return ["init_cloud.sh", "--meta-service"] def node_type(self): return Node.TYPE_MS @@ -649,11 +667,8 @@ def get_add_init_config(self): cfg += self.cluster.recycle_config return cfg - def entrypoint(self): - return [ - "bash", - os.path.join(DOCKER_RESOURCE_PATH, "init_cloud.sh"), "--recycler" - ] + def start_script(self): + return ["init_cloud.sh", "--recycler"] def node_type(self): return Node.TYPE_RECYCLE @@ -674,8 +689,8 @@ def copy_conf_to_local(self, local_conf_dir): with open(os.path.join(local_conf_dir, "fdb.cluster"), "w") as f: f.write(self.cluster.get_fdb_cluster()) - def entrypoint(self): - return ["bash", os.path.join(DOCKER_RESOURCE_PATH, "init_fdb.sh")] + def start_script(self): + return ["init_fdb.sh"] def docker_home_dir(self): return os.path.join(DOCKER_DORIS_PATH, "fdb") @@ -692,14 +707,15 @@ def expose_sub_dirs(self): class Cluster(object): - def __init__(self, name, subnet, image, is_cloud, fe_config, be_config, - ms_config, recycle_config, fe_follower, be_disks, be_cluster, - reg_be, coverage_dir, cloud_store_config, sql_mode_node_mgr, - be_metaservice_endpoint, be_cluster_id): + def __init__(self, name, subnet, image, is_cloud, is_root_user, fe_config, + be_config, ms_config, recycle_config, fe_follower, be_disks, + be_cluster, reg_be, coverage_dir, cloud_store_config, + sql_mode_node_mgr, be_metaservice_endpoint, be_cluster_id): self.name = name self.subnet = subnet self.image = image self.is_cloud = is_cloud + self.is_root_user = is_root_user self.fe_config = fe_config self.be_config = be_config self.ms_config = ms_config @@ -719,9 +735,9 @@ def __init__(self, name, subnet, image, is_cloud, fe_config, be_config, self.be_cluster_id = be_cluster_id @staticmethod - def 
new(name, image, is_cloud, fe_config, be_config, ms_config, - recycle_config, fe_follower, be_disks, be_cluster, reg_be, - coverage_dir, cloud_store_config, sql_mode_node_mgr, + def new(name, image, is_cloud, is_root_user, fe_config, be_config, + ms_config, recycle_config, fe_follower, be_disks, be_cluster, + reg_be, coverage_dir, cloud_store_config, sql_mode_node_mgr, be_metaservice_endpoint, be_cluster_id): if not os.path.exists(LOCAL_DORIS_PATH): os.makedirs(LOCAL_DORIS_PATH, exist_ok=True) @@ -731,8 +747,8 @@ def new(name, image, is_cloud, fe_config, be_config, ms_config, if os.getuid() == utils.get_path_uid(lock_file): os.chmod(lock_file, 0o666) subnet = gen_subnet_prefix16() - cluster = Cluster(name, subnet, image, is_cloud, fe_config, - be_config, ms_config, recycle_config, + cluster = Cluster(name, subnet, image, is_cloud, is_root_user, + fe_config, be_config, ms_config, recycle_config, fe_follower, be_disks, be_cluster, reg_be, coverage_dir, cloud_store_config, sql_mode_node_mgr, be_metaservice_endpoint, diff --git a/docker/runtime/doris-compose/command.py b/docker/runtime/doris-compose/command.py index 1e55e74b8a6cb3d..7a2f3f3c195f181 100644 --- a/docker/runtime/doris-compose/command.py +++ b/docker/runtime/doris-compose/command.py @@ -182,6 +182,18 @@ def _get_parser_bool_action(self, is_store_true): def _support_boolean_action(self): return sys.version_info.major == 3 and sys.version_info.minor >= 9 + def _print_table(self, header, datas): + if utils.is_enable_log(): + table = prettytable.PrettyTable( + [utils.render_green(field) for field in header]) + for row in datas: + table.add_row(row) + print(table) + return "" + else: + datas.insert(0, header) + return datas + class SimpleCommand(Command): @@ -211,7 +223,6 @@ def run(self, args): LOG.info( utils.render_green("{} succ, total related node num {}".format( show_cmd, related_node_num))) - return "" if for_all: related_nodes = cluster.get_all_nodes() @@ -261,6 +272,13 @@ def add_parser(self, args_parsers): help= "Create cloud cluster, default is false. Only use when creating new cluster." ) + parser.add_argument( + "--root", + default=False, + action=self._get_parser_bool_action(True), + help= + "Run cluster as root user, default is false, it will run as host user." 
+ ) parser.add_argument( "--wait-timeout", @@ -500,7 +518,7 @@ def run(self, args): args.add_recycle_num = 0 cluster = CLUSTER.Cluster.new( - args.NAME, args.IMAGE, args.cloud, args.fe_config, + args.NAME, args.IMAGE, args.cloud, args.root, args.fe_config, args.be_config, args.ms_config, args.recycle_config, args.fe_follower, args.be_disks, args.be_cluster, args.reg_be, args.coverage_dir, cloud_store_config, args.sql_mode_node_mgr, @@ -1022,18 +1040,6 @@ def add_parser(self, args_parsers): action=self._get_parser_bool_action(True), help="Print more detail fields.") - def _handle_data(self, header, datas): - if utils.is_enable_log(): - table = prettytable.PrettyTable( - [utils.render_green(field) for field in header]) - for row in datas: - table.add_row(row) - print(table) - return "" - else: - datas.insert(0, header) - return datas - def run(self, args): COMPOSE_MISSING = "(missing)" COMPOSE_BAD = "(bad)" @@ -1125,7 +1131,7 @@ def parse_cluster_compose_file(cluster_name): CLUSTER.get_master_fe_endpoint(name), is_cloud, "{}{}".format(compose_file, cluster_info["status"]))) - return self._handle_data(header, rows) + return self._print_table(header, rows) header = [ "CLUSTER", "NAME", "IP", "STATUS", "CONTAINER ID", "IMAGE", @@ -1212,48 +1218,42 @@ def get_node_seq(node): for node in sorted(nodes, key=get_node_seq): rows.append(node.info(args.detail)) - return self._handle_data(header, rows) + return self._print_table(header, rows) -class GetCloudIniCommand(Command): +class InfoCommand(Command): def add_parser(self, args_parsers): - parser = args_parsers.add_parser("get-cloud-ini", - help="Get cloud.init") - parser.add_argument( - "NAME", - nargs="*", - help= - "Specify multiple clusters, if specific, show all their containers." - ) + parser = args_parsers.add_parser( + "info", help="Show info like cloud.ini, port, path, etc") self._add_parser_common_args(parser) - def _handle_data(self, header, datas): - if utils.is_enable_log(): - table = prettytable.PrettyTable( - [utils.render_green(field) for field in header]) - for row in datas: - table.add_row(row) - print(table) - return "" - else: - datas.insert(0, header) - return datas - def run(self, args): - header = ["key", "value"] - - rows = [] + header = ["key", "value", "scope"] + cloud_cfg_file_env = os.getenv("DORIS_CLOUD_CFG_FILE") + cloud_cfg_file = cloud_cfg_file_env if cloud_cfg_file_env else "${LOCAL_DORIS_PATH}/cloud.ini" + rows = [ + ("LOCAL_DORIS_PATH", CLUSTER.LOCAL_DORIS_PATH, "env variable"), + ("DORIS_CLOUD_CFG_FILE", cloud_cfg_file, "env variable"), + ("FE_QUERY_PORT", CLUSTER.FE_QUERY_PORT, "constant"), + ("FE_HTTP_PORT", CLUSTER.FE_HTTP_PORT, "constant"), + ("FE_EDITLOG_PORT", CLUSTER.FE_EDITLOG_PORT, "constant"), + ("FE_JAVA_DBG_PORT", CLUSTER.FE_JAVA_DBG_PORT, "constant"), + ("BE_HEARTBEAT_PORT", CLUSTER.BE_HEARTBEAT_PORT, "constant"), + ("BE_WEBSVR_PORT", CLUSTER.BE_WEBSVR_PORT, "constant"), + ("MS_PORT", CLUSTER.MS_PORT, "constant"), + ("RECYCLER_PORT", CLUSTER.MS_PORT, "constant"), + ] with open(CLUSTER.CLOUD_CFG_FILE, "r") as f: for line in f: line = line.strip() if line and not line.startswith('#'): key, value = line.split('=', 1) - rows.append([key.strip(), value.strip()]) + rows.append((key.strip(), value.strip(), "cloud.ini")) - return self._handle_data(header, rows) + return self._print_table(header, rows) class AddRWPermCommand(Command): @@ -1278,8 +1278,8 @@ def run(self, args): NeedStartCommand("restart", "Restart the doris containers. "), SimpleCommand("pause", "Pause the doris containers. 
"), SimpleCommand("unpause", "Unpause the doris containers. "), - GetCloudIniCommand("get-cloud-ini"), GenConfCommand("config"), + InfoCommand("info"), ListCommand("ls"), AddRWPermCommand("add-rw-perm"), ] diff --git a/docker/runtime/doris-compose/resource/entrypoint.sh b/docker/runtime/doris-compose/resource/entrypoint.sh new file mode 100644 index 000000000000000..a3cdaaae8f1b525 --- /dev/null +++ b/docker/runtime/doris-compose/resource/entrypoint.sh @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +DIR=$( + cd $(dirname $0) + pwd +) + +source $DIR/common.sh + +RUN_USER=root + +create_host_user() { + if [ -z ${HOST_USER} ]; then + health_log "no specific run user, run as root" + return + fi + id ${HOST_USER} + if [ $? -eq 0 ]; then + health_log "contain user ${HOST_USER}, no create new user" + RUN_USER=${HOST_USER} + return + fi + id ${HOST_UID} + if [ $? -eq 0 ]; then + health_log "contain uid ${HOST_UID}, no create new user" + return + fi + addgroup --gid ${HOST_GID} ${HOST_USER} + if [ $? -eq 0 ]; then + health_log "create group ${HOST_USER} with gid ${HOST_GID} succ" + else + health_log "create group ${HOST_USER} with gid ${HOST_GID} failed" + return + fi + adduser --disabled-password --shell /bin/bash --gecos "" --uid ${HOST_UID} --gid ${HOST_GID} ${HOST_USER} + if [ $? 
-eq 0 ]; then + health_log "create user ${HOST_USER} with uid ${HOST_UID} succ" + RUN_USER=${HOST_USER} + else + health_log "create user ${HOST_USER} with uid ${HOST_UID} failed" + fi +} + +create_host_user + +if command -v gosu 2>&1 >/dev/null; then + if [ -f ${LOG_FILE} ]; then + chown ${RUN_USER}:${RUN_USER} ${LOG_FILE} + fi + gosu ${RUN_USER} bash ${DIR}/${1} ${@:2} +else + bash ${DIR}/${1} ${@:2} +fi diff --git a/docker/thirdparties/docker-compose/mysql/init/04-insert.sql b/docker/thirdparties/docker-compose/mysql/init/04-insert.sql index a852012fa94216a..677a041258de8fe 100644 --- a/docker/thirdparties/docker-compose/mysql/init/04-insert.sql +++ b/docker/thirdparties/docker-compose/mysql/init/04-insert.sql @@ -1049,6 +1049,7 @@ insert into doris_test.test1 values (false, 'abc', 'efg', '2022-10-01', 4.5, 1, 2, 1024, 100000, 1.2, '2022-10-02 12:59:01', 24.000); insert into doris_test.ex_tb0 values (111, 'abc'), (112, 'abd'), (113, 'abe'),(114, 'abf'),(115, 'abg'); +analyze table doris_test.ex_tb0; insert into doris_test.ex_tb1 values ('{"k1":"v1", "k2":"v2"}'); diff --git a/docker/thirdparties/docker-compose/postgresql/init/04-insert.sql b/docker/thirdparties/docker-compose/postgresql/init/04-insert.sql index c39e9924a75ea1c..ae3570dfc186ceb 100644 --- a/docker/thirdparties/docker-compose/postgresql/init/04-insert.sql +++ b/docker/thirdparties/docker-compose/postgresql/init/04-insert.sql @@ -1042,6 +1042,7 @@ insert into doris_test.test1 values (cast(0 as bit), 'abc', 'def', '2022-10-11', 1.234, 1, 2, 1022, '2022-10-22 10:59:59', 34.123), (cast(0 as bit), 'abc', 'def', '2022-10-11', 1.234, 1, 2, 1023, '2022-10-22 10:59:59', 34.123), (cast(0 as bit), 'abc', 'def', '2022-10-11', 1.234, 1, 2, 1024, '2022-10-22 10:59:59', 34.123); +analyze doris_test.test1; insert into doris_test.test2 values (123, 'zhangsan', '2022-01-01 01:02:03', 'zhangsan1', '2022-01-01 01:02:04', 111, 122, false, 'code', 'zhangsan2', 222, 'tag', 'remark'), diff --git a/docker/thirdparties/docker-compose/sqlserver/init/04-insert.sql b/docker/thirdparties/docker-compose/sqlserver/init/04-insert.sql index 930ad497dbabceb..f671bd230500804 100644 --- a/docker/thirdparties/docker-compose/sqlserver/init/04-insert.sql +++ b/docker/thirdparties/docker-compose/sqlserver/init/04-insert.sql @@ -17,6 +17,7 @@ use doris_test; Insert into dbo.student values (1, 'doris', 18), (2, 'alice', 19), (3, 'bob', 20); +UPDATE STATISTICS dbo.student; Insert into dbo.test_int values (1, 0, 1, 1), (2, 1, -1, -1), diff --git a/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/BaseJdbcExecutor.java b/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/BaseJdbcExecutor.java index 61382e6c2532f3d..e05a7baa00899f1 100644 --- a/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/BaseJdbcExecutor.java +++ b/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/BaseJdbcExecutor.java @@ -41,7 +41,6 @@ import java.sql.Connection; import java.sql.DatabaseMetaData; import java.sql.Date; -import java.sql.Driver; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.ResultSetMetaData; @@ -52,7 +51,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.function.Function; public abstract class BaseJdbcExecutor implements JdbcExecutor { @@ -94,8 +92,7 @@ public BaseJdbcExecutor(byte[] thriftParams) throws Exception { .setConnectionPoolMaxSize(request.connection_pool_max_size) 
.setConnectionPoolMaxWaitTime(request.connection_pool_max_wait_time) .setConnectionPoolMaxLifeTime(request.connection_pool_max_life_time) - .setConnectionPoolKeepAlive(request.connection_pool_keep_alive) - .setEnableConnectionPool(request.enable_connection_pool); + .setConnectionPoolKeepAlive(request.connection_pool_keep_alive); JdbcDataSource.getDataSource().setCleanupInterval(request.connection_pool_cache_clear_time); System.setProperty("com.zaxxer.hikari.useWeakReferences", "true"); init(config, request.statement); @@ -120,21 +117,25 @@ public void close() throws Exception { } } finally { closeResources(resultSet, stmt, conn); - if (config.isEnableConnectionPool()) { - if (config.getConnectionPoolMinSize() == 0 && hikariDataSource != null) { - hikariDataSource.close(); - JdbcDataSource.getDataSource().getSourcesMap().remove(config.createCacheKey()); - hikariDataSource = null; - } + if (config.getConnectionPoolMinSize() == 0 && hikariDataSource != null) { + hikariDataSource.close(); + JdbcDataSource.getDataSource().getSourcesMap().remove(config.createCacheKey()); + hikariDataSource = null; } } } - private void closeResources(AutoCloseable... closeables) { - for (AutoCloseable closeable : closeables) { - if (closeable != null) { + private void closeResources(Object... resources) { + for (Object resource : resources) { + if (resource != null) { try { - closeable.close(); + if (resource instanceof ResultSet) { + ((ResultSet) resource).close(); + } else if (resource instanceof Statement) { + ((Statement) resource).close(); + } else if (resource instanceof Connection) { + ((Connection) resource).close(); + } } catch (Exception e) { LOG.warn("Cannot close resource: ", e); } @@ -147,12 +148,10 @@ protected void abortReadConnection(Connection connection, ResultSet resultSet) } public void cleanDataSource() { - if (config.isEnableConnectionPool()) { - if (hikariDataSource != null) { - hikariDataSource.close(); - JdbcDataSource.getDataSource().getSourcesMap().remove(config.createCacheKey()); - hikariDataSource = null; - } + if (hikariDataSource != null) { + hikariDataSource.close(); + JdbcDataSource.getDataSource().getSourcesMap().remove(config.createCacheKey()); + hikariDataSource = null; } } @@ -294,64 +293,51 @@ public boolean hasNext() throws JdbcExecutorException { private void init(JdbcDataSourceConfig config, String sql) throws JdbcExecutorException { ClassLoader oldClassLoader = Thread.currentThread().getContextClassLoader(); + String hikariDataSourceKey = config.createCacheKey(); try { ClassLoader parent = getClass().getClassLoader(); ClassLoader classLoader = UdfUtils.getClassLoader(config.getJdbcDriverUrl(), parent); Thread.currentThread().setContextClassLoader(classLoader); - if (config.isEnableConnectionPool()) { - String hikariDataSourceKey = config.createCacheKey(); - hikariDataSource = JdbcDataSource.getDataSource().getSource(hikariDataSourceKey); - if (hikariDataSource == null) { - synchronized (hikariDataSourceLock) { - hikariDataSource = JdbcDataSource.getDataSource().getSource(hikariDataSourceKey); - if (hikariDataSource == null) { - long start = System.currentTimeMillis(); - HikariDataSource ds = new HikariDataSource(); - ds.setDriverClassName(config.getJdbcDriverClass()); - ds.setJdbcUrl(SecurityChecker.getInstance().getSafeJdbcUrl(config.getJdbcUrl())); - ds.setUsername(config.getJdbcUser()); - ds.setPassword(config.getJdbcPassword()); - ds.setMinimumIdle(config.getConnectionPoolMinSize()); // default 1 - ds.setMaximumPoolSize(config.getConnectionPoolMaxSize()); // default 
10 - ds.setConnectionTimeout(config.getConnectionPoolMaxWaitTime()); // default 5000 - ds.setMaxLifetime(config.getConnectionPoolMaxLifeTime()); // default 30 min - ds.setIdleTimeout(config.getConnectionPoolMaxLifeTime() / 2L); // default 15 min - setValidationQuery(ds); - if (config.isConnectionPoolKeepAlive()) { - ds.setKeepaliveTime(config.getConnectionPoolMaxLifeTime() / 5L); // default 6 min - } - hikariDataSource = ds; - JdbcDataSource.getDataSource().putSource(hikariDataSourceKey, hikariDataSource); - LOG.info("JdbcClient set" - + " ConnectionPoolMinSize = " + config.getConnectionPoolMinSize() - + ", ConnectionPoolMaxSize = " + config.getConnectionPoolMaxSize() - + ", ConnectionPoolMaxWaitTime = " + config.getConnectionPoolMaxWaitTime() - + ", ConnectionPoolMaxLifeTime = " + config.getConnectionPoolMaxLifeTime() - + ", ConnectionPoolKeepAlive = " + config.isConnectionPoolKeepAlive()); - LOG.info("init datasource [" + (config.getJdbcUrl() + config.getJdbcUser()) + "] cost: " + ( - System.currentTimeMillis() - start) + " ms"); + hikariDataSource = JdbcDataSource.getDataSource().getSource(hikariDataSourceKey); + if (hikariDataSource == null) { + synchronized (hikariDataSourceLock) { + hikariDataSource = JdbcDataSource.getDataSource().getSource(hikariDataSourceKey); + if (hikariDataSource == null) { + long start = System.currentTimeMillis(); + HikariDataSource ds = new HikariDataSource(); + ds.setDriverClassName(config.getJdbcDriverClass()); + ds.setJdbcUrl(SecurityChecker.getInstance().getSafeJdbcUrl(config.getJdbcUrl())); + ds.setUsername(config.getJdbcUser()); + ds.setPassword(config.getJdbcPassword()); + ds.setMinimumIdle(config.getConnectionPoolMinSize()); // default 1 + ds.setMaximumPoolSize(config.getConnectionPoolMaxSize()); // default 10 + ds.setConnectionTimeout(config.getConnectionPoolMaxWaitTime()); // default 5000 + ds.setMaxLifetime(config.getConnectionPoolMaxLifeTime()); // default 30 min + ds.setIdleTimeout(config.getConnectionPoolMaxLifeTime() / 2L); // default 15 min + setValidationQuery(ds); + if (config.isConnectionPoolKeepAlive()) { + ds.setKeepaliveTime(config.getConnectionPoolMaxLifeTime() / 5L); // default 6 min } + hikariDataSource = ds; + JdbcDataSource.getDataSource().putSource(hikariDataSourceKey, hikariDataSource); + LOG.info("JdbcClient set" + + " ConnectionPoolMinSize = " + config.getConnectionPoolMinSize() + + ", ConnectionPoolMaxSize = " + config.getConnectionPoolMaxSize() + + ", ConnectionPoolMaxWaitTime = " + config.getConnectionPoolMaxWaitTime() + + ", ConnectionPoolMaxLifeTime = " + config.getConnectionPoolMaxLifeTime() + + ", ConnectionPoolKeepAlive = " + config.isConnectionPoolKeepAlive()); + LOG.info("init datasource [" + (config.getJdbcUrl() + config.getJdbcUser()) + "] cost: " + ( + System.currentTimeMillis() - start) + " ms"); } } - conn = hikariDataSource.getConnection(); - } else { - Class driverClass = Class.forName(config.getJdbcDriverClass(), true, classLoader); - Driver driverInstance = (Driver) driverClass.getDeclaredConstructor().newInstance(); - - Properties info = new Properties(); - info.put("user", config.getJdbcUser()); - info.put("password", config.getJdbcPassword()); - - conn = driverInstance.connect(SecurityChecker.getInstance().getSafeJdbcUrl(config.getJdbcUrl()), info); - if (conn == null) { - throw new SQLException("Failed to establish a connection. The JDBC driver returned null. " - + "Please check if the JDBC URL is correct: " - + config.getJdbcUrl() - + ". 
Ensure that the URL format and parameters are valid for the driver: " - + driverInstance.getClass().getName()); - } } + long start = System.currentTimeMillis(); + conn = hikariDataSource.getConnection(); + LOG.info("get connection [" + (config.getJdbcUrl() + config.getJdbcUser()) + "] cost: " + ( + System.currentTimeMillis() - start) + + " ms"); + initializeStatement(conn, config, sql); } catch (MalformedURLException e) { diff --git a/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/JdbcDataSourceConfig.java b/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/JdbcDataSourceConfig.java index 30e94ddd37f49dd..a99377add2532db 100644 --- a/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/JdbcDataSourceConfig.java +++ b/fe/be-java-extensions/jdbc-scanner/src/main/java/org/apache/doris/jdbc/JdbcDataSourceConfig.java @@ -35,7 +35,6 @@ public class JdbcDataSourceConfig { private int connectionPoolMaxWaitTime = 5000; private int connectionPoolMaxLifeTime = 1800000; private boolean connectionPoolKeepAlive = false; - private boolean enableConnectionPool = false; public String createCacheKey() { return catalogId + jdbcUrl + jdbcUser + jdbcPassword + jdbcDriverUrl + jdbcDriverClass @@ -168,13 +167,4 @@ public JdbcDataSourceConfig setConnectionPoolKeepAlive(boolean connectionPoolKee this.connectionPoolKeepAlive = connectionPoolKeepAlive; return this; } - - public boolean isEnableConnectionPool() { - return enableConnectionPool; - } - - public JdbcDataSourceConfig setEnableConnectionPool(boolean enableConnectionPool) { - this.enableConnectionPool = enableConnectionPool; - return this; - } } diff --git a/fe/be-java-extensions/paimon-scanner/src/main/java/org/apache/doris/paimon/PaimonJniScanner.java b/fe/be-java-extensions/paimon-scanner/src/main/java/org/apache/doris/paimon/PaimonJniScanner.java index f229134e9d83193..7bd9fa631c8da3e 100644 --- a/fe/be-java-extensions/paimon-scanner/src/main/java/org/apache/doris/paimon/PaimonJniScanner.java +++ b/fe/be-java-extensions/paimon-scanner/src/main/java/org/apache/doris/paimon/PaimonJniScanner.java @@ -125,7 +125,7 @@ private void initReader() throws IOException { int[] projected = getProjected(); readBuilder.withProjection(projected); readBuilder.withFilter(getPredicates()); - reader = readBuilder.newRead().createReader(getSplit()); + reader = readBuilder.newRead().executeFilter().createReader(getSplit()); paimonDataTypeList = Arrays.stream(projected).mapToObj(i -> table.rowType().getTypeAt(i)).collect(Collectors.toList()); } diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index f4e96bbd7a83661..dd0aca5923e74a4 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2033,6 +2033,12 @@ public class Config extends ConfigBase { @ConfField(mutable = true, masterOnly = true) public static long max_backend_heartbeat_failure_tolerance_count = 1; + /** + * Even if a backend is healthy, still write a heartbeat editlog to update backend's lastUpdateMs of bdb image. + */ + @ConfField(mutable = true, masterOnly = true) + public static int editlog_healthy_heartbeat_seconds = 300; + /** * Abort transaction time after lost heartbeat. * The default value is 300s, which means transactions of be will be aborted after lost heartbeat 300s. 
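The new editlog_healthy_heartbeat_seconds option above is declared mutable and masterOnly, so besides fe.conf it should also be adjustable at runtime on the master FE. A minimal sketch using Doris's generic FE-config statements (the 600-second value is purely illustrative, not a recommendation from this patch):

    -- inspect the current value on the master FE
    ADMIN SHOW FRONTEND CONFIG LIKE 'editlog_healthy_heartbeat_seconds';
    -- persist a healthy-backend heartbeat editlog at most every 10 minutes instead of every 5
    ADMIN SET FRONTEND CONFIG ("editlog_healthy_heartbeat_seconds" = "600");

Per the comment above, the point of the knob is that heartbeats from healthy backends are still written to the editlog periodically, so the lastUpdateMs carried in the bdb image does not go stale.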
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 47a45b67aa7b365..8ce8d033108367f 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -429,6 +429,7 @@ QUANTILE_STATE: 'QUANTILE_STATE'; QUANTILE_UNION: 'QUANTILE_UNION'; QUERY: 'QUERY'; QUOTA: 'QUOTA'; +QUALIFY: 'QUALIFY'; RANDOM: 'RANDOM'; RANGE: 'RANGE'; READ: 'READ'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index f0e56dde174a801..d1e80c27958f17d 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -54,6 +54,7 @@ statementBase | constraintStatement #constraintStatementAlias | supportedDropStatement #supportedDropStatementAlias | supportedSetStatement #supportedSetStatementAlias + | supportedUnsetStatement #supportedUnsetStatementAlias | unsupportedStatement #unsupported | supportedShowStatement #supportedShowStatementAlias ; @@ -62,8 +63,7 @@ supportedShowStatement :SHOW ROLES #showRoles ; unsupportedStatement - : unsupoortedUnsetStatement - | unsupportedUseStatement + : unsupportedUseStatement | unsupportedDmlStatement | unsupportedKillStatement | unsupportedDescribeStatement @@ -843,7 +843,7 @@ isolationLevel : ISOLATION LEVEL ((READ UNCOMMITTED) | (READ COMMITTED) | (REPEATABLE READ) | (SERIALIZABLE)) ; -unsupoortedUnsetStatement +supportedUnsetStatement : UNSET (GLOBAL | SESSION | LOCAL)? VARIABLE (ALL | identifier) | UNSET DEFAULT STORAGE VAULT ; @@ -1121,6 +1121,7 @@ querySpecification whereClause? aggClause? havingClause? + qualifyClause? {doris_legacy_SQL_syntax}? queryOrganization #regularQuerySpecification ; @@ -1207,6 +1208,10 @@ havingClause : HAVING booleanExpression ; +qualifyClause + : QUALIFY booleanExpression + ; + selectHint: hintStatements+=hintStatement (COMMA? hintStatements+=hintStatement)* HINT_END; hintStatement @@ -2022,6 +2027,7 @@ nonReserved | QUANTILE_UNION | QUERY | QUOTA + | QUALIFY | RANDOM | RECENT | RECOVER diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java index 916cfd05bc9c079..0981e3e6538a6d5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java @@ -2135,6 +2135,7 @@ public int getAsInt() { index.setIndexId(existedIdx.getIndexId()); index.setColumns(existedIdx.getColumns()); index.setProperties(existedIdx.getProperties()); + index.setColumnUniqueIds(existedIdx.getColumnUniqueIds()); if (indexDef.getPartitionNames().isEmpty()) { invertedIndexOnPartitions.put(index.getIndexId(), olapTable.getPartitionNames()); } else { @@ -2735,6 +2736,7 @@ private boolean processAddIndex(CreateIndexClause alterClause, OlapTable olapTab if (column != null) { indexDef.checkColumn(column, olapTable.getKeysType(), olapTable.getTableProperty().getEnableUniqueKeyMergeOnWrite()); + indexDef.getColumnUniqueIds().add(column.getUniqueId()); } else { throw new DdlException("index column does not exist in table. 
invalid column: " + col); } @@ -2745,6 +2747,7 @@ private boolean processAddIndex(CreateIndexClause alterClause, OlapTable olapTab // so here update column name in CreateIndexClause after checkColumn for indexDef, // there will use the column name in olapTable insead of the column name in CreateIndexClause. alterIndex.setColumns(indexDef.getColumns()); + alterIndex.setColumnUniqueIds(indexDef.getColumnUniqueIds()); newIndexes.add(alterIndex); return false; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/BuildIndexClause.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/BuildIndexClause.java index cb7ec08de78f9c8..c65766a1ae87211 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/BuildIndexClause.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/BuildIndexClause.java @@ -73,7 +73,7 @@ public void analyze(Analyzer analyzer) throws AnalysisException { indexDef.analyze(); this.index = new Index(Env.getCurrentEnv().getNextId(), indexDef.getIndexName(), indexDef.getColumns(), indexDef.getIndexType(), - indexDef.getProperties(), indexDef.getComment()); + indexDef.getProperties(), indexDef.getComment(), indexDef.getColumnUniqueIds()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateIndexClause.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateIndexClause.java index b39c0df4a85db5a..86df87453ad5751 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateIndexClause.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateIndexClause.java @@ -73,7 +73,7 @@ public void analyze(Analyzer analyzer) throws AnalysisException { indexDef.analyze(); this.index = new Index(Env.getCurrentEnv().getNextId(), indexDef.getIndexName(), indexDef.getColumns(), indexDef.getIndexType(), - indexDef.getProperties(), indexDef.getComment()); + indexDef.getProperties(), indexDef.getComment(), indexDef.getColumnUniqueIds()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java index f92da90e5c43bd3..b07424056906b87 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java @@ -618,7 +618,8 @@ public void analyze(Analyzer analyzer) throws UserException { } } indexes.add(new Index(Env.getCurrentEnv().getNextId(), indexDef.getIndexName(), indexDef.getColumns(), - indexDef.getIndexType(), indexDef.getProperties(), indexDef.getComment())); + indexDef.getIndexType(), indexDef.getProperties(), indexDef.getComment(), + indexDef.getColumnUniqueIds())); distinct.add(indexDef.getIndexName()); distinctCol.add(Pair.of(indexDef.getIndexType(), indexDef.getColumns().stream().map(String::toUpperCase).collect(Collectors.toList()))); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java index a9ce85b2d3e078f..ba7aa50ec69595e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java @@ -208,7 +208,7 @@ public void analyze(Analyzer analyzer) throws UserException { } private void setJob() throws UserException { - exportJob = new ExportJob(); + exportJob = new ExportJob(Env.getCurrentEnv().getNextId()); Database db = Env.getCurrentInternalCatalog().getDbOrDdlException(this.tblName.getDb()); exportJob.setDbId(db.getId()); 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java index 87bf7c5aa189cbd..b2ee45372973bfb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java @@ -42,6 +42,7 @@ public class IndexDef { private Map properties; private boolean isBuildDeferred = false; private PartitionNames partitionNames; + private List columnUniqueIds = Lists.newArrayList(); public static final String NGRAM_SIZE_KEY = "gram_size"; public static final String NGRAM_BF_SIZE_KEY = "bf_size"; @@ -196,6 +197,10 @@ public List getPartitionNames() { return partitionNames == null ? Lists.newArrayList() : partitionNames.getPartitionNames(); } + public List getColumnUniqueIds() { + return columnUniqueIds; + } + public enum IndexType { BITMAP, INVERTED, diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index ef23713a5010b49..cd0c0e80d8f27ed 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -254,7 +254,6 @@ import org.apache.doris.resource.workloadschedpolicy.WorkloadSchedPolicyMgr; import org.apache.doris.resource.workloadschedpolicy.WorkloadSchedPolicyPublisher; import org.apache.doris.scheduler.manager.TransientTaskManager; -import org.apache.doris.scheduler.registry.ExportTaskRegister; import org.apache.doris.service.ExecuteEnv; import org.apache.doris.service.FrontendOptions; import org.apache.doris.statistics.AnalysisManager; @@ -395,7 +394,6 @@ public class Env { private ExternalMetaIdMgr externalMetaIdMgr; private MetastoreEventsProcessor metastoreEventsProcessor; - private ExportTaskRegister exportTaskRegister; private JobManager, ?> jobManager; private LabelProcessor labelProcessor; private TransientTaskManager transientTaskManager; @@ -709,7 +707,6 @@ public Env(boolean isCheckpointCatalog) { this.jobManager = new JobManager<>(); this.labelProcessor = new LabelProcessor(); this.transientTaskManager = new TransientTaskManager(); - this.exportTaskRegister = new ExportTaskRegister(transientTaskManager); this.replayedJournalId = new AtomicLong(0L); this.stmtIdCounter = new AtomicLong(0L); @@ -4425,11 +4422,6 @@ public SyncJobManager getSyncJobManager() { return this.syncJobManager; } - - public ExportTaskRegister getExportTaskRegister() { - return exportTaskRegister; - } - public JobManager getJobManager() { return jobManager; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java index 56a878c8f93948a..40db2f1d5b01d50 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java @@ -48,7 +48,7 @@ /** * Internal representation of index, including index type, name, columns and comments. 
- * This class will used in olaptable + * This class will be used in olap table */ public class Index implements Writable { public static final int INDEX_ID_INIT_VALUE = -1; @@ -65,15 +65,19 @@ public class Index implements Writable { private Map properties; @SerializedName(value = "ct", alternate = {"comment"}) private String comment; + @SerializedName(value = "cui", alternate = {"columnUniqueIds"}) + private List columnUniqueIds; public Index(long indexId, String indexName, List columns, - IndexDef.IndexType indexType, Map properties, String comment) { + IndexDef.IndexType indexType, Map properties, String comment, + List columnUniqueIds) { this.indexId = indexId; this.indexName = indexName; this.columns = columns == null ? Lists.newArrayList() : Lists.newArrayList(columns); this.indexType = indexType; this.properties = properties == null ? Maps.newHashMap() : Maps.newHashMap(properties); this.comment = comment; + this.columnUniqueIds = columnUniqueIds == null ? Lists.newArrayList() : Lists.newArrayList(columnUniqueIds); if (indexType == IndexDef.IndexType.INVERTED) { if (this.properties != null && !this.properties.isEmpty()) { if (this.properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY)) { @@ -97,6 +101,7 @@ public Index() { this.indexType = null; this.properties = null; this.comment = null; + this.columnUniqueIds = null; } public long getIndexId() { @@ -186,6 +191,14 @@ public void setComment(String comment) { this.comment = comment; } + public List getColumnUniqueIds() { + return columnUniqueIds; + } + + public void setColumnUniqueIds(List columnUniqueIds) { + this.columnUniqueIds = columnUniqueIds; + } + @Override public void write(DataOutput out) throws IOException { Text.writeString(out, GsonUtils.GSON.toJson(this)); @@ -203,7 +216,7 @@ public int hashCode() { public Index clone() { return new Index(indexId, indexName, new ArrayList<>(columns), - indexType, new HashMap<>(properties), comment); + indexType, new HashMap<>(properties), comment, columnUniqueIds); } @Override @@ -247,6 +260,7 @@ public TOlapTableIndex toThrift() { if (properties != null) { tIndex.setProperties(properties); } + tIndex.setColumnUniqueIds(columnUniqueIds); return tIndex; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java index 768ae22d202dc47..a571334660a5fa7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java @@ -139,9 +139,23 @@ public class InternalSchema { AUDIT_SCHEMA.add(new ColumnDef("scan_rows", TypeDef.create(PrimitiveType.BIGINT), ColumnNullableType.NULLABLE)); AUDIT_SCHEMA .add(new ColumnDef("return_rows", TypeDef.create(PrimitiveType.BIGINT), ColumnNullableType.NULLABLE)); + AUDIT_SCHEMA + .add(new ColumnDef("shuffle_send_rows", TypeDef.create(PrimitiveType.BIGINT), + ColumnNullableType.NULLABLE)); + AUDIT_SCHEMA + .add(new ColumnDef("shuffle_send_bytes", TypeDef.create(PrimitiveType.BIGINT), + ColumnNullableType.NULLABLE)); + AUDIT_SCHEMA + .add(new ColumnDef("scan_bytes_from_local_storage", TypeDef.create(PrimitiveType.BIGINT), + ColumnNullableType.NULLABLE)); + AUDIT_SCHEMA + .add(new ColumnDef("scan_bytes_from_remote_storage", TypeDef.create(PrimitiveType.BIGINT), + ColumnNullableType.NULLABLE)); AUDIT_SCHEMA.add(new ColumnDef("stmt_id", TypeDef.create(PrimitiveType.BIGINT), ColumnNullableType.NULLABLE)); AUDIT_SCHEMA.add(new ColumnDef("stmt_type", 
TypeDef.createVarchar(48), ColumnNullableType.NULLABLE)); AUDIT_SCHEMA.add(new ColumnDef("is_query", TypeDef.create(PrimitiveType.TINYINT), ColumnNullableType.NULLABLE)); + AUDIT_SCHEMA.add( + new ColumnDef("is_nereids", TypeDef.create(PrimitiveType.TINYINT), ColumnNullableType.NULLABLE)); AUDIT_SCHEMA.add(new ColumnDef("frontend_ip", TypeDef.createVarchar(128), ColumnNullableType.NULLABLE)); AUDIT_SCHEMA .add(new ColumnDef("cpu_time_ms", TypeDef.create(PrimitiveType.BIGINT), ColumnNullableType.NULLABLE)); @@ -151,6 +165,9 @@ public class InternalSchema { new ColumnDef("peak_memory_bytes", TypeDef.create(PrimitiveType.BIGINT), ColumnNullableType.NULLABLE)); AUDIT_SCHEMA.add( new ColumnDef("workload_group", TypeDef.create(PrimitiveType.STRING), ColumnNullableType.NULLABLE)); + AUDIT_SCHEMA.add( + new ColumnDef("compute_group", TypeDef.create(PrimitiveType.STRING), ColumnNullableType.NULLABLE)); + // Keep stmt as last column. So that in fe.audit.log, it will be easier to get sql string AUDIT_SCHEMA.add(new ColumnDef("stmt", TypeDef.create(PrimitiveType.STRING), ColumnNullableType.NULLABLE)); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcResource.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcResource.java index b7db351f49a4d0a..28d58b35297ac3e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcResource.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcResource.java @@ -108,7 +108,6 @@ public class JdbcResource extends Resource { public static final String CHECK_SUM = "checksum"; public static final String CREATE_TIME = "create_time"; public static final String TEST_CONNECTION = "test_connection"; - public static final String ENABLE_CONNECTION_POOL = "enable_connection_pool"; private static final ImmutableList ALL_PROPERTIES = new ImmutableList.Builder().add( JDBC_URL, @@ -129,8 +128,7 @@ public class JdbcResource extends Resource { CONNECTION_POOL_MAX_WAIT_TIME, CONNECTION_POOL_KEEP_ALIVE, TEST_CONNECTION, - ExternalCatalog.USE_META_CACHE, - ENABLE_CONNECTION_POOL + ExternalCatalog.USE_META_CACHE ).build(); // The default value of optional properties @@ -151,7 +149,6 @@ public class JdbcResource extends Resource { OPTIONAL_PROPERTIES_DEFAULT_VALUE.put(TEST_CONNECTION, "true"); OPTIONAL_PROPERTIES_DEFAULT_VALUE.put(ExternalCatalog.USE_META_CACHE, String.valueOf(ExternalCatalog.DEFAULT_USE_META_CACHE)); - OPTIONAL_PROPERTIES_DEFAULT_VALUE.put(ENABLE_CONNECTION_POOL, "false"); } // timeout for both connection and read. 10 seconds is long enough. 
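With enable_connection_pool removed from JdbcResource above (and the matching non-pooled branch removed from BaseJdbcExecutor), HikariCP pooling is always in effect for JDBC catalogs; only the pool-sizing properties remain user-facing. A hedged sketch of supplying those knobs when creating a JDBC catalog, with the defaults quoted in the BaseJdbcExecutor comments spelled out explicitly; the catalog name, credentials, URLs and driver jar are placeholders, and the property-key spellings are assumed to match the JdbcResource constants listed above:

    CREATE CATALOG mysql_jdbc PROPERTIES (
        "type" = "jdbc",
        "user" = "root",
        "password" = "example_password",
        "jdbc_url" = "jdbc:mysql://127.0.0.1:3306/demo",
        "driver_url" = "mysql-connector-java-8.0.25.jar",
        "driver_class" = "com.mysql.cj.jdbc.Driver",
        -- "enable_connection_pool" is no longer accepted; pooling is always on and only tunable below
        "connection_pool_min_size" = "1",
        "connection_pool_max_size" = "10",
        "connection_pool_max_wait_time" = "5000",
        "connection_pool_max_life_time" = "1800000",
        "connection_pool_keep_alive" = "false"
    );

The design choice is to make pooling the only code path, which is what lets the patch drop the driver-managed single-connection branch that enable_connection_pool=false used to select.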
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcTable.java index 7c3678c8ed8c6cf..6dce40a2684fbca 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/JdbcTable.java @@ -97,7 +97,6 @@ public class JdbcTable extends Table { @SerializedName("cid") private long catalogId = -1; - private boolean enableConnectionPool; private int connectionPoolMinSize; private int connectionPoolMaxSize; private int connectionPoolMaxWaitTime; @@ -191,11 +190,6 @@ public long getCatalogId() { return catalogId; } - public boolean isEnableConnectionPool() { - return Boolean.parseBoolean(getFromJdbcResourceOrDefault(JdbcResource.ENABLE_CONNECTION_POOL, - String.valueOf(enableConnectionPool))); - } - public int getConnectionPoolMinSize() { return Integer.parseInt(getFromJdbcResourceOrDefault(JdbcResource.CONNECTION_POOL_MIN_SIZE, String.valueOf(connectionPoolMinSize))); @@ -244,7 +238,6 @@ public TTableDescriptor toThrift() { tJdbcTable.setJdbcDriverUrl(getDriverUrl()); tJdbcTable.setJdbcResourceName(resourceName); tJdbcTable.setJdbcDriverChecksum(checkSum); - tJdbcTable.setEnableConnectionPool(isEnableConnectionPool()); tJdbcTable.setConnectionPoolMinSize(getConnectionPoolMinSize()); tJdbcTable.setConnectionPoolMaxSize(getConnectionPoolMaxSize()); tJdbcTable.setConnectionPoolMaxWaitTime(getConnectionPoolMaxWaitTime()); @@ -401,7 +394,6 @@ private void validate(Map properties) throws DdlException { driverClass = jdbcResource.getProperty(DRIVER_CLASS); driverUrl = jdbcResource.getProperty(DRIVER_URL); checkSum = jdbcResource.getProperty(CHECK_SUM); - enableConnectionPool = Boolean.parseBoolean(jdbcResource.getProperty(JdbcResource.ENABLE_CONNECTION_POOL)); connectionPoolMinSize = Integer.parseInt(jdbcResource.getProperty(JdbcResource.CONNECTION_POOL_MIN_SIZE)); connectionPoolMaxSize = Integer.parseInt(jdbcResource.getProperty(JdbcResource.CONNECTION_POOL_MAX_SIZE)); connectionPoolMaxWaitTime = Integer.parseInt( diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/MaterializedIndexMeta.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/MaterializedIndexMeta.java index 6125e0334003c55..5dd5776c7619021 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/MaterializedIndexMeta.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/MaterializedIndexMeta.java @@ -387,6 +387,13 @@ public void initSchemaColumnUniqueId() { maxColUniqueId = Column.COLUMN_UNIQUE_ID_INIT_VALUE; this.schema.forEach(column -> { column.setUniqueId(incAndGetMaxColUniqueId()); + this.indexes.forEach(index -> { + index.getColumns().forEach(col -> { + if (col.equalsIgnoreCase(column.getName())) { + index.getColumnUniqueIds().add(column.getUniqueId()); + } + }); + }); if (LOG.isDebugEnabled()) { LOG.debug("indexId: {}, column:{}, uniqueId:{}", indexId, column, column.getUniqueId()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 999e0c43995f00a..b0d27ac7b5c7d99 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -139,8 +139,6 @@ public enum OlapTableState { WAITING_STABLE } - public static long ROW_COUNT_BEFORE_REPORT = -1; - @SerializedName(value = "tst", alternate = {"state"}) private volatile OlapTableState state; @@ -1618,12 +1616,12 @@ public long 
getRowCountForIndex(long indexId, boolean strict) { if (index == null) { LOG.warn("Index {} not exist in partition {}, table {}, {}", indexId, entry.getValue().getName(), id, name); - return ROW_COUNT_BEFORE_REPORT; + return UNKNOWN_ROW_COUNT; } if (strict && !index.getRowCountReported()) { - return ROW_COUNT_BEFORE_REPORT; + return UNKNOWN_ROW_COUNT; } - rowCount += index.getRowCount() == -1 ? 0 : index.getRowCount(); + rowCount += index.getRowCount() == UNKNOWN_ROW_COUNT ? 0 : index.getRowCount(); } return rowCount; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index d98bba5edaca841..d85d98a8ea550f5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -623,7 +623,7 @@ public List getChunkSizes() { @Override public long fetchRowCount() { - return 0; + return UNKNOWN_ROW_COUNT; } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index 3a688a7b59d17ad..8f6e924f44a54d9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -55,6 +55,8 @@ public interface TableIf { Logger LOG = LogManager.getLogger(TableIf.class); + long UNKNOWN_ROW_COUNT = -1; + default void readLock() { } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java index b35a3b9e911416e..0b83baa94d6d4a2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java @@ -429,7 +429,7 @@ private Map> warmUpNewClusterByCluster(String dstClusterName, for (Backend backend : backends) { Set beTabletIds = ((CloudEnv) Env.getCurrentEnv()) .getCloudTabletRebalancer() - .getSnapshotTabletsByBeId(backend.getId()); + .getSnapshotTabletsInPrimaryByBeId(backend.getId()); List warmUpTablets = new ArrayList<>(); for (Tablet tablet : tablets) { if (beTabletIds.contains(tablet.getId())) { @@ -559,7 +559,7 @@ private Map> warmUpNewClusterByTable(long jobId, String dstCl for (Backend backend : backends) { Set beTabletIds = ((CloudEnv) Env.getCurrentEnv()) .getCloudTabletRebalancer() - .getSnapshotTabletsByBeId(backend.getId()); + .getSnapshotTabletsInPrimaryByBeId(backend.getId()); List warmUpTablets = new ArrayList<>(); for (Tablet tablet : tablets) { if (beTabletIds.contains(tablet.getId())) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java index 78947afdb11e39f..8e5033470b0e90a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java @@ -73,6 +73,10 @@ public class CloudTabletRebalancer extends MasterDaemon { private volatile ConcurrentHashMap> beToColocateTabletsGlobal = new ConcurrentHashMap>(); + // used for cloud tablet report + private volatile ConcurrentHashMap> beToTabletsGlobalInSecondary = + new ConcurrentHashMap>(); + private Map> futureBeToTabletsGlobal; private Map> clusterToBes; @@ -164,7 +168,7 @@ private class TransferPairInfo { public boolean srcDecommissioned; } - public Set 
getSnapshotTabletsByBeId(Long beId) { + public Set getSnapshotTabletsInPrimaryByBeId(Long beId) { Set tabletIds = Sets.newHashSet(); List tablets = beToTabletsGlobal.get(beId); if (tablets != null) { @@ -183,6 +187,24 @@ public Set getSnapshotTabletsByBeId(Long beId) { return tabletIds; } + public Set getSnapshotTabletsInSecondaryByBeId(Long beId) { + Set tabletIds = Sets.newHashSet(); + List tablets = beToTabletsGlobalInSecondary.get(beId); + if (tablets != null) { + for (Tablet tablet : tablets) { + tabletIds.add(tablet.getId()); + } + } + return tabletIds; + } + + public Set getSnapshotTabletsInPrimaryAndSecondaryByBeId(Long beId) { + Set tabletIds = Sets.newHashSet(); + tabletIds.addAll(getSnapshotTabletsInPrimaryByBeId(beId)); + tabletIds.addAll(getSnapshotTabletsInSecondaryByBeId(beId)); + return tabletIds; + } + public int getTabletNumByBackendId(long beId) { List tablets = beToTabletsGlobal.get(beId); List colocateTablets = beToColocateTabletsGlobal.get(beId); @@ -617,6 +639,8 @@ public void fillBeToTablets(long be, long tableId, long partId, long indexId, Ta public void statRouteInfo() { ConcurrentHashMap> tmpBeToTabletsGlobal = new ConcurrentHashMap>(); + ConcurrentHashMap> tmpBeToTabletsGlobalInSecondary + = new ConcurrentHashMap>(); ConcurrentHashMap> tmpBeToColocateTabletsGlobal = new ConcurrentHashMap>(); @@ -641,11 +665,8 @@ public void statRouteInfo() { continue; } if (allBes.contains(beId)) { - List colocateTablets = tmpBeToColocateTabletsGlobal.get(beId); - if (colocateTablets == null) { - colocateTablets = new ArrayList(); - tmpBeToColocateTabletsGlobal.put(beId, colocateTablets); - } + List colocateTablets = + tmpBeToColocateTabletsGlobal.computeIfAbsent(beId, k -> new ArrayList<>()); colocateTablets.add(tablet); } continue; @@ -657,6 +678,14 @@ public void statRouteInfo() { continue; } + Backend secondaryBe = replica.getSecondaryBackend(cluster); + long secondaryBeId = secondaryBe == null ? -1L : secondaryBe.getId(); + if (allBes.contains(secondaryBeId)) { + List tablets = tmpBeToTabletsGlobalInSecondary + .computeIfAbsent(secondaryBeId, k -> new ArrayList<>()); + tablets.add(tablet); + } + InfightTablet taskKey = new InfightTablet(tablet.getId(), cluster); InfightTask task = tabletToInfightTask.get(taskKey); long futureBeId = task == null ? beId : task.destBe; @@ -670,6 +699,7 @@ public void statRouteInfo() { }); beToTabletsGlobal = tmpBeToTabletsGlobal; + beToTabletsGlobalInSecondary = tmpBeToTabletsGlobalInSecondary; beToColocateTabletsGlobal = tmpBeToColocateTabletsGlobal; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/master/CloudReportHandler.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/master/CloudReportHandler.java new file mode 100644 index 000000000000000..6564bd7d3a51a4e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/master/CloudReportHandler.java @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud.master; + +import org.apache.doris.catalog.Env; +import org.apache.doris.cloud.catalog.CloudEnv; +import org.apache.doris.master.ReportHandler; +import org.apache.doris.system.Backend; +import org.apache.doris.task.AgentBatchTask; +import org.apache.doris.task.AgentTaskExecutor; +import org.apache.doris.task.DropReplicaTask; +import org.apache.doris.thrift.TTablet; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +public class CloudReportHandler extends ReportHandler { + private static final Logger LOG = LogManager.getLogger(CloudReportHandler.class); + + @Override + public void tabletReport(long backendId, Map backendTablets, + Map backendPartitionsVersion, long backendReportVersion, long numTablets) { + long start = System.currentTimeMillis(); + LOG.info("backend[{}] have {} tablet(s), {} need deal tablet(s). report version: {}", + backendId, numTablets, backendTablets.size(), backendReportVersion); + // current be useful + Set tabletIdsInFe = ((CloudEnv) Env.getCurrentEnv()).getCloudTabletRebalancer() + .getSnapshotTabletsInPrimaryAndSecondaryByBeId(backendId); + + Set tabletIdsInBe = backendTablets.keySet(); + // handle (be - meta) + Set tabletIdsNeedDrop = diffTablets(tabletIdsInFe, tabletIdsInBe); + // drop agent task + deleteFromBackend(backendId, tabletIdsNeedDrop); + + Backend be = Env.getCurrentSystemInfo().getBackend(backendId); + LOG.info("finished to handle task report from backend {}-{}, " + + "diff task num: {}, cost: {} ms.", + backendId, be != null ? 
be.getHost() : "", + tabletIdsNeedDrop.size(), + (System.currentTimeMillis() - start)); + } + + // tabletIdsInFe, tablet is used in Primary or Secondary + // tabletIdsInBe, tablet report exceed time, need to check + // returns tabletIds need to drop + private Set diffTablets(Set tabletIdsInFe, Set tabletIdsInBe) { + // tabletsInBe - tabletsInFe + Set result = new HashSet<>(tabletIdsInBe); + result.removeAll(tabletIdsInFe); + return result; + } + + private static void deleteFromBackend(long backendId, Set tabletIdsWillDrop) { + int deleteFromBackendCounter = 0; + AgentBatchTask batchTask = new AgentBatchTask(); + for (Long tabletId : tabletIdsWillDrop) { + DropReplicaTask task = new DropReplicaTask(backendId, tabletId, -1, -1, false); + batchTask.addTask(task); + LOG.info("delete tablet[{}] from backend[{}]", tabletId, backendId); + ++deleteFromBackendCounter; + } + + if (batchTask.getTaskNum() != 0) { + AgentTaskExecutor.submit(batchTask); + } + + LOG.info("delete {} tablet(s) from backend[{}]", deleteFromBackendCounter, backendId); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceClient.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceClient.java index c4d28fb3bc256c4..904f3ec15d9c6b8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceClient.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceClient.java @@ -345,6 +345,17 @@ public Cloud.GetDeleteBitmapUpdateLockResponse getDeleteBitmapUpdateLock( return blockingStub.getDeleteBitmapUpdateLock(request); } + public Cloud.RemoveDeleteBitmapUpdateLockResponse removeDeleteBitmapUpdateLock( + Cloud.RemoveDeleteBitmapUpdateLockRequest request) { + if (!request.hasCloudUniqueId()) { + Cloud.RemoveDeleteBitmapUpdateLockRequest.Builder builder = Cloud.RemoveDeleteBitmapUpdateLockRequest + .newBuilder(); + builder.mergeFrom(request); + return blockingStub.removeDeleteBitmapUpdateLock(builder.setCloudUniqueId(Config.cloud_unique_id).build()); + } + return blockingStub.removeDeleteBitmapUpdateLock(request); + } + public Cloud.GetInstanceResponse getInstance(Cloud.GetInstanceRequest request) { if (!request.hasCloudUniqueId()) { Cloud.GetInstanceRequest.Builder builder = Cloud.GetInstanceRequest.newBuilder(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java index 6ed0eb81b781fb8..d7f718e3ca46bfe 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/rpc/MetaServiceProxy.java @@ -335,6 +335,12 @@ public Cloud.GetDeleteBitmapUpdateLockResponse getDeleteBitmapUpdateLock( return w.executeRequest((client) -> client.getDeleteBitmapUpdateLock(request)); } + public Cloud.RemoveDeleteBitmapUpdateLockResponse removeDeleteBitmapUpdateLock( + Cloud.RemoveDeleteBitmapUpdateLockRequest request) + throws RpcException { + return w.executeRequest((client) -> client.removeDeleteBitmapUpdateLock(request)); + } + public Cloud.AlterObjStoreInfoResponse alterObjStoreInfo(Cloud.AlterObjStoreInfoRequest request) throws RpcException { return w.executeRequest((client) -> client.alterObjStoreInfo(request)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java index 13d5afe5dbbebe8..5bdb2a44c111af8 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java @@ -56,6 +56,8 @@ import org.apache.doris.cloud.proto.Cloud.MetaServiceCode; import org.apache.doris.cloud.proto.Cloud.PrecommitTxnRequest; import org.apache.doris.cloud.proto.Cloud.PrecommitTxnResponse; +import org.apache.doris.cloud.proto.Cloud.RemoveDeleteBitmapUpdateLockRequest; +import org.apache.doris.cloud.proto.Cloud.RemoveDeleteBitmapUpdateLockResponse; import org.apache.doris.cloud.proto.Cloud.SubTxnInfo; import org.apache.doris.cloud.proto.Cloud.TableStatsPB; import org.apache.doris.cloud.proto.Cloud.TabletIndexPB; @@ -648,7 +650,13 @@ private void calcDeleteBitmapForMow(long dbId, List tableList, long t Map> backendToPartitionInfos = getCalcDeleteBitmapInfo( backendToPartitionTablets, partitionVersions, baseCompactionCnts, cumulativeCompactionCnts, cumulativePoints); - sendCalcDeleteBitmaptask(dbId, transactionId, backendToPartitionInfos); + try { + sendCalcDeleteBitmaptask(dbId, transactionId, backendToPartitionInfos); + } catch (UserException e) { + LOG.warn("failed to sendCalcDeleteBitmaptask for txn=" + transactionId + ",exception=" + e.getMessage()); + removeDeleteBitmapUpdateLock(tableToPartitions, transactionId); + throw e; + } } private void getPartitionInfo(List tableList, @@ -869,6 +877,33 @@ private void getDeleteBitmapUpdateLock(Map> tableToParttions, lo } } + private void removeDeleteBitmapUpdateLock(Map> tableToParttions, long transactionId) { + for (Map.Entry> entry : tableToParttions.entrySet()) { + RemoveDeleteBitmapUpdateLockRequest.Builder builder = RemoveDeleteBitmapUpdateLockRequest.newBuilder(); + builder.setTableId(entry.getKey()) + .setLockId(transactionId) + .setInitiator(-1); + final RemoveDeleteBitmapUpdateLockRequest request = builder.build(); + RemoveDeleteBitmapUpdateLockResponse response = null; + try { + response = MetaServiceProxy.getInstance().removeDeleteBitmapUpdateLock(request); + if (LOG.isDebugEnabled()) { + LOG.debug("remove delete bitmap lock, transactionId={}, Request: {}, Response: {}", + transactionId, request, response); + } + Preconditions.checkNotNull(response); + Preconditions.checkNotNull(response.getStatus()); + if (response.getStatus().getCode() != MetaServiceCode.OK) { + LOG.warn("remove delete bitmap lock failed, transactionId={}, response:{}", + transactionId, response); + } + } catch (Exception e) { + LOG.warn("ignore get delete bitmap lock exception, transactionId={}, exception={}", + transactionId, e); + } + } + } + private void sendCalcDeleteBitmaptask(long dbId, long transactionId, Map> backendToPartitionInfos) throws UserException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java index 267e20a1f959bca..4604e4deabb2b75 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java @@ -131,7 +131,19 @@ private LocationPath(String originLocation, Map props, boolean c tmpLocation = convertPath ? 
convertToS3(tmpLocation) : tmpLocation; break; case FeConstants.FS_PREFIX_OSS: - if (isHdfsOnOssEndpoint(tmpLocation)) { + String endpoint = ""; + if (props.containsKey(OssProperties.ENDPOINT)) { + endpoint = props.get(OssProperties.ENDPOINT); + if (endpoint.startsWith(OssProperties.OSS_PREFIX)) { + // may use oss.oss-cn-beijing.aliyuncs.com + endpoint = endpoint.replace(OssProperties.OSS_PREFIX, ""); + } + } else if (props.containsKey(S3Properties.ENDPOINT)) { + endpoint = props.get(S3Properties.ENDPOINT); + } else if (props.containsKey(S3Properties.Env.ENDPOINT)) { + endpoint = props.get(S3Properties.Env.ENDPOINT); + } + if (isHdfsOnOssEndpoint(endpoint)) { this.scheme = Scheme.OSS_HDFS; } else { if (useS3EndPoint(props)) { @@ -398,7 +410,7 @@ private static String normalizedLakefsPath(String location) { } } - private FileSystemType getFileSystemType() { + public FileSystemType getFileSystemType() { FileSystemType fsType; switch (scheme) { case S3: diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalRowCountCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalRowCountCache.java index 075091e682d7224..fc955c4964a30c0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalRowCountCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalRowCountCache.java @@ -102,7 +102,7 @@ protected Optional doLoad(RowCountKey rowCountKey) { } /** - * Get cached row count for the given table. Return 0 if cached not loaded or table not exists. + * Get cached row count for the given table. Return -1 if cached not loaded or table not exists. * Cached will be loaded async. * @return Cached row count or -1 if not exist */ @@ -111,13 +111,13 @@ public long getCachedRowCount(long catalogId, long dbId, long tableId) { try { CompletableFuture> f = rowCountCache.get(key); if (f.isDone()) { - return f.get().orElse(-1L); + return f.get().orElse(TableIf.UNKNOWN_ROW_COUNT); } LOG.info("Row count for table {}.{}.{} is still processing.", catalogId, dbId, tableId); } catch (Exception e) { LOG.warn("Unexpected exception while returning row count", e); } - return -1; + return TableIf.UNKNOWN_ROW_COUNT; } /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalScanNode.java index e85fed8b62a8797..0d67a9e44b6f29e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalScanNode.java @@ -46,7 +46,8 @@ public abstract class ExternalScanNode extends ScanNode { protected boolean needCheckColumnPriv; protected final FederationBackendPolicy backendPolicy = (ConnectContext.get() != null - && ConnectContext.get().getSessionVariable().enableFileCache) + && (ConnectContext.get().getSessionVariable().enableFileCache + || ConnectContext.get().getSessionVariable().getUseConsistentHashForExternalScan())) ? 
new FederationBackendPolicy(NodeSelectionStrategy.CONSISTENT_HASHING) : new FederationBackendPolicy(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalTable.java index 5c57c13b4b85ad9..1eadb46fe82eedd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalTable.java @@ -195,7 +195,7 @@ public long getRowCount() { makeSureInitialized(); } catch (Exception e) { LOG.warn("Failed to initialize table {}.{}.{}", catalog.getName(), dbName, name, e); - return -1; + return TableIf.UNKNOWN_ROW_COUNT; } // All external table should get external row count from cache. return Env.getCurrentEnv().getExtMetaCacheMgr().getRowCountCache().getCachedRowCount(catalog.getId(), dbId, id); @@ -221,7 +221,7 @@ public long getCachedRowCount() { * This is called by ExternalRowCountCache to load row count cache. */ public long fetchRowCount() { - return -1; + return UNKNOWN_ROW_COUNT; } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java index a2b902fd744a7fe..1e1787c1f649478 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FederationBackendPolicy.java @@ -497,7 +497,7 @@ public void funnel(Backend backend, PrimitiveSink primitiveSink) { private static class SplitHash implements Funnel { @Override public void funnel(Split split, PrimitiveSink primitiveSink) { - primitiveSink.putBytes(split.getPathString().getBytes(StandardCharsets.UTF_8)); + primitiveSink.putBytes(split.getConsistentHashString().getBytes(StandardCharsets.UTF_8)); primitiveSink.putLong(split.getStart()); primitiveSink.putLong(split.getLength()); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplit.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplit.java index 7eaa87b74aab63b..1ebb390e90438f7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileSplit.java @@ -47,6 +47,9 @@ public class FileSplit implements Split { // the location type for BE, eg: HDFS, LOCAL, S3 protected TFileType locationType; + public Long selfSplitWeight; + public Long targetSplitSize; + public FileSplit(LocationPath path, long start, long length, long fileLength, long modificationTime, String[] hosts, List partitionValues) { this.path = path; @@ -89,4 +92,20 @@ public Split create(LocationPath path, long start, long length, long fileLength, return new FileSplit(path, start, length, fileLength, modificationTime, hosts, partitionValues); } } + + @Override + public void setTargetSplitSize(Long targetSplitSize) { + this.targetSplitSize = targetSplitSize; + } + + @Override + public SplitWeight getSplitWeight() { + if (selfSplitWeight != null && targetSplitSize != null) { + double computedWeight = selfSplitWeight * 1.0 / targetSplitSize; + // Clamp the value be between the minimum weight and 1.0 (standard weight) + return SplitWeight.fromProportion(Math.min(Math.max(computedWeight, 0.01), 1.0)); + } else { + return SplitWeight.standard(); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/TablePartitionValues.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/TablePartitionValues.java index 
d5e8a39e605a8b0..c7f2ce6f712d6b8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/TablePartitionValues.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/TablePartitionValues.java @@ -34,11 +34,7 @@ import com.google.common.collect.RangeMap; import lombok.Data; -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -80,11 +76,6 @@ public TablePartitionValues(List partitionNames, List> part addPartitions(partitionNames, partitionValues, types); } - public TablePartitionValues(List partitionNames, List types) { - this(); - addPartitions(partitionNames, types); - } - public void addPartitions(List partitionNames, List> partitionValues, List types) { Preconditions.checkState(partitionNames.size() == partitionValues.size()); List addPartitionNames = new ArrayList<>(); @@ -105,10 +96,6 @@ public void addPartitions(List partitionNames, List> partit addPartitionItems(addPartitionNames, addPartitionItems, types); } - public void addPartitions(List partitionNames, List types) { - addPartitions(partitionNames, - partitionNames.stream().map(this::getHivePartitionValues).collect(Collectors.toList()), types); - } private void addPartitionItems(List partitionNames, List partitionItems, List types) { Preconditions.checkState(partitionNames.size() == partitionItems.size()); @@ -196,23 +183,6 @@ private ListPartitionItem toListPartitionItem(List partitionValues, List } } - private List getHivePartitionValues(String partitionName) { - // Partition name will be in format: nation=cn/city=beijing - // parse it to get values "cn" and "beijing" - return Arrays.stream(partitionName.split("/")).map(part -> { - String[] kv = part.split("="); - Preconditions.checkState(kv.length == 2, partitionName); - String partitionValue; - try { - // hive partition value maybe contains special characters like '=' and '/' - partitionValue = URLDecoder.decode(kv[1], StandardCharsets.UTF_8.name()); - } catch (UnsupportedEncodingException e) { - // It should not be here - throw new RuntimeException(e); - } - return partitionValue; - }).collect(Collectors.toList()); - } @Data public static class TablePartitionKey { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java index f72421da8a11343..b48b47acf1378e5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java @@ -337,7 +337,7 @@ public long getCreateTime() { } private long getRowCountFromExternalSource() { - long rowCount = -1; + long rowCount = UNKNOWN_ROW_COUNT; switch (dlaType) { case HIVE: rowCount = StatisticsUtil.getHiveRowCount(this); @@ -350,7 +350,7 @@ private long getRowCountFromExternalSource() { LOG.debug("getRowCount for dlaType {} is not supported.", dlaType); } } - return rowCount; + return rowCount > 0 ? rowCount : UNKNOWN_ROW_COUNT; } @Override @@ -524,7 +524,7 @@ public long fetchRowCount() { // Get row count from hive metastore property. long rowCount = getRowCountFromExternalSource(); // Only hive table supports estimate row count by listing file. 
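// Illustrative sketch (not part of the patch): the row-count changes in this patch replace the
// mixed 0 / -1 return values with a single "unknown" sentinel. Assuming TableIf.UNKNOWN_ROW_COUNT
// is -1 (as the updated javadoc and log messages suggest), a fetcher can normalize any failed or
// non-positive estimate as below; the class and method names here are examples only.
public final class RowCountSentinelExample {
    static final long UNKNOWN_ROW_COUNT = -1L; // stand-in for TableIf.UNKNOWN_ROW_COUNT

    static long normalize(long estimatedRows) {
        // 0 is no longer reported as a real count; anything non-positive means "unknown".
        return estimatedRows > 0 ? estimatedRows : UNKNOWN_ROW_COUNT;
    }

    public static void main(String[] args) {
        System.out.println(normalize(0L));    // -1 (unknown, e.g. empty table or missing stats)
        System.out.println(normalize(1234L)); // 1234
    }
}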
- if (rowCount == -1 && dlaType.equals(DLAType.HIVE)) { + if (rowCount == UNKNOWN_ROW_COUNT && dlaType.equals(DLAType.HIVE)) { LOG.info("Will estimate row count for table {} from file list.", name); rowCount = getRowCountFromFileList(); } @@ -834,11 +834,11 @@ public boolean isPartitionColumnAllowNull() { */ private long getRowCountFromFileList() { if (!GlobalVariable.enable_get_row_count_from_file_list) { - return -1; + return UNKNOWN_ROW_COUNT; } if (isView()) { - LOG.info("Table {} is view, return 0.", name); - return 0; + LOG.info("Table {} is view, return -1.", name); + return UNKNOWN_ROW_COUNT; } HiveMetaStoreCache.HivePartitionValues partitionValues = getAllPartitionValues(); @@ -865,8 +865,8 @@ private long getRowCountFromFileList() { estimatedRowSize += column.getDataType().getSlotSize(); } if (estimatedRowSize == 0) { - LOG.warn("Table {} estimated size is 0, return 0.", name); - return 0; + LOG.warn("Table {} estimated size is 0, return -1.", name); + return UNKNOWN_ROW_COUNT; } int totalPartitionSize = partitionValues == null ? 1 : partitionValues.getIdToPartitionItem().size(); @@ -878,7 +878,7 @@ private long getRowCountFromFileList() { long rows = totalSize / estimatedRowSize; LOG.info("Table {} rows {}, total size is {}, estimatedRowSize is {}", name, rows, totalSize, estimatedRowSize); - return rows; + return rows > 0 ? rows : UNKNOWN_ROW_COUNT; } // Get all partition values from cache. diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java index 6183c277c1bdf53..02c99a695c8b5e5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java @@ -101,7 +101,7 @@ public class HMSTransaction implements Transaction { private final Executor fileSystemExecutor; private HmsCommitter hmsCommitter; private List hivePartitionUpdates = Lists.newArrayList(); - private String declaredIntentionsToWrite; + private Optional stagingDirectory; private boolean isMockedPartitionUpdate = false; private static class UncompletedMpuPendingUpload { @@ -184,10 +184,14 @@ public void rollback() { } public void beginInsertTable(HiveInsertCommandContext ctx) { - declaredIntentionsToWrite = ctx.getWritePath(); queryId = ctx.getQueryId(); isOverwrite = ctx.isOverwrite(); fileType = ctx.getFileType(); + if (fileType == TFileType.FILE_S3) { + stagingDirectory = Optional.empty(); + } else { + stagingDirectory = Optional.of(ctx.getWritePath()); + } } public void finishInsertTable(SimpleTableInfo tableInfo) { @@ -207,10 +211,12 @@ public void finishInsertTable(SimpleTableInfo tableInfo) { } }); } else { - fs.makeDir(declaredIntentionsToWrite); - setLocation(new THiveLocationParams() {{ - setWritePath(declaredIntentionsToWrite); - } + stagingDirectory.ifPresent((v) -> { + fs.makeDir(v); + setLocation(new THiveLocationParams() {{ + setWritePath(v); + } + }); }); } } @@ -643,15 +649,23 @@ private void recursiveDeleteItems(Path directory, boolean deleteEmptyDir, boolea if (!deleteResult.getNotDeletedEligibleItems().isEmpty()) { LOG.warn("Failed to delete directory {}. 
Some eligible items can't be deleted: {}.", directory.toString(), deleteResult.getNotDeletedEligibleItems()); + throw new RuntimeException( + "Failed to delete directory for files: " + deleteResult.getNotDeletedEligibleItems()); } else if (deleteEmptyDir && !deleteResult.dirNotExists()) { LOG.warn("Failed to delete directory {} due to dir isn't empty", directory.toString()); + throw new RuntimeException("Failed to delete directory for empty dir: " + directory.toString()); } } private DeleteRecursivelyResult recursiveDeleteFiles(Path directory, boolean deleteEmptyDir, boolean reverse) { try { - if (!fs.directoryExists(directory.toString()).ok()) { + Status status = fs.directoryExists(directory.toString()); + if (status.getErrCode().equals(Status.ErrCode.NOT_FOUND)) { return new DeleteRecursivelyResult(true, ImmutableList.of()); + } else if (!status.ok()) { + ImmutableList.Builder notDeletedEligibleItems = ImmutableList.builder(); + notDeletedEligibleItems.add(directory.toString() + "/*"); + return new DeleteRecursivelyResult(false, notDeletedEligibleItems.build()); } } catch (Exception e) { ImmutableList.Builder notDeletedEligibleItems = ImmutableList.builder(); @@ -1447,7 +1461,7 @@ private void doUpdateStatisticsTasks() { } private void pruneAndDeleteStagingDirectories() { - recursiveDeleteItems(new Path(declaredIntentionsToWrite), true, false); + stagingDirectory.ifPresent((v) -> recursiveDeleteItems(new Path(v), true, false)); } private void abortMultiUploads() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index fbfd7dd27986689..ea42dfa2f52a01d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -244,7 +244,7 @@ public Long getValue() { } private HivePartitionValues loadPartitionValues(PartitionValueCacheKey key) { - // partition name format: nation=cn/city=beijing + // partition name format: nation=cn/city=beijing,`listPartitionNames` returned string is the encoded string. List partitionNames = catalog.getClient().listPartitionNames(key.dbName, key.tblName); if (LOG.isDebugEnabled()) { LOG.debug("load #{} partitions for {} in catalog {}", partitionNames.size(), key, catalog.getName()); @@ -281,11 +281,10 @@ private HivePartitionValues loadPartitionValues(PartitionValueCacheKey key) { public ListPartitionItem toListPartitionItem(String partitionName, List types) { // Partition name will be in format: nation=cn/city=beijing // parse it to get values "cn" and "beijing" - String[] parts = partitionName.split("/"); - Preconditions.checkState(parts.length == types.size(), partitionName + " vs. " + types); + List partitionValues = HiveUtil.toPartitionValues(partitionName); + Preconditions.checkState(partitionValues.size() == types.size(), partitionName + " vs. " + types); List values = Lists.newArrayListWithExpectedSize(types.size()); - for (String part : parts) { - String partitionValue = HiveUtil.getHivePartitionValue(part); + for (String partitionValue : partitionValues) { values.add(new PartitionValue(partitionValue, HIVE_DEFAULT_PARTITION.equals(partitionValue))); } try { @@ -325,9 +324,9 @@ private Map loadPartitions(Iterable List(["c1","a"], ["c2","b"], ["c3","c"]) + // Similar to the `toPartitionValues` method, except that it adds the partition column name. 
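// Illustrative sketch (not part of the patch): the HiveUtil helpers in this hunk parse a Hive
// partition name such as "nation=cn/city=beijing" into its column/value parts and unescape each
// one. Hive escapes special characters like '=' and '/' as %xx sequences, so URLDecoder is used
// below as a simplified stand-in for org.apache.hadoop.hive.common.FileUtils.unescapePathName;
// the class name is an example only.
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public final class HivePartitionNameExample {
    static List<String[]> parse(String partitionName) {
        List<String[]> result = new ArrayList<>();
        for (String part : partitionName.split("/")) {
            String[] kv = part.split("=");
            if (kv.length != 2) {
                throw new IllegalStateException("Malformed partition name " + part);
            }
            result.add(new String[] {
                    URLDecoder.decode(kv[0], StandardCharsets.UTF_8),
                    URLDecoder.decode(kv[1], StandardCharsets.UTF_8)
            });
        }
        return result;
    }

    public static void main(String[] args) {
        for (String[] kv : parse("nation=cn/city=bei%2Fjing")) {
            System.out.println(kv[0] + " -> " + kv[1]); // nation -> cn, city -> bei/jing
        }
    }
}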
+ public static List toPartitionColNameAndValues(String partitionName) { + + String[] parts = partitionName.split("/"); + List result = new ArrayList<>(parts.length); + for (String part : parts) { + String[] kv = part.split("="); + Preconditions.checkState(kv.length == 2, String.format("Malformed partition name %s", part)); + + result.add(new String[] { + FileUtils.unescapePathName(kv[0]), + FileUtils.unescapePathName(kv[1]) + }); } + return result; } // "c1=a/c2=b/c3=c" ---> List("a","b","c") @@ -151,6 +154,8 @@ public static List toPartitionValues(String partitionName) { if (start > partitionName.length()) { break; } + //Ref: common/src/java/org/apache/hadoop/hive/common/FileUtils.java + //makePartName(List partCols, List vals,String defaultStr) resultBuilder.add(FileUtils.unescapePathName(partitionName.substring(start, end))); start = end + 1; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergExternalTable.java index d4361a47797a2e1..feded88ea326f03 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergExternalTable.java @@ -83,7 +83,8 @@ public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) { @Override public long fetchRowCount() { makeSureInitialized(); - return IcebergUtils.getIcebergRowCount(getCatalog(), getDbName(), getName()); + long rowCount = IcebergUtils.getIcebergRowCount(getCatalog(), getDbName(), getName()); + return rowCount > 0 ? rowCount : UNKNOWN_ROW_COUNT; } public Table getIcebergTable() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java index 7ae600756f17a5a..ba6d628e492c209 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java @@ -41,6 +41,7 @@ import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.StructField; import org.apache.doris.catalog.StructType; +import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.Type; import org.apache.doris.common.UserException; import org.apache.doris.common.info.SimpleTableInfo; @@ -604,9 +605,9 @@ public static long getIcebergRowCount(ExternalCatalog catalog, String dbName, St .getIcebergTable(catalog, dbName, tbName); Snapshot snapshot = icebergTable.currentSnapshot(); if (snapshot == null) { - LOG.info("Iceberg table {}.{}.{} is empty, return row count 0.", catalog.getName(), dbName, tbName); + LOG.info("Iceberg table {}.{}.{} is empty, return -1.", catalog.getName(), dbName, tbName); // empty table - return 0; + return TableIf.UNKNOWN_ROW_COUNT; } Map summary = snapshot.summary(); long rows = Long.parseLong(summary.get(TOTAL_RECORDS)) - Long.parseLong(summary.get(TOTAL_POSITION_DELETES)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergDeleteFileFilter.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergDeleteFileFilter.java index 394bc849a56a492..b876732ff3f4e3e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergDeleteFileFilter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergDeleteFileFilter.java @@ -25,23 +25,25 @@ @Data public class IcebergDeleteFileFilter 
{ private String deleteFilePath; + private long filesize; - public IcebergDeleteFileFilter(String deleteFilePath) { + public IcebergDeleteFileFilter(String deleteFilePath, long filesize) { this.deleteFilePath = deleteFilePath; + this.filesize = filesize; } public static PositionDelete createPositionDelete(String deleteFilePath, Long positionLowerBound, - Long positionUpperBound) { - return new PositionDelete(deleteFilePath, positionLowerBound, positionUpperBound); + Long positionUpperBound, long filesize) { + return new PositionDelete(deleteFilePath, positionLowerBound, positionUpperBound, filesize); } - public static EqualityDelete createEqualityDelete(String deleteFilePath, List fieldIds) { + public static EqualityDelete createEqualityDelete(String deleteFilePath, List fieldIds, long fileSize) { // todo: // Schema deleteSchema = TypeUtil.select(scan.schema(), new HashSet<>(fieldIds)); // StructLikeSet deleteSet = StructLikeSet.create(deleteSchema.asStruct()); // pass deleteSet to BE // compare two StructLike value, if equals, filtered - return new EqualityDelete(deleteFilePath, fieldIds); + return new EqualityDelete(deleteFilePath, fieldIds, fileSize); } static class PositionDelete extends IcebergDeleteFileFilter { @@ -49,8 +51,8 @@ static class PositionDelete extends IcebergDeleteFileFilter { private final Long positionUpperBound; public PositionDelete(String deleteFilePath, Long positionLowerBound, - Long positionUpperBound) { - super(deleteFilePath); + Long positionUpperBound, long fileSize) { + super(deleteFilePath, fileSize); this.positionLowerBound = positionLowerBound; this.positionUpperBound = positionUpperBound; } @@ -67,8 +69,8 @@ public OptionalLong getPositionUpperBound() { static class EqualityDelete extends IcebergDeleteFileFilter { private List fieldIds; - public EqualityDelete(String deleteFilePath, List fieldIds) { - super(deleteFilePath); + public EqualityDelete(String deleteFilePath, List fieldIds, long fileSize) { + super(deleteFilePath, fileSize); this.fieldIds = fieldIds; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index fe6c54cf53b9760..56dda7b4fe28b2d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -282,7 +282,7 @@ private List doGetSplits() throws UserException { } selectedPartitionNum = partitionPathSet.size(); - + splits.forEach(s -> s.setTargetSplitSize(fileSplitSize)); return splits; } @@ -315,10 +315,11 @@ private List getDeleteFileFilters(FileScanTask spitTask .map(m -> m.get(MetadataColumns.DELETE_FILE_POS.fieldId())) .map(bytes -> Conversions.fromByteBuffer(MetadataColumns.DELETE_FILE_POS.type(), bytes)); filters.add(IcebergDeleteFileFilter.createPositionDelete(delete.path().toString(), - positionLowerBound.orElse(-1L), positionUpperBound.orElse(-1L))); + positionLowerBound.orElse(-1L), positionUpperBound.orElse(-1L), + delete.fileSizeInBytes())); } else if (delete.content() == FileContent.EQUALITY_DELETES) { filters.add(IcebergDeleteFileFilter.createEqualityDelete( - delete.path().toString(), delete.equalityFieldIds())); + delete.path().toString(), delete.equalityFieldIds(), delete.fileSizeInBytes())); } else { throw new IllegalStateException("Unknown delete content: " + delete.content()); } diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergSplit.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergSplit.java index 46e8f96ba35daf6..580d3cf1bb23f38 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergSplit.java @@ -47,6 +47,7 @@ public IcebergSplit(LocationPath file, long start, long length, long fileLength, this.formatVersion = formatVersion; this.config = config; this.originalPath = originalPath; + this.selfSplitWeight = length; } public long getRowCount() { @@ -56,4 +57,9 @@ public long getRowCount() { public void setRowCount(long rowCount) { this.rowCount = rowCount; } + + public void setDeleteFileFilters(List deleteFileFilters) { + this.deleteFileFilters = deleteFileFilters; + this.selfSplitWeight += deleteFileFilters.stream().mapToLong(IcebergDeleteFileFilter::getFilesize).sum(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalCatalog.java index e7e7634cff0207a..fb26265d19fe93c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalCatalog.java @@ -93,8 +93,6 @@ public void checkProperties() throws DdlException { JdbcResource.checkBooleanProperty(JdbcResource.CONNECTION_POOL_KEEP_ALIVE, String.valueOf(isConnectionPoolKeepAlive())); JdbcResource.checkBooleanProperty(JdbcResource.TEST_CONNECTION, String.valueOf(isTestConnection())); - JdbcResource.checkBooleanProperty(JdbcResource.ENABLE_CONNECTION_POOL, - String.valueOf(isEnableConnectionPool())); JdbcResource.checkDatabaseListProperties(getOnlySpecifiedDatabase(), getIncludeDatabaseMap(), getExcludeDatabaseMap()); JdbcResource.checkConnectionPoolProperties(getConnectionPoolMinSize(), getConnectionPoolMaxSize(), @@ -114,27 +112,6 @@ public void setDefaultPropsIfMissing(boolean isReplay) { throw new IllegalArgumentException("Jdbc catalog property lower_case_table_names is not supported," + " please use lower_case_meta_names instead."); } - if (catalogProperty.getOrDefault(JdbcResource.ENABLE_CONNECTION_POOL, "").isEmpty()) { - // If not setting enable_connection_pool in replay logic, - // set default value true to be compatible with older version. - catalogProperty.addProperty(JdbcResource.ENABLE_CONNECTION_POOL, - isReplay ? 
"true" : String.valueOf(JdbcResource - .getDefaultPropertyValue(JdbcResource.ENABLE_CONNECTION_POOL))); - } - } - - @Override - public void tryModifyCatalogProps(Map props) { - // It is forbidden to modify the enable_connection_pool attribute and driver_url attribute of jdbc catalog - if (props.containsKey(JdbcResource.ENABLE_CONNECTION_POOL)) { - throw new IllegalArgumentException("Can not modify enable_connection_pool property of jdbc catalog," - + "please re-create the catalog"); - } - if (props.containsKey(JdbcResource.DRIVER_URL)) { - throw new IllegalArgumentException("Can not modify driver_url property of jdbc catalog" - + "please re-create the catalog"); - } - super.tryModifyCatalogProps(props); } @Override @@ -245,11 +222,6 @@ public boolean isTestConnection() { .getDefaultPropertyValue(JdbcResource.TEST_CONNECTION))); } - public boolean isEnableConnectionPool() { - return Boolean.parseBoolean(catalogProperty.getOrDefault(JdbcResource.ENABLE_CONNECTION_POOL, JdbcResource - .getDefaultPropertyValue(JdbcResource.ENABLE_CONNECTION_POOL))); - } - @Override protected void initLocalObjectsImpl() { JdbcClientConfig jdbcClientConfig = new JdbcClientConfig() @@ -268,8 +240,7 @@ protected void initLocalObjectsImpl() { .setConnectionPoolMaxSize(getConnectionPoolMaxSize()) .setConnectionPoolMaxLifeTime(getConnectionPoolMaxLifeTime()) .setConnectionPoolMaxWaitTime(getConnectionPoolMaxWaitTime()) - .setConnectionPoolKeepAlive(isConnectionPoolKeepAlive()) - .setEnableConnectionPool(isEnableConnectionPool()); + .setConnectionPoolKeepAlive(isConnectionPoolKeepAlive()); jdbcClient = JdbcClient.createJdbcClient(jdbcClientConfig); } @@ -349,7 +320,6 @@ public void configureJdbcTable(JdbcTable jdbcTable, String tableName) { jdbcTable.setConnectionPoolMaxLifeTime(this.getConnectionPoolMaxLifeTime()); jdbcTable.setConnectionPoolMaxWaitTime(this.getConnectionPoolMaxWaitTime()); jdbcTable.setConnectionPoolKeepAlive(this.isConnectionPoolKeepAlive()); - jdbcTable.setEnableConnectionPool(this.isEnableConnectionPool()); } private void testJdbcConnection() throws DdlException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalTable.java index d60006af7090143..9e188a711b02975 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/JdbcExternalTable.java @@ -47,8 +47,24 @@ public class JdbcExternalTable extends ExternalTable { private static final Logger LOG = LogManager.getLogger(JdbcExternalTable.class); - public static final String MYSQL_ROW_COUNT_SQL = "SELECT * FROM QUERY" - + "(\"catalog\"=\"${ctlName}\", \"query\"=\"show table status from `${dbName}` like '${tblName}'\");"; + public static final String MYSQL_ROW_COUNT_SQL = "SELECT max(row_count) as rows FROM (" + + "(SELECT TABLE_ROWS AS row_count FROM INFORMATION_SCHEMA.TABLES " + + "WHERE TABLE_SCHEMA = '${dbName}' AND TABLE_NAME = '${tblName}' " + + "AND TABLE_TYPE = 'BASE TABLE') " + + "UNION ALL " + + "(SELECT CARDINALITY AS row_count FROM INFORMATION_SCHEMA.STATISTICS " + + "WHERE TABLE_SCHEMA = '${dbName}' AND TABLE_NAME = '${tblName}' " + + "AND CARDINALITY IS NOT NULL)) t"; + + public static final String PG_ROW_COUNT_SQL = "SELECT reltuples as rows FROM pg_class " + + "WHERE relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = '${dbName}') " + + "AND relname = '${tblName}'"; + + public static final String 
SQLSERVER_ROW_COUNT_SQL = "SELECT sum(rows) as rows FROM sys.partitions " + + "WHERE object_id = (SELECT object_id('${dbName}.${tblName}')) AND index_id IN (0, 1)"; + + public static final String FETCH_ROW_COUNT_TEMPLATE = "SELECT * FROM QUERY" + + "(\"catalog\"=\"${ctlName}\", \"query\"=\"${sql}\");"; private JdbcTable jdbcTable; @@ -119,41 +135,55 @@ public long fetchRowCount() { params.put("tblName", name); switch (((JdbcExternalCatalog) catalog).getDatabaseTypeName()) { case JdbcResource.MYSQL: - try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(false)) { - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(MYSQL_ROW_COUNT_SQL); - StmtExecutor stmtExecutor = new StmtExecutor(r.connectContext, sql); - List resultRows = stmtExecutor.executeInternalQuery(); - if (resultRows == null || resultRows.size() != 1) { - LOG.info("No mysql status found for table {}.{}.{}", catalog.getName(), dbName, name); - return -1; - } - StatementBase parsedStmt = stmtExecutor.getParsedStmt(); - if (parsedStmt == null || parsedStmt.getColLabels() == null) { - LOG.info("No column label found for table {}.{}.{}", catalog.getName(), dbName, name); - return -1; - } - ResultRow resultRow = resultRows.get(0); - List colLabels = parsedStmt.getColLabels(); - int index = colLabels.indexOf("TABLE_ROWS"); - if (index == -1) { - LOG.info("No TABLE_ROWS in status for table {}.{}.{}", catalog.getName(), dbName, name); - return -1; - } - long rows = Long.parseLong(resultRow.get(index)); - LOG.info("Get mysql table {}.{}.{} row count {}", catalog.getName(), dbName, name, rows); - return rows; - } catch (Exception e) { - LOG.warn("Failed to fetch mysql row count for table {}.{}.{}. Reason [{}]", - catalog.getName(), dbName, name, e.getMessage()); - return -1; - } - case JdbcResource.ORACLE: + params.put("sql", MYSQL_ROW_COUNT_SQL); + return getRowCount(params); case JdbcResource.POSTGRESQL: + params.put("sql", PG_ROW_COUNT_SQL); + return getRowCount(params); case JdbcResource.SQLSERVER: + params.put("sql", SQLSERVER_ROW_COUNT_SQL); + return getRowCount(params); + case JdbcResource.ORACLE: default: break; } - return -1; + return UNKNOWN_ROW_COUNT; + } + + protected long getRowCount(Map params) { + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(false)) { + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + String sql = stringSubstitutor.replace(FETCH_ROW_COUNT_TEMPLATE); + StmtExecutor stmtExecutor = new StmtExecutor(r.connectContext, sql); + List resultRows = stmtExecutor.executeInternalQuery(); + if (resultRows == null || resultRows.size() != 1) { + LOG.info("No status found for table {}.{}.{}", catalog.getName(), dbName, name); + return UNKNOWN_ROW_COUNT; + } + StatementBase parsedStmt = stmtExecutor.getParsedStmt(); + if (parsedStmt == null || parsedStmt.getColLabels() == null) { + LOG.info("No column label found for table {}.{}.{}", catalog.getName(), dbName, name); + return UNKNOWN_ROW_COUNT; + } + ResultRow resultRow = resultRows.get(0); + List colLabels = parsedStmt.getColLabels(); + int index = colLabels.indexOf("rows"); + if (index == -1) { + LOG.info("No TABLE_ROWS in status for table {}.{}.{}", catalog.getName(), dbName, name); + return UNKNOWN_ROW_COUNT; + } + long rows = Long.parseLong(resultRow.get(index)); + if (rows <= 0) { + LOG.info("Table {}.{}.{} row count is {}, discard it and use -1 instead", + catalog.getName(), dbName, name, rows); + return UNKNOWN_ROW_COUNT; + } + LOG.info("Get table 
{}.{}.{} row count {}", catalog.getName(), dbName, name, rows); + return rows; + } catch (Exception e) { + LOG.warn("Failed to fetch row count for table {}.{}.{}. Reason [{}]", + catalog.getName(), dbName, name, e.getMessage()); + return UNKNOWN_ROW_COUNT; + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClient.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClient.java index e863a42c122a9be..458142ff518fe67 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClient.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClient.java @@ -39,7 +39,6 @@ import java.net.URLClassLoader; import java.sql.Connection; import java.sql.DatabaseMetaData; -import java.sql.Driver; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.ResultSetMetaData; @@ -49,7 +48,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Properties; import java.util.Set; import java.util.function.Consumer; @@ -62,11 +60,7 @@ public abstract class JdbcClient { private String catalogName; protected String dbType; protected String jdbcUser; - protected String jdbcUrl; - protected String jdbcPassword; - protected String jdbcDriverClass; protected ClassLoader classLoader = null; - protected boolean enableConnectionPool; protected HikariDataSource dataSource = null; protected boolean isOnlySpecifiedDatabase; protected boolean isLowerCaseMetaNames; @@ -109,9 +103,6 @@ protected JdbcClient(JdbcClientConfig jdbcClientConfig) { System.setProperty("com.zaxxer.hikari.useWeakReferences", "true"); this.catalogName = jdbcClientConfig.getCatalog(); this.jdbcUser = jdbcClientConfig.getUser(); - this.jdbcPassword = jdbcClientConfig.getPassword(); - this.jdbcUrl = jdbcClientConfig.getJdbcUrl(); - this.jdbcDriverClass = jdbcClientConfig.getDriverClass(); this.isOnlySpecifiedDatabase = Boolean.parseBoolean(jdbcClientConfig.getOnlySpecifiedDatabase()); this.isLowerCaseMetaNames = Boolean.parseBoolean(jdbcClientConfig.getIsLowerCaseMetaNames()); this.metaNamesMapping = jdbcClientConfig.getMetaNamesMapping(); @@ -119,12 +110,10 @@ protected JdbcClient(JdbcClientConfig jdbcClientConfig) { Optional.ofNullable(jdbcClientConfig.getIncludeDatabaseMap()).orElse(Collections.emptyMap()); this.excludeDatabaseMap = Optional.ofNullable(jdbcClientConfig.getExcludeDatabaseMap()).orElse(Collections.emptyMap()); - this.enableConnectionPool = jdbcClientConfig.isEnableConnectionPool(); + String jdbcUrl = jdbcClientConfig.getJdbcUrl(); this.dbType = parseDbType(jdbcUrl); initializeClassLoader(jdbcClientConfig); - if (enableConnectionPool) { - initializeDataSource(jdbcClientConfig); - } + initializeDataSource(jdbcClientConfig); this.jdbcLowerCaseMetaMatching = new JdbcIdentifierMapping(isLowerCaseMetaNames, metaNamesMapping, this); } @@ -148,6 +137,7 @@ private void initializeDataSource(JdbcClientConfig config) { dataSource.setConnectionTimeout(config.getConnectionPoolMaxWaitTime()); // default 5000 dataSource.setMaxLifetime(config.getConnectionPoolMaxLifeTime()); // default 30 min dataSource.setIdleTimeout(config.getConnectionPoolMaxLifeTime() / 2L); // default 15 min + dataSource.setConnectionTestQuery(getTestQuery()); LOG.info("JdbcClient set" + " ConnectionPoolMinSize = " + config.getConnectionPoolMinSize() + ", ConnectionPoolMaxSize = " + config.getConnectionPoolMaxSize() @@ -179,57 +169,15 @@ public static String parseDbType(String jdbcUrl) { } public void closeClient() { - 
if (enableConnectionPool && dataSource != null) { - dataSource.close(); - } + dataSource.close(); } public Connection getConnection() throws JdbcClientException { - if (enableConnectionPool) { - return getConnectionWithPool(); - } else { - return getConnectionWithoutPool(); - } - } - - private Connection getConnectionWithoutPool() throws JdbcClientException { - ClassLoader oldClassLoader = Thread.currentThread().getContextClassLoader(); - try { - Thread.currentThread().setContextClassLoader(this.classLoader); - - Class driverClass = Class.forName(jdbcDriverClass, true, this.classLoader); - Driver driverInstance = (Driver) driverClass.getDeclaredConstructor().newInstance(); - - Properties info = new Properties(); - info.put("user", jdbcUser); - info.put("password", jdbcPassword); - - Connection connection = driverInstance.connect(SecurityChecker.getInstance().getSafeJdbcUrl(jdbcUrl), info); - - if (connection == null) { - throw new SQLException("Failed to establish a connection. The JDBC driver returned null. " - + "Please check if the JDBC URL is correct: " - + jdbcUrl - + ". Ensure that the URL format and parameters are valid for the driver: " - + driverInstance.getClass().getName()); - } - - return connection; - } catch (Exception e) { - String errorMessage = String.format("Can not connect to jdbc due to error: %s, Catalog name: %s", - e.getMessage(), this.getCatalogName()); - throw new JdbcClientException(errorMessage, e); - } finally { - Thread.currentThread().setContextClassLoader(oldClassLoader); - } - } - - - private Connection getConnectionWithPool() throws JdbcClientException { ClassLoader oldClassLoader = Thread.currentThread().getContextClassLoader(); + Connection conn; try { Thread.currentThread().setContextClassLoader(this.classLoader); - return dataSource.getConnection(); + conn = dataSource.getConnection(); } catch (Exception e) { String errorMessage = String.format( "Catalog `%s` can not connect to jdbc due to error: %s", @@ -238,15 +186,22 @@ private Connection getConnectionWithPool() throws JdbcClientException { } finally { Thread.currentThread().setContextClassLoader(oldClassLoader); } + return conn; } - public void close(AutoCloseable... closeables) { - for (AutoCloseable closeable : closeables) { - if (closeable != null) { + public void close(Object... resources) { + for (Object resource : resources) { + if (resource != null) { try { - closeable.close(); - } catch (Exception e) { - throw new JdbcClientException("Can not close : ", e); + if (resource instanceof ResultSet) { + ((ResultSet) resource).close(); + } else if (resource instanceof Statement) { + ((Statement) resource).close(); + } else if (resource instanceof Connection) { + ((Connection) resource).close(); + } + } catch (SQLException e) { + LOG.warn("Failed to close resource: {}", e.getMessage(), e); } } } @@ -258,9 +213,10 @@ public void close(AutoCloseable... 
closeables) { * @param origStmt, the raw stmt string */ public void executeStmt(String origStmt) { - Connection conn = getConnection(); + Connection conn = null; Statement stmt = null; try { + conn = getConnection(); stmt = conn.createStatement(); int effectedRows = stmt.executeUpdate(origStmt); if (LOG.isDebugEnabled()) { @@ -280,10 +236,12 @@ public void executeStmt(String origStmt) { * @return List */ public List getColumnsFromQuery(String query) { - Connection conn = getConnection(); + Connection conn = null; + PreparedStatement pstmt = null; List columns = Lists.newArrayList(); try { - PreparedStatement pstmt = conn.prepareStatement(query); + conn = getConnection(); + pstmt = conn.prepareStatement(query); ResultSetMetaData metaData = pstmt.getMetaData(); if (metaData == null) { throw new JdbcClientException("Query not supported: Failed to get ResultSetMetaData from query: %s", @@ -298,12 +256,11 @@ public List getColumnsFromQuery(String query) { } catch (SQLException e) { throw new JdbcClientException("Failed to get columns from query: %s", e, query); } finally { - close(conn); + close(pstmt, conn); } return columns; } - /** * Get schema from ResultSetMetaData * @@ -326,10 +283,11 @@ public List getSchemaFromResultSetMetaData(ResultSetMetaData me * @return list of database names */ public List getDatabaseNameList() { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List remoteDatabaseNames = Lists.newArrayList(); try { + conn = getConnection(); if (isOnlySpecifiedDatabase && includeDatabaseMap.isEmpty() && excludeDatabaseMap.isEmpty()) { String currentDatabase = conn.getSchema(); remoteDatabaseNames.add(currentDatabase); @@ -388,12 +346,13 @@ public boolean isTableExist(String localDbName, String localTableName) { * get all columns of one table */ public List getJdbcColumnsInfo(String localDbName, String localTableName) { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List tableSchema = Lists.newArrayList(); String remoteDbName = getRemoteDatabaseName(localDbName); String remoteTableName = getRemoteTableName(localDbName, localTableName); try { + conn = getConnection(); DatabaseMetaData databaseMetaData = conn.getMetaData(); String catalogName = getCatalogName(conn); rs = getRemoteColumns(databaseMetaData, catalogName, remoteDbName, remoteTableName); @@ -435,7 +394,7 @@ public Map getRemoteColumnNames(String localDbName, String local return jdbcLowerCaseMetaMatching.getRemoteColumnNames(localDbName, localTableName); } - // protected methods,for subclass to override + // protected methods, for subclass to override protected String getCatalogName(Connection conn) throws SQLException { return conn.getCatalog(); } @@ -446,9 +405,10 @@ protected String[] getTableTypes() { protected void processTable(String remoteDbName, String remoteTableName, String[] tableTypes, Consumer resultSetConsumer) { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; try { + conn = getConnection(); DatabaseMetaData databaseMetaData = conn.getMetaData(); String catalogName = getCatalogName(conn); rs = databaseMetaData.getTables(catalogName, remoteDbName, remoteTableName, tableTypes); @@ -520,15 +480,21 @@ protected Type createDecimalOrStringType(int precision, int scale) { public void testConnection() { String testQuery = getTestQuery(); - try (Connection conn = getConnection(); - Statement stmt = conn.createStatement(); - ResultSet rs = stmt.executeQuery(testQuery)) { + Connection conn = null; + Statement stmt 
= null; + ResultSet rs = null; + try { + conn = getConnection(); + stmt = conn.createStatement(); + rs = stmt.executeQuery(testQuery); if (!rs.next()) { throw new JdbcClientException( "Failed to test connection in FE: query executed but returned no results."); } } catch (SQLException e) { throw new JdbcClientException("Failed to test connection in FE: " + e.getMessage(), e); + } finally { + close(rs, stmt, conn); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClientConfig.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClientConfig.java index f3ab9953e050af9..85f3bd8f256d8b9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClientConfig.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcClientConfig.java @@ -39,7 +39,6 @@ public class JdbcClientConfig implements Cloneable { private int connectionPoolMaxWaitTime; private int connectionPoolMaxLifeTime; private boolean connectionPoolKeepAlive; - private boolean enableConnectionPool; private Map includeDatabaseMap; private Map excludeDatabaseMap; @@ -59,8 +58,6 @@ public JdbcClientConfig() { JdbcResource.getDefaultPropertyValue(JdbcResource.CONNECTION_POOL_MAX_LIFE_TIME)); this.connectionPoolKeepAlive = Boolean.parseBoolean( JdbcResource.getDefaultPropertyValue(JdbcResource.CONNECTION_POOL_KEEP_ALIVE)); - this.enableConnectionPool = Boolean.parseBoolean( - JdbcResource.getDefaultPropertyValue(JdbcResource.ENABLE_CONNECTION_POOL)); this.includeDatabaseMap = Maps.newHashMap(); this.excludeDatabaseMap = Maps.newHashMap(); this.customizedProperties = Maps.newHashMap(); @@ -76,7 +73,6 @@ public JdbcClientConfig clone() { cloned.connectionPoolMaxLifeTime = connectionPoolMaxLifeTime; cloned.connectionPoolMaxWaitTime = connectionPoolMaxWaitTime; cloned.connectionPoolKeepAlive = connectionPoolKeepAlive; - cloned.enableConnectionPool = enableConnectionPool; cloned.includeDatabaseMap = Maps.newHashMap(includeDatabaseMap); cloned.excludeDatabaseMap = Maps.newHashMap(excludeDatabaseMap); cloned.customizedProperties = Maps.newHashMap(customizedProperties); @@ -212,15 +208,6 @@ public JdbcClientConfig setConnectionPoolKeepAlive(boolean connectionPoolKeepAli return this; } - public boolean isEnableConnectionPool() { - return enableConnectionPool; - } - - public JdbcClientConfig setEnableConnectionPool(boolean enableConnectionPool) { - this.enableConnectionPool = enableConnectionPool; - return this; - } - public Map getIncludeDatabaseMap() { return includeDatabaseMap; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcDB2Client.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcDB2Client.java index dafb00ca9e8781f..a353b7ac361353f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcDB2Client.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcDB2Client.java @@ -41,10 +41,11 @@ public String getTestQuery() { @Override public List getDatabaseNameList() { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List remoteDatabaseNames = Lists.newArrayList(); try { + conn = getConnection(); if (isOnlySpecifiedDatabase && includeDatabaseMap.isEmpty() && excludeDatabaseMap.isEmpty()) { String currentDatabase = conn.getSchema().trim(); remoteDatabaseNames.add(currentDatabase); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcGbaseClient.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcGbaseClient.java index 5aaacb3e673bb4b..7ba393e0d0aae63 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcGbaseClient.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcGbaseClient.java @@ -41,10 +41,11 @@ protected JdbcGbaseClient(JdbcClientConfig jdbcClientConfig) { @Override public List getDatabaseNameList() { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List remoteDatabaseNames = Lists.newArrayList(); try { + conn = getConnection(); if (isOnlySpecifiedDatabase && includeDatabaseMap.isEmpty() && excludeDatabaseMap.isEmpty()) { String currentDatabase = conn.getCatalog(); remoteDatabaseNames.add(currentDatabase); @@ -87,12 +88,13 @@ protected ResultSet getRemoteColumns(DatabaseMetaData databaseMetaData, String c @Override public List getJdbcColumnsInfo(String localDbName, String localTableName) { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List tableSchema = Lists.newArrayList(); String remoteDbName = getRemoteDatabaseName(localDbName); String remoteTableName = getRemoteTableName(localDbName, localTableName); try { + conn = getConnection(); DatabaseMetaData databaseMetaData = conn.getMetaData(); String catalogName = getCatalogName(conn); rs = getRemoteColumns(databaseMetaData, catalogName, remoteDbName, remoteTableName); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcMySQLClient.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcMySQLClient.java index 465a3c152acfa5b..a8263f1621a3a88 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcMySQLClient.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcMySQLClient.java @@ -76,10 +76,11 @@ protected JdbcMySQLClient(JdbcClientConfig jdbcClientConfig, String dbType) { @Override public List getDatabaseNameList() { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List remoteDatabaseNames = Lists.newArrayList(); try { + conn = getConnection(); if (isOnlySpecifiedDatabase && includeDatabaseMap.isEmpty() && excludeDatabaseMap.isEmpty()) { String currentDatabase = conn.getCatalog(); remoteDatabaseNames.add(currentDatabase); @@ -130,12 +131,13 @@ protected ResultSet getRemoteColumns(DatabaseMetaData databaseMetaData, String c */ @Override public List getJdbcColumnsInfo(String localDbName, String localTableName) { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List tableSchema = Lists.newArrayList(); String remoteDbName = getRemoteDatabaseName(localDbName); String remoteTableName = getRemoteTableName(localDbName, localTableName); try { + conn = getConnection(); DatabaseMetaData databaseMetaData = conn.getMetaData(); String catalogName = getCatalogName(conn); rs = getRemoteColumns(databaseMetaData, catalogName, remoteDbName, remoteTableName); @@ -294,30 +296,33 @@ private boolean isConvertDatetimeToNull(JdbcClientConfig jdbcClientConfig) { * get all columns like DatabaseMetaData.getColumns in mysql-jdbc-connector */ private Map getColumnsDataTypeUseQuery(String remoteDbName, String remoteTableName) { - Connection conn = getConnection(); + Connection conn = null; + Statement stmt = null; ResultSet resultSet = null; - Map fieldtoType = Maps.newHashMap(); + Map fieldToType = Maps.newHashMap(); StringBuilder queryBuf = new StringBuilder("SHOW FULL COLUMNS FROM "); 
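// Illustrative sketch (not part of the patch): the JDBC client changes above acquire the
// connection inside the try block and release resources in a finally block, in reverse order of
// acquisition (ResultSet, then Statement, then Connection), so pooled connections are returned
// even when the query fails. The DataSource usage and names below are examples only.
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import javax.sql.DataSource;

public final class JdbcCleanupExample {
    static long countRows(DataSource dataSource, String table) throws SQLException {
        Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        try {
            conn = dataSource.getConnection();
            stmt = conn.createStatement();
            rs = stmt.executeQuery("SELECT COUNT(*) FROM " + table);
            return rs.next() ? rs.getLong(1) : 0L;
        } finally {
            closeQuietly(rs);   // close in reverse order of acquisition
            closeQuietly(stmt);
            closeQuietly(conn); // returns the connection to the pool
        }
    }

    private static void closeQuietly(AutoCloseable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (Exception e) {
                // Log and continue; a failed close must not mask the original exception.
            }
        }
    }
}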
queryBuf.append(remoteTableName); queryBuf.append(" FROM "); queryBuf.append(remoteDbName); - try (Statement stmt = conn.createStatement()) { + try { + conn = getConnection(); + stmt = conn.createStatement(); resultSet = stmt.executeQuery(queryBuf.toString()); while (resultSet.next()) { // get column name String fieldName = resultSet.getString("Field"); // get original type name String typeName = resultSet.getString("Type"); - fieldtoType.put(fieldName, typeName); + fieldToType.put(fieldName, typeName); } } catch (SQLException e) { throw new JdbcClientException("failed to get jdbc columns info for remote table `%s.%s`: %s", remoteDbName, remoteTableName, Util.getRootCauseMessage(e)); } finally { - close(resultSet, conn); + close(resultSet, stmt, conn); } - return fieldtoType; + return fieldToType; } private Type dorisTypeToDoris(JdbcFieldSchema fieldSchema) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcOracleClient.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcOracleClient.java index d37b36cbf3de155..9968de79ab3a7de 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcOracleClient.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/jdbc/client/JdbcOracleClient.java @@ -50,12 +50,13 @@ public String getTestQuery() { @Override public List getJdbcColumnsInfo(String localDbName, String localTableName) { - Connection conn = getConnection(); + Connection conn = null; ResultSet rs = null; List tableSchema = Lists.newArrayList(); String remoteDbName = getRemoteDatabaseName(localDbName); String remoteTableName = getRemoteTableName(localDbName, localTableName); try { + conn = getConnection(); DatabaseMetaData databaseMetaData = conn.getMetaData(); String catalogName = getCatalogName(conn); String modifiedTableName; diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonExternalTable.java index 4b364ef45ca321d..c9eaf1b7df32ef1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonExternalTable.java @@ -193,12 +193,16 @@ public long fetchRowCount() { Table paimonTable = schemaCacheValue.map(value -> ((PaimonSchemaCacheValue) value).getPaimonTable()) .orElse(null); if (paimonTable == null) { - return -1; + LOG.info("Paimon table {} is null.", name); + return UNKNOWN_ROW_COUNT; } List splits = paimonTable.newReadBuilder().newScan().plan().splits(); for (Split split : splits) { rowCount += split.rowCount(); } - return rowCount; + if (rowCount == 0) { + LOG.info("Paimon table {} row count is 0, return -1", name); + } + return rowCount > 0 ? 
rowCount : UNKNOWN_ROW_COUNT; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonFileExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonFileExternalCatalog.java index 9b956a551d5b934..e74f3deeaf501e2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonFileExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonFileExternalCatalog.java @@ -17,9 +17,11 @@ package org.apache.doris.datasource.paimon; +import org.apache.doris.common.util.LocationPath; import org.apache.doris.datasource.property.PropertyConverter; import org.apache.doris.datasource.property.constants.CosProperties; import org.apache.doris.datasource.property.constants.ObsProperties; +import org.apache.doris.datasource.property.constants.OssProperties; import org.apache.doris.datasource.property.constants.PaimonProperties; import org.apache.logging.log4j.LogManager; @@ -53,12 +55,17 @@ protected void setPaimonCatalogOptions(Map properties, Map splitStats = new ArrayList<>(); + private SessionVariable sessionVariable; - public PaimonScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumnPriv) { + public PaimonScanNode(PlanNodeId id, + TupleDescriptor desc, + boolean needCheckColumnPriv, + SessionVariable sessionVariable) { super(id, desc, "PAIMON_SCAN_NODE", StatisticalType.PAIMON_SCAN_NODE, needCheckColumnPriv); + this.sessionVariable = sessionVariable; } @Override @@ -176,7 +181,9 @@ private void setPaimonParams(TFileRangeDesc rangeDesc, PaimonSplit paimonSplit) @Override public List getSplits() throws UserException { - boolean forceJniScanner = ConnectContext.get().getSessionVariable().isForceJniScanner(); + boolean forceJniScanner = sessionVariable.isForceJniScanner(); + SessionVariable.IgnoreSplitType ignoreSplitType = + SessionVariable.IgnoreSplitType.valueOf(sessionVariable.getIgnoreSplitType()); List splits = new ArrayList<>(); int[] projected = desc.getSlots().stream().mapToInt( slot -> (source.getPaimonTable().rowType().getFieldNames().indexOf(slot.getColumn().getName()))) @@ -196,7 +203,11 @@ public List getSplits() throws UserException { selectedPartitionValues.add(partitionValue); Optional> optRawFiles = dataSplit.convertToRawFiles(); Optional> optDeletionFiles = dataSplit.deletionFiles(); + if (supportNativeReader(optRawFiles)) { + if (ignoreSplitType == SessionVariable.IgnoreSplitType.IGNORE_NATIVE) { + continue; + } splitStat.setType(SplitReadType.NATIVE); splitStat.setRawFileConvertable(true); List rawFiles = optRawFiles.get(); @@ -252,10 +263,16 @@ public List getSplits() throws UserException { } } } else { + if (ignoreSplitType == SessionVariable.IgnoreSplitType.IGNORE_JNI) { + continue; + } splits.add(new PaimonSplit(split)); ++paimonSplitNum; } } else { + if (ignoreSplitType == SessionVariable.IgnoreSplitType.IGNORE_JNI) { + continue; + } splits.add(new PaimonSplit(split)); ++paimonSplitNum; } @@ -263,6 +280,8 @@ public List getSplits() throws UserException { } this.selectedPartitionNum = selectedPartitionValues.size(); // TODO: get total partition number + // We should set fileSplitSize at the end because fileSplitSize may be modified in splitFile. 
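// Illustrative sketch (not part of the patch): the split-weight changes in this patch
// (FileSplit.getSplitWeight and the setTargetSplitSize calls in the scan nodes) let each split
// report a scheduling weight proportional to its own size, clamped to [0.01, 1.0]. The helper
// below reproduces only that proportion/clamp math; names and constants are examples only.
public final class SplitWeightExample {
    static double splitWeightProportion(long selfSplitWeight, long targetSplitSize) {
        double proportion = selfSplitWeight * 1.0 / targetSplitSize;
        // Clamp between the minimum weight (0.01) and the standard weight (1.0).
        return Math.min(Math.max(proportion, 0.01), 1.0);
    }

    public static void main(String[] args) {
        long target = 128L * 1024 * 1024; // e.g. a 128 MB target split size
        System.out.println(splitWeightProportion(8L * 1024 * 1024, target));   // 0.0625
        System.out.println(splitWeightProportion(512L * 1024 * 1024, target)); // 1.0 (capped)
        System.out.println(splitWeightProportion(1024L, target));              // 0.01 (floor)
    }
}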
+ splits.forEach(s -> s.setTargetSplitSize(fileSplitSize)); return splits; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonSplit.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonSplit.java index ffd063d77e8bab8..3ab38c7db28e9e8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonSplit.java @@ -23,11 +23,14 @@ import org.apache.doris.datasource.TableFormatType; import com.google.common.collect.Maps; +import org.apache.paimon.io.DataFileMeta; +import org.apache.paimon.table.source.DataSplit; import org.apache.paimon.table.source.DeletionFile; import org.apache.paimon.table.source.Split; import java.util.List; import java.util.Optional; +import java.util.UUID; public class PaimonSplit extends FileSplit { private static final LocationPath DUMMY_PATH = new LocationPath("/dummyPath", Maps.newHashMap()); @@ -35,11 +38,20 @@ public class PaimonSplit extends FileSplit { private TableFormatType tableFormatType; private Optional optDeletionFile; + public PaimonSplit(Split split) { super(DUMMY_PATH, 0, 0, 0, 0, null, null); this.split = split; this.tableFormatType = TableFormatType.PAIMON; this.optDeletionFile = Optional.empty(); + + if (split instanceof DataSplit) { + List dataFileMetas = ((DataSplit) split).dataFiles(); + this.path = new LocationPath("/" + dataFileMetas.get(0).fileName()); + this.selfSplitWeight = dataFileMetas.stream().mapToLong(DataFileMeta::fileSize).sum(); + } else { + this.selfSplitWeight = split.rowCount(); + } } private PaimonSplit(LocationPath file, long start, long length, long fileLength, long modificationTime, @@ -47,6 +59,15 @@ private PaimonSplit(LocationPath file, long start, long length, long fileLength, super(file, start, length, fileLength, modificationTime, hosts, partitionList); this.tableFormatType = TableFormatType.PAIMON; this.optDeletionFile = Optional.empty(); + this.selfSplitWeight = length; + } + + @Override + public String getConsistentHashString() { + if (this.path == DUMMY_PATH) { + return UUID.randomUUID().toString(); + } + return getPathString(); } public Split getSplit() { @@ -66,6 +87,7 @@ public Optional getDeletionFile() { } public void setDeletionFile(DeletionFile deletionFile) { + this.selfSplitWeight += deletionFile.length(); this.optDeletionFile = Optional.of(deletionFile); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/remote/RemoteFSPhantomManager.java b/fe/fe-core/src/main/java/org/apache/doris/fs/remote/RemoteFSPhantomManager.java index 282361c4cb63b01..c0e48a1346651a3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/fs/remote/RemoteFSPhantomManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/fs/remote/RemoteFSPhantomManager.java @@ -19,6 +19,7 @@ import org.apache.doris.common.CustomThreadFactory; +import com.google.common.collect.Sets; import org.apache.hadoop.fs.FileSystem; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -27,6 +28,7 @@ import java.lang.ref.PhantomReference; import java.lang.ref.Reference; import java.lang.ref.ReferenceQueue; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; @@ -63,6 +65,8 @@ public class RemoteFSPhantomManager { private static final ConcurrentHashMap, FileSystem> referenceMap = new ConcurrentHashMap<>(); + private static final Set fsSet 
= Sets.newConcurrentHashSet(); + // Flag indicating whether the cleanup thread has been started private static final AtomicBoolean isStarted = new AtomicBoolean(false); @@ -77,9 +81,13 @@ public static void registerPhantomReference(RemoteFileSystem remoteFileSystem) { start(); isStarted.set(true); } + if (fsSet.contains(remoteFileSystem.dfsFileSystem)) { + throw new RuntimeException("FileSystem already exists: " + remoteFileSystem.dfsFileSystem.getUri()); + } RemoteFileSystemPhantomReference phantomReference = new RemoteFileSystemPhantomReference(remoteFileSystem, referenceQueue); referenceMap.put(phantomReference, remoteFileSystem.dfsFileSystem); + fsSet.add(remoteFileSystem.dfsFileSystem); } /** @@ -102,6 +110,7 @@ public static void start() { if (fs != null) { try { fs.close(); + fsSet.remove(fs); LOG.info("Closed file system: {}", fs.getUri()); } catch (IOException e) { LOG.warn("Failed to close file system", e); diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/remote/S3FileSystem.java b/fe/fe-core/src/main/java/org/apache/doris/fs/remote/S3FileSystem.java index 87ba086baecbf12..f8805bd0d4fb9a4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/fs/remote/S3FileSystem.java +++ b/fe/fe-core/src/main/java/org/apache/doris/fs/remote/S3FileSystem.java @@ -20,6 +20,8 @@ import org.apache.doris.analysis.StorageBackend; import org.apache.doris.backup.Status; import org.apache.doris.common.UserException; +import org.apache.doris.common.security.authentication.AuthenticationConfig; +import org.apache.doris.common.security.authentication.HadoopAuthenticator; import org.apache.doris.datasource.property.PropertyConverter; import org.apache.doris.fs.obj.S3ObjStorage; import org.apache.doris.fs.remote.dfs.DFSFileSystem; @@ -34,6 +36,7 @@ import org.apache.logging.log4j.Logger; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.List; import java.util.Map; @@ -74,12 +77,20 @@ protected FileSystem nativeFileSystem(String remotePath) throws UserException { PropertyConverter.convertToHadoopFSProperties(properties).entrySet().stream() .filter(entry -> entry.getKey() != null && entry.getValue() != null) .forEach(entry -> conf.set(entry.getKey(), entry.getValue())); + AuthenticationConfig authConfig = AuthenticationConfig.getKerberosConfig(conf); + HadoopAuthenticator authenticator = HadoopAuthenticator.getHadoopAuthenticator(authConfig); try { - dfsFileSystem = FileSystem.get(new Path(remotePath).toUri(), conf); + dfsFileSystem = authenticator.doAs(() -> { + try { + return FileSystem.get(new Path(remotePath).toUri(), conf); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + RemoteFSPhantomManager.registerPhantomReference(this); } catch (Exception e) { throw new UserException("Failed to get S3 FileSystem for " + e.getMessage(), e); } - RemoteFSPhantomManager.registerPhantomReference(this); } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/remote/dfs/DFSFileSystem.java b/fe/fe-core/src/main/java/org/apache/doris/fs/remote/dfs/DFSFileSystem.java index 7034641a9fc1287..2146472aec7b21c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/fs/remote/dfs/DFSFileSystem.java +++ b/fe/fe-core/src/main/java/org/apache/doris/fs/remote/dfs/DFSFileSystem.java @@ -99,11 +99,11 @@ public FileSystem nativeFileSystem(String remotePath) throws UserException { throw new RuntimeException(e); } }); + operations = new HDFSFileOperations(dfsFileSystem); + RemoteFSPhantomManager.registerPhantomReference(this); } catch (Exception e) { - throw new 
UserException(e); + throw new UserException("Failed to get dfs FileSystem for " + e.getMessage(), e); } - operations = new HDFSFileOperations(dfsFileSystem); - RemoteFSPhantomManager.registerPhantomReference(this); } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java index 33418531f2cda8f..0a07b5b6851eac5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java @@ -98,7 +98,6 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Optional; -import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; @Data @@ -207,9 +206,7 @@ public class ExportJob implements Writable { // backend_address => snapshot path private List<Pair<TNetworkAddress, String>> snapshotPaths = Lists.newArrayList(); - private List<ExportTaskExecutor> jobExecutorList; - - private ConcurrentHashMap<Long, ExportTaskExecutor> taskIdToExecutor = new ConcurrentHashMap<>(); + private List<ExportTaskExecutor> jobExecutorList = Lists.newArrayList(); private Integer finishedTaskCount = 0; private List<List<OutfileInfo>> allOutfileInfo = Lists.newArrayList(); @@ -399,8 +396,8 @@ private StatementBase generateLogicalPlanAdapter(LogicalPlan outfileLogicalPlan) return statementBase; } - public List<ExportTaskExecutor> getTaskExecutors() { - return jobExecutorList; + public List<ExportTaskExecutor> getCopiedTaskExecutors() { + return Lists.newArrayList(jobExecutorList); } private void generateExportJobExecutor() { @@ -690,11 +687,11 @@ private void cancelExportTask(ExportFailMsg.CancelType type, String msg) throws } // we need cancel all task - taskIdToExecutor.keySet().forEach(id -> { + jobExecutorList.forEach(executor -> { try { - Env.getCurrentEnv().getTransientTaskManager().cancelMemoryTask(id); + Env.getCurrentEnv().getTransientTaskManager().cancelMemoryTask(executor.getId()); } catch (JobException e) { - LOG.warn("cancel export task {} exception: {}", id, e); + LOG.warn("cancel export task {} exception: {}", executor.getId(), e); } }); @@ -705,10 +702,11 @@ private void cancelExportJobUnprotected(ExportFailMsg.CancelType type, String ms setExportJobState(ExportJobState.CANCELLED); finishTimeMs = System.currentTimeMillis(); failMsg = new ExportFailMsg(type, msg); + jobExecutorList.clear(); if (FeConstants.runningUnitTest) { return; } - Env.getCurrentEnv().getEditLog().logExportUpdateState(id, ExportJobState.CANCELLED); + Env.getCurrentEnv().getEditLog().logExportUpdateState(this, ExportJobState.CANCELLED); } private void exportExportJob() { @@ -749,7 +747,9 @@ private void finishExportJobUnprotected() { setExportJobState(ExportJobState.FINISHED); finishTimeMs = System.currentTimeMillis(); outfileInfo = GsonUtils.GSON.toJson(allOutfileInfo); - Env.getCurrentEnv().getEditLog().logExportUpdateState(id, ExportJobState.FINISHED); + // Clear the jobExecutorList to release memory.
+ jobExecutorList.clear(); + Env.getCurrentEnv().getEditLog().logExportUpdateState(this, ExportJobState.FINISHED); } public void replayExportJobState(ExportJobState newState) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJobStateTransfer.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJobStateTransfer.java index 06253b1f1e85f95..4b6f2ff9f516b9a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJobStateTransfer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJobStateTransfer.java @@ -54,10 +54,9 @@ public ExportJobStateTransfer() { } // used for persisting one log - public ExportJobStateTransfer(long jobId, ExportJobState state) { - this.jobId = jobId; + public ExportJobStateTransfer(ExportJob job, ExportJobState state) { + this.jobId = job.getId(); this.state = state; - ExportJob job = Env.getCurrentEnv().getExportMgr().getJob(jobId); this.startTimeMs = job.getStartTimeMs(); this.finishTimeMs = job.getFinishTimeMs(); this.failMsg = job.getFailMsg(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java index 7dbe953cf9bdbc0..94f43d531bb1f19 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java @@ -67,8 +67,8 @@ public class ExportMgr { // dbid ->
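The ExportJob changes above combine two ideas: the edit-log payload is now built from the job object the caller already holds (ExportJobStateTransfer takes the ExportJob itself rather than looking it up again by id), and the executor list is cleared once the job reaches a terminal state so the task executors can be reclaimed. A minimal sketch of that pattern, using hypothetical Job and StateLogEntry stand-ins rather than the actual Doris classes:

    import java.util.ArrayList;
    import java.util.List;

    // Hypothetical stand-ins for ExportJob / ExportJobStateTransfer, illustration only.
    class StateLogEntry {
        final long jobId;
        final String state;
        final long finishTimeMs;

        // Built directly from the job object the caller already holds,
        // instead of re-resolving the job by id from a global manager.
        StateLogEntry(Job job, String state) {
            this.jobId = job.id;
            this.state = state;
            this.finishTimeMs = job.finishTimeMs;
        }
    }

    class Job {
        final long id;
        long finishTimeMs;
        String state = "EXPORTING";
        final List<Runnable> taskExecutors = new ArrayList<>();

        Job(long id) {
            this.id = id;
        }

        StateLogEntry finish() {
            state = "FINISHED";
            finishTimeMs = System.currentTimeMillis();
            // Terminal state: drop executor references so they can be garbage-collected.
            taskExecutors.clear();
            return new StateLogEntry(this, state);
        }
    }

Building the log entry from the in-memory job presumably avoids a second lookup that could observe a missing or stale job; the comment in the diff states the clear() is there to release memory held by the executors.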