diff --git a/README.md b/README.md index 94f9f4b777f8f5..3d264ee13ed8ad 100644 --- a/README.md +++ b/README.md @@ -59,12 +59,9 @@ Apache Doris is an easy-to-use, high-performance and real-time analytical databa All this makes Apache Doris an ideal tool for scenarios including report analysis, ad-hoc query, unified data warehouse, and data lake query acceleration. On Apache Doris, users can build various applications, such as user behavior analysis, AB test platform, log retrieval analysis, user portrait analysis, and order analysis. -πŸŽ‰ Version 2.1.4 released now. Check out the πŸ”—[Release Notes](https://doris.apache.org/docs/releasenotes/release-2.1.4) here. The 2.1 verison delivers exceptional performance with 100% higher out-of-the-box queries proven by TPC-DS 1TB tests, enhanced data lake analytics that are 4-6 times speedier than Trino and Spark, solid support for semi-structured data analysis with new Variant types and suite of analytical functions, asynchronous materialized views for query acceleration, optimized real-time writing at scale, and better workload management with stability and runtime SQL resource tracking. +πŸŽ‰ Check out the πŸ”—[All releases](https://doris.apache.org/docs/releasenotes/all-release), where you'll find a chronological summary of Apache Doris versions released over the past year. - -πŸŽ‰ Version 2.0.12 is now released ! This fully evolved and stable release is ready for all users to upgrade. Check out the πŸ”—[Release Notes](https://doris.apache.org/docs/2.0/releasenotes/release-2.0.12) here. - -πŸ‘€ Have a look at the πŸ”—[Official Website](https://doris.apache.org/) for a comprehensive list of Apache Doris's core features, blogs and user cases. +πŸ‘€ Explore the πŸ”—[Official Website](https://doris.apache.org/) to discover Apache Doris's core features, blogs, and user cases in detail. 
## πŸ“ˆ Usage Scenarios diff --git a/be/src/agent/workload_group_listener.cpp b/be/src/agent/workload_group_listener.cpp index 7b688b7dcdf6ef..0cd5a3ee1ac748 100644 --- a/be/src/agent/workload_group_listener.cpp +++ b/be/src/agent/workload_group_listener.cpp @@ -17,6 +17,7 @@ #include "agent/workload_group_listener.h" +#include "runtime/exec_env.h" #include "runtime/workload_group/workload_group.h" #include "runtime/workload_group/workload_group_manager.h" #include "util/mem_info.h" diff --git a/be/src/agent/workload_group_listener.h b/be/src/agent/workload_group_listener.h index f596535908d079..9578a36f70d63e 100644 --- a/be/src/agent/workload_group_listener.h +++ b/be/src/agent/workload_group_listener.h @@ -20,10 +20,11 @@ #include #include "agent/topic_listener.h" -#include "runtime/exec_env.h" namespace doris { +class ExecEnv; + class WorkloadGroupListener : public TopicListener { public: ~WorkloadGroupListener() {} diff --git a/be/src/apache-orc b/be/src/apache-orc index db01184f765c03..2f937bdc76406f 160000 --- a/be/src/apache-orc +++ b/be/src/apache-orc @@ -1 +1 @@ -Subproject commit db01184f765c03496e4107bd3ac37c077ac4bc5f +Subproject commit 2f937bdc76406f150b484b6e57629aa8a03d48b6 diff --git a/be/src/cloud/cloud_base_compaction.cpp b/be/src/cloud/cloud_base_compaction.cpp index 9742e57dcf9d34..d053214e964a78 100644 --- a/be/src/cloud/cloud_base_compaction.cpp +++ b/be/src/cloud/cloud_base_compaction.cpp @@ -268,8 +268,9 @@ Status CloudBaseCompaction::execute_compact() { << ", output_version=" << _output_version; return res; } - LOG_INFO("finish CloudBaseCompaction, tablet_id={}, cost={}ms", _tablet->tablet_id(), - duration_cast(steady_clock::now() - start).count()) + LOG_INFO("finish CloudBaseCompaction, tablet_id={}, cost={}ms range=[{}-{}]", + _tablet->tablet_id(), duration_cast(steady_clock::now() - start).count(), + _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) .tag("job_id", _uuid) .tag("input_rowsets", 
_input_rowsets.size()) .tag("input_rows", _input_row_num) @@ -343,7 +344,7 @@ Status CloudBaseCompaction::modify_rowsets() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("update_bitmap_size", output_rowset_delete_bitmap->delete_bitmap.size()); + .tag("num_output_delete_bitmap", output_rowset_delete_bitmap->delete_bitmap.size()); compaction_job->set_delete_bitmap_lock_initiator(initiator); } diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index 6b4dbe360da651..c7a82b322fb82a 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -92,6 +92,10 @@ Status CloudCumulativeCompaction::prepare_compact() { // plus 1 to skip the delete version. // NOTICE: after that, the cumulative point may be larger than max version of this tablet, but it doesn't matter. update_cumulative_point(); + if (!config::enable_sleep_between_delete_cumu_compaction) { + st = Status::Error( + "_last_delete_version.first not equal to -1"); + } } return st; } @@ -200,8 +204,9 @@ Status CloudCumulativeCompaction::execute_compact() { << ", output_version=" << _output_version; return res; } - LOG_INFO("finish CloudCumulativeCompaction, tablet_id={}, cost={}ms", _tablet->tablet_id(), - duration_cast(steady_clock::now() - start).count()) + LOG_INFO("finish CloudCumulativeCompaction, tablet_id={}, cost={}ms, range=[{}-{}]", + _tablet->tablet_id(), duration_cast(steady_clock::now() - start).count(), + _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) .tag("job_id", _uuid) .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) @@ -295,7 +300,8 @@ Status CloudCumulativeCompaction::modify_rowsets() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("update_bitmap_size", 
output_rowset_delete_bitmap->delete_bitmap.size()); + .tag("number_output_delete_bitmap", + output_rowset_delete_bitmap->delete_bitmap.size()); compaction_job->set_delete_bitmap_lock_initiator(initiator); } diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index 835e74ca7d5687..2cd6b58c57b5f6 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -143,6 +143,10 @@ class MetaServiceProxy { } private: + static bool is_meta_service_endpoint_list() { + return config::meta_service_endpoint.find(',') != std::string::npos; + } + static Status get_pooled_client(std::shared_ptr* stub) { static std::once_flag proxies_flag; static size_t num_proxies = 1; @@ -154,9 +158,6 @@ class MetaServiceProxy { if (config::meta_service_connection_pooled) { num_proxies = config::meta_service_connection_pool_size; } - if (config::meta_service_endpoint.find(',') != std::string::npos) { - is_meta_service_endpoint_list = true; - } proxies = std::make_unique(num_proxies); }); @@ -175,7 +176,7 @@ class MetaServiceProxy { const char* load_balancer_name = nullptr; std::string endpoint; - if (is_meta_service_endpoint_list) { + if (is_meta_service_endpoint_list()) { endpoint = fmt::format("list://{}", config::meta_service_endpoint); load_balancer_name = "random"; } else { @@ -215,7 +216,7 @@ class MetaServiceProxy { bool is_idle_timeout(long now) { auto idle_timeout_ms = config::meta_service_idle_connection_timeout_ms; // idle timeout only works without list endpoint. - return !is_meta_service_endpoint_list && idle_timeout_ms > 0 && + return !is_meta_service_endpoint_list() && idle_timeout_ms > 0 && _last_access_at_ms.load(std::memory_order_relaxed) + idle_timeout_ms < now; } @@ -243,7 +244,7 @@ class MetaServiceProxy { long deadline = now; // connection age only works without list endpoint. 
- if (!is_meta_service_endpoint_list && + if (!is_meta_service_endpoint_list() && config::meta_service_connection_age_base_seconds > 0) { std::default_random_engine rng(static_cast(now)); std::uniform_int_distribution<> uni( @@ -262,16 +263,12 @@ class MetaServiceProxy { return Status::OK(); } - static std::atomic_bool is_meta_service_endpoint_list; - std::shared_mutex _mutex; std::atomic _last_access_at_ms {0}; long _deadline_ms {0}; std::shared_ptr _stub; }; -std::atomic_bool MetaServiceProxy::is_meta_service_endpoint_list = false; - template struct is_any : std::disjunction...> {}; diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index 1cc4d052a81d69..b086def3c03ee5 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -169,6 +169,15 @@ Status CloudSchemaChangeJob::process_alter_tablet(const TAlterTabletReqV2& reque reader_context.batch_size = ALTER_TABLE_BATCH_SIZE; reader_context.delete_bitmap = &_base_tablet->tablet_meta()->delete_bitmap(); reader_context.version = Version(0, start_resp.alter_version()); + std::vector cluster_key_idxes; + if (!_base_tablet_schema->cluster_key_uids().empty()) { + for (const auto& uid : _base_tablet_schema->cluster_key_uids()) { + cluster_key_idxes.emplace_back(_base_tablet_schema->field_index(uid)); + } + reader_context.read_orderby_key_columns = &cluster_key_idxes; + reader_context.is_unique = false; + reader_context.sequence_id_idx = -1; + } for (auto& split : rs_splits) { RETURN_IF_ERROR(split.rs_reader->init(&reader_context)); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index b66a9cfbdb2245..650909a29157cd 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -677,7 +677,8 @@ Status CloudStorageEngine::_submit_cumulative_compaction_task(const CloudTabletS auto st = compaction->prepare_compact(); if (!st.ok()) { long now = 
duration_cast(system_clock::now().time_since_epoch()).count(); - if (st.is()) { + if (st.is() && + st.msg() != "_last_delete_version.first not equal to -1") { // Backoff strategy if no suitable version tablet->last_cumu_no_suitable_version_ms = now; } diff --git a/be/src/cloud/cloud_storage_engine.h b/be/src/cloud/cloud_storage_engine.h index 072b8366542253..2cd47c52dbeb62 100644 --- a/be/src/cloud/cloud_storage_engine.h +++ b/be/src/cloud/cloud_storage_engine.h @@ -75,7 +75,7 @@ class CloudStorageEngine final : public BaseStorageEngine { void _check_file_cache_ttl_block_valid(); std::optional get_storage_resource(const std::string& vault_id) { - LOG(INFO) << "Getting storage resource for vault_id: " << vault_id; + VLOG_DEBUG << "Getting storage resource for vault_id: " << vault_id; bool synced = false; do { diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 93c7128756738c..f0f6f92e9d0c36 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -33,6 +33,7 @@ #include "cloud/cloud_meta_mgr.h" #include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet_mgr.h" +#include "common/logging.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" #include "olap/cumulative_compaction_time_series_policy.h" @@ -54,6 +55,7 @@ namespace doris { using namespace ErrorCode; static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1; +static constexpr int LOAD_INITIATOR_ID = -1; CloudTablet::CloudTablet(CloudStorageEngine& engine, TabletMetaSharedPtr tablet_meta) : BaseTablet(std::move(tablet_meta)), _engine(engine) {} @@ -407,6 +409,9 @@ uint64_t CloudTablet::delete_expired_stale_rowsets() { auto rs_it = _stale_rs_version_map.find(v_ts->version()); if (rs_it != _stale_rs_version_map.end()) { expired_rowsets.push_back(rs_it->second); + LOG(INFO) << "erase stale rowset, tablet_id=" << tablet_id() + << " rowset_id=" << rs_it->second->rowset_id().to_string() + << " version=" << 
rs_it->first.to_string(); _stale_rs_version_map.erase(rs_it); } else { LOG(WARNING) << "cannot find stale rowset " << v_ts->version() << " in tablet " @@ -504,13 +509,19 @@ Result> CloudTablet::create_rowset_writer( Result> CloudTablet::create_transient_rowset_writer( const Rowset& rowset, std::shared_ptr partial_update_info, int64_t txn_expiration) { - if (rowset.rowset_meta()->rowset_state() != RowsetStatePB::BEGIN_PARTIAL_UPDATE) [[unlikely]] { - // May cause the segment files generated by the transient rowset writer unable to be - // recycled, see `CloudRowsetWriter::build` for detail. - LOG(WARNING) << "Wrong rowset state: " << rowset.rowset_meta()->rowset_state(); - DCHECK(false) << rowset.rowset_meta()->rowset_state(); + if (rowset.rowset_meta_state() != RowsetStatePB::BEGIN_PARTIAL_UPDATE && + rowset.rowset_meta_state() != RowsetStatePB::COMMITTED) [[unlikely]] { + auto msg = fmt::format( + "wrong rowset state when create_transient_rowset_writer, rowset state should be " + "BEGIN_PARTIAL_UPDATE or COMMITTED, but found {}, rowset_id={}, tablet_id={}", + RowsetStatePB_Name(rowset.rowset_meta_state()), rowset.rowset_id().to_string(), + tablet_id()); + // see `CloudRowsetWriter::build` for detail. + // if this is in a retry task, the rowset state may have been changed to RowsetStatePB::COMMITTED + // in `RowsetMeta::merge_rowset_meta()` in previous trials. 
+ LOG(WARNING) << msg; + DCHECK(false) << msg; } - RowsetWriterContext context; context.rowset_state = PREPARED; context.segments_overlap = OVERLAPPING; @@ -650,11 +661,14 @@ void CloudTablet::get_compaction_status(std::string* json_result) { } void CloudTablet::set_cumulative_layer_point(int64_t new_point) { + if (new_point == Tablet::K_INVALID_CUMULATIVE_POINT || new_point >= _cumulative_point) { + _cumulative_point = new_point; + return; + } // cumulative point should only be reset to -1, or be increased - CHECK(new_point == Tablet::K_INVALID_CUMULATIVE_POINT || new_point >= _cumulative_point) - << "Unexpected cumulative point: " << new_point - << ", origin: " << _cumulative_point.load(); - _cumulative_point = new_point; + // FIXME: could happen in currently unresolved race conditions + LOG(WARNING) << "Unexpected cumulative point: " << new_point + << ", origin: " << _cumulative_point.load(); } std::vector CloudTablet::pick_candidate_rowsets_to_base_compaction() { @@ -703,6 +717,9 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t tx if (txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update() && rowset_writer->num_rows() > 0) { + DBUG_EXECUTE_IF("CloudTablet::save_delete_bitmap.update_tmp_rowset.error", { + return Status::InternalError("injected update_tmp_rowset error."); + }); const auto& rowset_meta = rowset->rowset_meta(); RETURN_IF_ERROR(_engine.meta_mgr().update_tmp_rowset(*rowset_meta)); } @@ -719,8 +736,8 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t tx } auto ms_lock_id = lock_id == -1 ? 
txn_id : lock_id; - RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap( - *this, ms_lock_id, COMPACTION_DELETE_BITMAP_LOCK_ID, new_delete_bitmap.get())); + RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap(*this, ms_lock_id, LOAD_INITIATOR_ID, + new_delete_bitmap.get())); // store the delete bitmap with sentinel marks in txn_delete_bitmap_cache because if the txn is retried for some reason, // it will use the delete bitmap from txn_delete_bitmap_cache when re-calculating the delete bitmap, during which it will do @@ -856,9 +873,6 @@ Status CloudTablet::sync_meta() { } return st; } - if (tablet_meta->tablet_state() != TABLET_RUNNING) { // impossible - return Status::InternalError("invalid tablet state. tablet_id={}", tablet_id()); - } auto new_ttl_seconds = tablet_meta->ttl_seconds(); if (_tablet_meta->ttl_seconds() != new_ttl_seconds) { diff --git a/be/src/cloud/cloud_tablet_hotspot.cpp b/be/src/cloud/cloud_tablet_hotspot.cpp index dd197268646fbc..6391a2dc5c4928 100644 --- a/be/src/cloud/cloud_tablet_hotspot.cpp +++ b/be/src/cloud/cloud_tablet_hotspot.cpp @@ -57,18 +57,55 @@ TabletHotspot::~TabletHotspot() { } } -struct MapKeyHash { - int64_t operator()(const std::pair& key) const { - return std::hash {}(key.first) + std::hash {}(key.second); +void get_return_partitions( + const std::unordered_map, MapKeyHash>& + hot_partition, + const std::unordered_map, MapKeyHash>& + last_hot_partition, + std::vector* hot_tables, int& return_partitions, int N) { + for (const auto& [key, partition_to_value] : hot_partition) { + THotTableMessage msg; + msg.table_id = key.first; + msg.index_id = key.second; + for (const auto& [partition_id, value] : partition_to_value) { + if (return_partitions > N) { + return; + } + auto last_value_iter = last_hot_partition.find(key); + if (last_value_iter != last_hot_partition.end()) { + auto last_partition_iter = last_value_iter->second.find(partition_id); + if (last_partition_iter != last_value_iter->second.end()) { + const auto& 
last_value = last_partition_iter->second; + if (std::abs(static_cast(value.qpd) - + static_cast(last_value.qpd)) < 5 && + std::abs(static_cast(value.qpw) - + static_cast(last_value.qpw)) < 10 && + std::abs(static_cast(value.last_access_time) - + static_cast(last_value.last_access_time)) < 60) { + LOG(INFO) << "skip partition_id=" << partition_id << " qpd=" << value.qpd + << " qpw=" << value.qpw + << " last_access_time=" << value.last_access_time + << " last_qpd=" << last_value.qpd + << " last_qpw=" << last_value.qpw + << " last_access_time=" << last_value.last_access_time; + continue; + } + } + } + THotPartition hot_partition; + hot_partition.__set_partition_id(partition_id); + hot_partition.__set_query_per_day(value.qpd); + hot_partition.__set_query_per_week(value.qpw); + hot_partition.__set_last_access_time(value.last_access_time); + msg.hot_partitions.push_back(hot_partition); + return_partitions++; + } + msg.__isset.hot_partitions = !msg.hot_partitions.empty(); + hot_tables->push_back(std::move(msg)); } -}; -struct TabletHotspotMapValue { - uint64_t qpd = 0; // query per day - uint64_t qpw = 0; // query per week - int64_t last_access_time; -}; - -using TabletHotspotMapKey = std::pair; +} void TabletHotspot::get_top_n_hot_partition(std::vector* hot_tables) { // map, map> for day @@ -108,33 +145,14 @@ void TabletHotspot::get_top_n_hot_partition(std::vector* hot_t }); constexpr int N = 50; int return_partitions = 0; - auto get_return_partitions = - [=, &return_partitions]( - const std::unordered_map, - MapKeyHash>& hot_partition) { - for (const auto& [key, partition_to_value] : hot_partition) { - THotTableMessage msg; - msg.table_id = key.first; - msg.index_id = key.second; - for (const auto& [partition_id, value] : partition_to_value) { - if (return_partitions > N) { - return; - } - THotPartition hot_partition; - hot_partition.__set_partition_id(partition_id); - hot_partition.__set_query_per_day(value.qpd); - hot_partition.__set_query_per_week(value.qpw); - 
hot_partition.__set_last_access_time(value.last_access_time); - msg.hot_partitions.push_back(hot_partition); - return_partitions++; - } - msg.__isset.hot_partitions = !msg.hot_partitions.empty(); - hot_tables->push_back(std::move(msg)); - } - }; - get_return_partitions(day_hot_partitions); - get_return_partitions(week_hot_partitions); + + get_return_partitions(day_hot_partitions, _last_day_hot_partitions, hot_tables, + return_partitions, N); + get_return_partitions(week_hot_partitions, _last_week_hot_partitions, hot_tables, + return_partitions, N); + + _last_day_hot_partitions = std::move(day_hot_partitions); + _last_week_hot_partitions = std::move(week_hot_partitions); } void HotspotCounter::make_dot_point() { diff --git a/be/src/cloud/cloud_tablet_hotspot.h b/be/src/cloud/cloud_tablet_hotspot.h index af98f99a558b9b..0be1c085a6c990 100644 --- a/be/src/cloud/cloud_tablet_hotspot.h +++ b/be/src/cloud/cloud_tablet_hotspot.h @@ -49,6 +49,19 @@ struct HotspotCounter { }; using HotspotCounterPtr = std::shared_ptr; +using TabletHotspotMapKey = std::pair; + +struct TabletHotspotMapValue { + uint64_t qpd = 0; // query per day + uint64_t qpw = 0; // query per week + int64_t last_access_time; +}; + +struct MapKeyHash { + int64_t operator()(const std::pair& key) const { + return std::hash {}(key.first) + std::hash {}(key.second); + } +}; class TabletHotspot { public: @@ -71,6 +84,12 @@ class TabletHotspot { bool _closed {false}; std::mutex _mtx; std::condition_variable _cond; + std::unordered_map, + MapKeyHash> + _last_day_hot_partitions; + std::unordered_map, + MapKeyHash> + _last_week_hot_partitions; }; } // namespace doris diff --git a/be/src/cloud/cloud_tablet_mgr.cpp b/be/src/cloud/cloud_tablet_mgr.cpp index e7a7d254f3fa89..f60d0eeb5ba0dd 100644 --- a/be/src/cloud/cloud_tablet_mgr.cpp +++ b/be/src/cloud/cloud_tablet_mgr.cpp @@ -261,9 +261,6 @@ void CloudTabletMgr::sync_tablets(const CountDownLatch& stop_latch) { for (auto& weak_tablet : weak_tablets) { if (auto tablet = 
weak_tablet.lock()) { - if (tablet->tablet_state() != TABLET_RUNNING) { - continue; - } int64_t last_sync_time = tablet->last_sync_time_s; if (last_sync_time <= last_sync_time_bound) { sync_time_tablet_set.emplace(last_sync_time, weak_tablet); diff --git a/be/src/clucene b/be/src/clucene index a506dbb6c523aa..2204eaec46a68e 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit a506dbb6c523aa65044eb1c527a066d236172543 +Subproject commit 2204eaec46a68e5e9a1876b7021f24839ecb2cf0 diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index c3d00e23c98e5e..083b9f06c9491d 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -251,7 +251,7 @@ DEFINE_mInt32(download_low_speed_limit_kbps, "50"); // download low speed time(seconds) DEFINE_mInt32(download_low_speed_time, "300"); // whether to download small files in batch -DEFINE_mBool(enable_batch_download, "false"); +DEFINE_mBool(enable_batch_download, "true"); DEFINE_String(sys_log_dir, ""); DEFINE_String(user_function_dir, "${DORIS_HOME}/lib/udf"); @@ -1211,7 +1211,7 @@ DEFINE_Bool(exit_on_exception, "false"); DEFINE_Bool(enable_flush_file_cache_async, "true"); // cgroup -DEFINE_mString(doris_cgroup_cpu_path, ""); +DEFINE_String(doris_cgroup_cpu_path, ""); DEFINE_mBool(enable_be_proc_monitor, "false"); DEFINE_mInt32(be_proc_monitor_interval_ms, "10000"); @@ -1402,6 +1402,9 @@ DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); // Enable validation to check the correctness of table size. 
DEFINE_Bool(enable_table_size_correctness_check, "false"); DEFINE_Bool(force_regenerate_rowsetid_on_start_error, "false"); +DEFINE_mBool(enable_sleep_between_delete_cumu_compaction, "false"); + +DEFINE_mInt32(compaction_num_per_round, "1"); // clang-format off #ifdef BE_TEST diff --git a/be/src/common/config.h b/be/src/common/config.h index c0b2e19b49a6be..1e3d57ff763417 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1292,7 +1292,7 @@ DECLARE_mInt32(tablet_schema_cache_capacity); DECLARE_mBool(exit_on_exception); // cgroup -DECLARE_mString(doris_cgroup_cpu_path); +DECLARE_String(doris_cgroup_cpu_path); DECLARE_mBool(enable_be_proc_monitor); DECLARE_mInt32(be_proc_monitor_interval_ms); DECLARE_Int32(workload_group_metrics_interval_ms); @@ -1487,6 +1487,10 @@ DECLARE_Bool(force_regenerate_rowsetid_on_start_error); DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); // Enable validation to check the correctness of table size. DECLARE_Bool(enable_table_size_correctness_check); +// Enable sleep 5s between delete cumulative compaction. +DECLARE_mBool(enable_sleep_between_delete_cumu_compaction); + +DECLARE_mInt32(compaction_num_per_round); #ifdef BE_TEST // test s3 diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 73035ecf3957eb..2aaa58f4feb597 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -230,6 +230,11 @@ void refresh_memory_state_after_memory_change() { } void refresh_cache_capacity() { + if (doris::GlobalMemoryArbitrator::cache_adjust_capacity_notify.load( + std::memory_order_relaxed)) { + // the last cache capacity adjustment has not been completed. 
+ return; + } if (refresh_cache_capacity_sleep_time_ms <= 0) { auto cache_capacity_reduce_mem_limit = int64_t( doris::MemInfo::soft_mem_limit() * config::cache_capacity_reduce_mem_limit_frac); @@ -247,6 +252,8 @@ void refresh_cache_capacity() { new_cache_capacity_adjust_weighted; doris::GlobalMemoryArbitrator::notify_cache_adjust_capacity(); refresh_cache_capacity_sleep_time_ms = config::memory_gc_sleep_time_ms; + } else { + refresh_cache_capacity_sleep_time_ms = 0; } } refresh_cache_capacity_sleep_time_ms -= config::memory_maintenance_sleep_time_ms; @@ -437,6 +444,8 @@ void Daemon::calculate_metrics_thread() { // update lst map DorisMetrics::instance()->system_metrics()->get_network_traffic( &lst_net_send_bytes, &lst_net_receive_bytes); + + DorisMetrics::instance()->system_metrics()->update_be_avail_cpu_num(); } update_rowsets_and_segments_num_metrics(); } diff --git a/be/src/common/status.h b/be/src/common/status.h index 344f82a81b8e25..0252ec8564feeb 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -293,7 +293,8 @@ namespace ErrorCode { E(ENTRY_NOT_FOUND, -7002, false); \ E(INVALID_TABLET_STATE, -7211, false); \ E(ROWSETS_EXPIRED, -7311, false); \ - E(CGROUP_ERROR, -7411, false); + E(CGROUP_ERROR, -7411, false); \ + E(FATAL_ERROR, -7412, false); // Define constexpr int error_code_name = error_code_value #define M(NAME, ERRORCODE, ENABLESTACKTRACE) constexpr int NAME = ERRORCODE; @@ -446,6 +447,14 @@ class [[nodiscard]] Status { static Status OK() { return {}; } + template + static Status FatalError(std::string_view msg, Args&&... args) { +#ifndef NDEBUG + LOG(FATAL) << fmt::format(msg, std::forward(args)...); +#endif + return Error(msg, std::forward(args)...); + } + // default have stacktrace. could disable manually. 
#define ERROR_CTOR(name, code) \ template \ @@ -570,7 +579,7 @@ class [[nodiscard]] Status { // and another thread is call to_string method, it may core, because the _err_msg is an unique ptr and // it is deconstructed during copy method. // And also we could not use lock, because we need get status frequently to check if it is cancelled. -// The defaule value is ok. +// The default value is ok. class AtomicStatus { public: AtomicStatus() : error_st_(Status::OK()) {} diff --git a/be/src/common/version_internal.cpp b/be/src/common/version_internal.cpp index 1190242b6aa687..55402fab209400 100644 --- a/be/src/common/version_internal.cpp +++ b/be/src/common/version_internal.cpp @@ -34,6 +34,9 @@ int doris_build_version_minor() { int doris_build_version_patch() { return DORIS_BUILD_VERSION_PATCH; } +int doris_build_version_hotfix() { + return DORIS_BUILD_VERSION_HOTFIX; +} const char* doris_build_version_rc_version() { return DORIS_BUILD_VERSION_RC_VERSION; } @@ -56,4 +59,4 @@ const char* doris_build_info() { } // namespace version -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/common/version_internal.h b/be/src/common/version_internal.h index 8852d26dba9531..f4deaa15aff545 100644 --- a/be/src/common/version_internal.h +++ b/be/src/common/version_internal.h @@ -24,6 +24,7 @@ extern const char* doris_build_version_prefix(); extern int doris_build_version_major(); extern int doris_build_version_minor(); extern int doris_build_version_patch(); +extern int doris_build_version_hotfix(); extern const char* doris_build_version_rc_version(); extern const char* doris_build_version(); @@ -34,4 +35,4 @@ extern const char* doris_build_info(); } // namespace version -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/exec/decompressor.cpp b/be/src/exec/decompressor.cpp index 9365bb00288db1..5da2e6acbb9bdf 100644 --- a/be/src/exec/decompressor.cpp +++ b/be/src/exec/decompressor.cpp @@ -492,15 +492,15 
@@ Status Lz4BlockDecompressor::decompress(uint8_t* input, size_t input_len, size_t auto* output_ptr = output; while (input_len > 0) { - //if faild , fall back to large block begin - auto* large_block_input_ptr = input_ptr; - auto* large_block_output_ptr = output_ptr; - if (input_len < sizeof(uint32_t)) { - return Status::InvalidArgument(strings::Substitute( - "fail to do hadoop-lz4 decompress, input_len=$0", input_len)); + *more_input_bytes = sizeof(uint32_t) - input_len; + break; } + //if faild, fall back to large block begin + auto* large_block_input_ptr = input_ptr; + auto* large_block_output_ptr = output_ptr; + uint32_t remaining_decompressed_large_block_len = BigEndian::Load32(input_ptr); input_ptr += sizeof(uint32_t); @@ -609,15 +609,15 @@ Status SnappyBlockDecompressor::decompress(uint8_t* input, size_t input_len, auto* output_ptr = output; while (input_len > 0) { - //if faild , fall back to large block begin - auto* large_block_input_ptr = input_ptr; - auto* large_block_output_ptr = output_ptr; - if (input_len < sizeof(uint32_t)) { - return Status::InvalidArgument(strings::Substitute( - "fail to do hadoop-snappy decompress, input_len=$0", input_len)); + *more_input_bytes = sizeof(uint32_t) - input_len; + break; } + //if faild, fall back to large block begin + auto* large_block_input_ptr = input_ptr; + auto* large_block_output_ptr = output_ptr; + uint32_t remaining_decompressed_large_block_len = BigEndian::Load32(input_ptr); input_ptr += sizeof(uint32_t); diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp index 9805163802699a..0ccff6439b802b 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp @@ -26,6 +26,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaActiveQueriesScanner::_s_tbls_columns = { // name, 
type, size {"QUERY_ID", TYPE_VARCHAR, sizeof(StringRef), true}, @@ -92,7 +94,7 @@ Status SchemaActiveQueriesScanner::_get_active_queries_block_from_fe() { _active_query_block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError("active queries schema is not match for FE and BE"); } @@ -119,7 +121,7 @@ Status SchemaActiveQueriesScanner::get_next_block_internal(vectorized::Block* bl if (_active_query_block == nullptr) { RETURN_IF_ERROR(_get_active_queries_block_from_fe()); - _total_rows = _active_query_block->rows(); + _total_rows = (int)_active_query_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp index 74e95f4203217c..eb7b373c7dc7f6 100644 --- a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp +++ b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp @@ -25,6 +25,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaBackendActiveTasksScanner::_s_tbls_columns = { // name, type, size {"BE_ID", TYPE_BIGINT, sizeof(int64_t), false}, @@ -76,7 +78,7 @@ Status SchemaBackendActiveTasksScanner::get_next_block_internal(vectorized::Bloc ExecEnv::GetInstance()->runtime_query_statistics_mgr()->get_active_be_tasks_block( _task_stats_block.get()); - _total_rows = _task_stats_block->rows(); + _total_rows = (int)_task_stats_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_catalog_meta_cache_stats_scanner.cpp b/be/src/exec/schema_scanner/schema_catalog_meta_cache_stats_scanner.cpp index 4c067057729f21..576ae3f9e919c7 100644 --- a/be/src/exec/schema_scanner/schema_catalog_meta_cache_stats_scanner.cpp +++ 
b/be/src/exec/schema_scanner/schema_catalog_meta_cache_stats_scanner.cpp @@ -27,6 +27,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaCatalogMetaCacheStatsScanner::_s_tbls_columns = { {"CATALOG_NAME", TYPE_STRING, sizeof(StringRef), true}, {"CACHE_NAME", TYPE_STRING, sizeof(StringRef), true}, @@ -86,7 +88,7 @@ Status SchemaCatalogMetaCacheStatsScanner::_get_meta_cache_from_fe() { _block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError( "catalog meta cache stats schema is not match for FE and BE"); @@ -115,7 +117,7 @@ Status SchemaCatalogMetaCacheStatsScanner::get_next_block_internal(vectorized::B if (_block == nullptr) { RETURN_IF_ERROR(_get_meta_cache_from_fe()); - _total_rows = _block->rows(); + _total_rows = (int)_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.cpp b/be/src/exec/schema_scanner/schema_columns_scanner.cpp index b60dfc3d203f89..2cc827a7b43e78 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_columns_scanner.cpp @@ -30,6 +30,8 @@ #include "vec/common/string_ref.h" namespace doris { +#include "common/compile_check_begin.h" + class RuntimeState; namespace vectorized { @@ -411,7 +413,7 @@ Status SchemaColumnsScanner::_fill_block_impl(vectorized::Block* block) { { std::vector strs(columns_num); int offset_index = 0; - int cur_table_index = _table_index - _desc_result.tables_offset.size(); + int cur_table_index = int(_table_index - _desc_result.tables_offset.size()); for (int i = 0; i < columns_num; ++i) { while (_desc_result.tables_offset[offset_index] <= i) { @@ -609,14 +611,14 @@ Status SchemaColumnsScanner::_fill_block_impl(vectorized::Block* block) { // EXTRA { 
StringRef str = StringRef("", 0); - std::vector datas(columns_num, &str); - RETURN_IF_ERROR(fill_dest_column_for_range(block, 17, datas)); + std::vector filled_values(columns_num, &str); + RETURN_IF_ERROR(fill_dest_column_for_range(block, 17, filled_values)); } // PRIVILEGES { StringRef str = StringRef("", 0); - std::vector datas(columns_num, &str); - RETURN_IF_ERROR(fill_dest_column_for_range(block, 18, datas)); + std::vector filled_values(columns_num, &str); + RETURN_IF_ERROR(fill_dest_column_for_range(block, 18, filled_values)); } // COLUMN_COMMENT { diff --git a/be/src/exec/schema_scanner/schema_file_cache_statistics.cpp b/be/src/exec/schema_scanner/schema_file_cache_statistics.cpp index ecad274d218983..8a3efa0edc537c 100644 --- a/be/src/exec/schema_scanner/schema_file_cache_statistics.cpp +++ b/be/src/exec/schema_scanner/schema_file_cache_statistics.cpp @@ -25,6 +25,7 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" std::vector SchemaFileCacheStatisticsScanner::_s_tbls_columns = { // name, type, size @@ -68,7 +69,7 @@ Status SchemaFileCacheStatisticsScanner::get_next_block_internal(vectorized::Blo _stats_block->reserve(_block_rows_limit); ExecEnv::GetInstance()->file_cache_factory()->get_cache_stats_block(_stats_block.get()); - _total_rows = _stats_block->rows(); + _total_rows = (int)_stats_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp index 459715fd628943..dd7919a7fe2e30 100644 --- a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp @@ -31,6 +31,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + class RuntimeState; namespace vectorized { class Block; @@ -138,7 +140,7 @@ Status SchemaPartitionsScanner::get_onedb_info_from_fe(int64_t dbId) { 
} _partitions_block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError("table options schema is not match for FE and BE"); } @@ -178,7 +180,7 @@ Status SchemaPartitionsScanner::get_next_block_internal(vectorized::Block* block if (_db_index < _db_result.db_ids.size()) { RETURN_IF_ERROR(get_onedb_info_from_fe(_db_result.db_ids[_db_index])); _row_idx = 0; // reset row index so that it start filling for next block. - _total_rows = _partitions_block->rows(); + _total_rows = (int)_partitions_block->rows(); _db_index++; } } diff --git a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp index 185ef2ab44237f..92c80262963b03 100644 --- a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp @@ -30,6 +30,7 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" std::vector SchemaProcessListScanner::_s_processlist_columns = { {"CURRENT_CONNECTED", TYPE_VARCHAR, sizeof(StringRef), false}, @@ -126,7 +127,7 @@ Status SchemaProcessListScanner::_fill_block_impl(vectorized::Block* block) { datas[row_idx] = &int_vals[row_idx]; } else if (_s_processlist_columns[col_idx].type == TYPE_DATETIMEV2) { auto* dv = reinterpret_cast*>(&int_vals[row_idx]); - if (!dv->from_date_str(column_value.data(), column_value.size(), -1, + if (!dv->from_date_str(column_value.data(), (int)column_value.size(), -1, config::allow_zero_date)) { return Status::InternalError( "process list meet invalid data, column={}, data={}, reason={}", diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.cpp b/be/src/exec/schema_scanner/schema_routine_scanner.cpp index 8660d75e8a1faf..7f16c0cddba460 100644 --- 
a/be/src/exec/schema_scanner/schema_routine_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_routine_scanner.cpp @@ -26,6 +26,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaRoutinesScanner::_s_tbls_columns = { {"SPECIFIC_NAME", TYPE_VARCHAR, sizeof(StringRef), true}, {"ROUTINE_CATALOG", TYPE_VARCHAR, sizeof(StringRef), true}, @@ -94,7 +96,7 @@ Status SchemaRoutinesScanner::get_block_from_fe() { } _routines_block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError("routine table schema is not match for FE and BE"); } @@ -121,7 +123,7 @@ Status SchemaRoutinesScanner::get_next_block_internal(vectorized::Block* block, if (_routines_block == nullptr) { RETURN_IF_ERROR(get_block_from_fe()); - _total_rows = _routines_block->rows(); + _total_rows = (int)_routines_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp index 3aa0e944a822c5..aea98bd61ac89a 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp @@ -48,6 +48,8 @@ namespace vectorized { class Block; } // namespace vectorized +#include "common/compile_check_begin.h" + std::vector SchemaRowsetsScanner::_s_tbls_columns = { // name, type, size, is_null {"BACKEND_ID", TYPE_BIGINT, sizeof(int64_t), true}, @@ -132,13 +134,13 @@ Status SchemaRowsetsScanner::get_next_block_internal(vectorized::Block* block, b Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { SCOPED_TIMER(_fill_block_timer); size_t fill_rowsets_num = std::min(1000UL, rowsets_.size() - _rowsets_idx); - auto fill_idx_begin = _rowsets_idx; - auto fill_idx_end = _rowsets_idx + 
fill_rowsets_num; + size_t fill_idx_begin = _rowsets_idx; + size_t fill_idx_end = _rowsets_idx + fill_rowsets_num; std::vector datas(fill_rowsets_num); // BACKEND_ID { int64_t src = backend_id_; - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { datas[i - fill_idx_begin] = &src; } RETURN_IF_ERROR(fill_dest_column_for_range(block, 0, datas)); @@ -147,7 +149,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { { std::vector rowset_ids(fill_rowsets_num); std::vector strs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; rowset_ids[i - fill_idx_begin] = rowset->rowset_id().to_string(); strs[i - fill_idx_begin] = StringRef(rowset_ids[i - fill_idx_begin].c_str(), @@ -159,7 +161,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // TABLET_ID { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->rowset_meta()->tablet_id(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -169,7 +171,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // ROWSET_NUM_ROWS { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->num_rows(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -179,7 +181,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // TXN_ID { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = 
rowsets_[i]; srcs[i - fill_idx_begin] = rowset->txn_id(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -189,7 +191,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // NUM_SEGMENTS { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->num_segments(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -199,7 +201,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // START_VERSION { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->start_version(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -209,7 +211,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // END_VERSION { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->end_version(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -219,7 +221,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // INDEX_DISK_SIZE { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->index_disk_size(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -229,7 +231,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // DATA_DISK_SIZE { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for 
(size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->data_disk_size(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; @@ -239,7 +241,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // CREATION_TIME { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; int64_t creation_time = rowset->creation_time(); srcs[i - fill_idx_begin].from_unixtime(creation_time, TimezoneUtils::default_time_zone); @@ -250,7 +252,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // NEWEST_WRITE_TIMESTAMP { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; int64_t newest_write_timestamp = rowset->newest_write_timestamp(); srcs[i - fill_idx_begin].from_unixtime(newest_write_timestamp, @@ -262,7 +264,7 @@ Status SchemaRowsetsScanner::_fill_block_impl(vectorized::Block* block) { // SCHEMA_VERSION { std::vector srcs(fill_rowsets_num); - for (int i = fill_idx_begin; i < fill_idx_end; ++i) { + for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; srcs[i - fill_idx_begin] = rowset->tablet_schema()->schema_version(); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; diff --git a/be/src/exec/schema_scanner/schema_table_options_scanner.cpp b/be/src/exec/schema_scanner/schema_table_options_scanner.cpp index bb778996a83f04..fd9d17c8b93cf2 100644 --- a/be/src/exec/schema_scanner/schema_table_options_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_table_options_scanner.cpp @@ -27,6 +27,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector 
SchemaTableOptionsScanner::_s_tbls_columns = { {"TABLE_CATALOG", TYPE_VARCHAR, sizeof(StringRef), true}, {"TABLE_SCHEMA", TYPE_VARCHAR, sizeof(StringRef), true}, @@ -110,7 +112,7 @@ Status SchemaTableOptionsScanner::get_onedb_info_from_fe(int64_t dbId) { } _tableoptions_block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError("table options schema is not match for FE and BE"); } @@ -150,7 +152,7 @@ Status SchemaTableOptionsScanner::get_next_block_internal(vectorized::Block* blo if (_db_index < _db_result.db_ids.size()) { RETURN_IF_ERROR(get_onedb_info_from_fe(_db_result.db_ids[_db_index])); _row_idx = 0; // reset row index so that it start filling for next block. - _total_rows = _tableoptions_block->rows(); + _total_rows = (int)_tableoptions_block->rows(); _db_index++; } } diff --git a/be/src/exec/schema_scanner/schema_table_properties_scanner.cpp b/be/src/exec/schema_scanner/schema_table_properties_scanner.cpp index 8d6a26a552f707..682560372b97c7 100644 --- a/be/src/exec/schema_scanner/schema_table_properties_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_table_properties_scanner.cpp @@ -27,6 +27,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaTablePropertiesScanner::_s_tbls_columns = { {"TABLE_CATALOG", TYPE_VARCHAR, sizeof(StringRef), true}, {"TABLE_SCHEMA", TYPE_VARCHAR, sizeof(StringRef), true}, @@ -108,7 +110,7 @@ Status SchemaTablePropertiesScanner::get_onedb_info_from_fe(int64_t dbId) { } _tableproperties_block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError("table options schema is not match for FE 
and BE"); } @@ -148,7 +150,7 @@ Status SchemaTablePropertiesScanner::get_next_block_internal(vectorized::Block* if (_db_index < _db_result.db_ids.size()) { RETURN_IF_ERROR(get_onedb_info_from_fe(_db_result.db_ids[_db_index])); _row_idx = 0; // reset row index so that it start filling for next block. - _total_rows = _tableproperties_block->rows(); + _total_rows = (int)_tableproperties_block->rows(); _db_index++; } } diff --git a/be/src/exec/schema_scanner/schema_views_scanner.cpp b/be/src/exec/schema_scanner/schema_views_scanner.cpp index f47766ef3567ad..6ba7bf04c8b990 100644 --- a/be/src/exec/schema_scanner/schema_views_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_views_scanner.cpp @@ -140,7 +140,14 @@ Status SchemaViewsScanner::_fill_block_impl(vectorized::Block* block) { std::vector datas(tables_num); // catalog - { RETURN_IF_ERROR(fill_dest_column_for_range(block, 0, null_datas)); } + { + std::string catalog_name = _db_result.catalogs[_db_index - 1]; + StringRef str = StringRef(catalog_name.c_str(), catalog_name.size()); + for (int i = 0; i < tables_num; ++i) { + datas[i] = &str; + } + RETURN_IF_ERROR(fill_dest_column_for_range(block, 0, datas)); + } // schema { std::string db_name = SchemaHelper::extract_db_name(_db_result.dbs[_db_index - 1]); diff --git a/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp b/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp index a91a28322ecd76..bdf306ef7d94ad 100644 --- a/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp +++ b/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp @@ -26,6 +26,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaWorkloadGroupPrivilegesScanner::_s_tbls_columns = { {"GRANTEE", TYPE_VARCHAR, sizeof(StringRef), true}, {"WORKLOAD_GROUP_NAME", TYPE_VARCHAR, sizeof(StringRef), true}, @@ -83,7 +85,7 @@ Status 
SchemaWorkloadGroupPrivilegesScanner::_get_workload_group_privs_block_fro } if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError( "workload group privileges schema is not match for FE and BE"); @@ -116,7 +118,7 @@ Status SchemaWorkloadGroupPrivilegesScanner::get_next_block_internal(vectorized: if (_workload_groups_privs_block == nullptr) { RETURN_IF_ERROR(_get_workload_group_privs_block_from_fe()); - _total_rows = _workload_groups_privs_block->rows(); + _total_rows = (int)_workload_groups_privs_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.cpp index ca339044e98a5f..805bf12cc38ae6 100644 --- a/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.cpp @@ -28,6 +28,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaBackendWorkloadGroupResourceUsage::_s_tbls_columns = { // name, type, size {"BE_ID", TYPE_BIGINT, sizeof(int64_t), false}, @@ -70,7 +72,7 @@ Status SchemaBackendWorkloadGroupResourceUsage::get_next_block_internal(vectoriz } ExecEnv::GetInstance()->workload_group_mgr()->get_wg_resource_usage(_block.get()); - _total_rows = _block->rows(); + _total_rows = (int)_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp index 481360eee90557..bc5fb61669c525 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp @@ -26,6 +26,8 @@ #include 
"vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaWorkloadGroupsScanner::_s_tbls_columns = { {"ID", TYPE_BIGINT, sizeof(int64_t), true}, {"NAME", TYPE_VARCHAR, sizeof(StringRef), true}, @@ -98,7 +100,7 @@ Status SchemaWorkloadGroupsScanner::_get_workload_groups_block_from_fe() { _workload_groups_block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError( "workload groups schema is not match for FE and BE"); @@ -127,7 +129,7 @@ Status SchemaWorkloadGroupsScanner::get_next_block_internal(vectorized::Block* b if (_workload_groups_block == nullptr) { RETURN_IF_ERROR(_get_workload_groups_block_from_fe()); - _total_rows = _workload_groups_block->rows(); + _total_rows = (int)_workload_groups_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp index 5c6a6f70a88a86..fa1c671f5eeea0 100644 --- a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp @@ -26,6 +26,8 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" + std::vector SchemaWorkloadSchedulePolicyScanner::_s_tbls_columns = { {"ID", TYPE_BIGINT, sizeof(int64_t), true}, {"NAME", TYPE_VARCHAR, sizeof(StringRef), true}, @@ -89,7 +91,7 @@ Status SchemaWorkloadSchedulePolicyScanner::_get_workload_schedule_policy_block_ _block->reserve(_block_rows_limit); if (result_data.size() > 0) { - int col_size = result_data[0].column_value.size(); + auto col_size = result_data[0].column_value.size(); if (col_size != _s_tbls_columns.size()) { return Status::InternalError( "workload policy schema is 
not match for FE and BE"); @@ -118,7 +120,7 @@ Status SchemaWorkloadSchedulePolicyScanner::get_next_block_internal(vectorized:: if (_block == nullptr) { RETURN_IF_ERROR(_get_workload_schedule_policy_block_from_fe()); - _total_rows = _block->rows(); + _total_rows = (int)_block->rows(); } if (_row_idx == _total_rows) { diff --git a/be/src/exec/table_connector.cpp b/be/src/exec/table_connector.cpp index fa5181f5fecb2d..549fa6aae90fd8 100644 --- a/be/src/exec/table_connector.cpp +++ b/be/src/exec/table_connector.cpp @@ -118,16 +118,17 @@ Status TableConnector::convert_column_data(const vectorized::ColumnPtr& column_p fmt::format_to(_insert_stmt_buffer, "\"{}\"", str); } }; - const vectorized::IColumn* column = column_ptr; + const vectorized::IColumn* column = column_ptr.get(); if (type_ptr->is_nullable()) { - auto nullable_column = assert_cast(column_ptr.get()); + const auto* nullable_column = + assert_cast(column_ptr.get()); if (nullable_column->is_null_at(row)) { fmt::format_to(_insert_stmt_buffer, "{}", "NULL"); return Status::OK(); } column = nullable_column->get_nested_column_ptr().get(); } else { - column = column_ptr; + column = column_ptr.get(); } auto [item, size] = column->get_data_at(row); switch (type.type) { diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 8f297d7074ff12..d1567a8fa79cb4 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -498,12 +498,12 @@ class RuntimePredicateWrapper { switch (_filter_type) { case RuntimeFilterType::IN_FILTER: { if (!_context->hybrid_set) { - _context->ignored = true; + set_ignored(); return Status::OK(); } _context->hybrid_set->insert(wrapper->_context->hybrid_set.get()); if (_max_in_num >= 0 && _context->hybrid_set->size() >= _max_in_num) { - _context->ignored = true; + set_ignored(); // release in filter _context->hybrid_set.reset(); } @@ -1337,7 +1337,7 @@ void IRuntimeFilter::set_synced_size(uint64_t global_size) { } void 
IRuntimeFilter::set_ignored() { - _wrapper->_context->ignored = true; + _wrapper->set_ignored(); } bool IRuntimeFilter::get_ignored() { diff --git a/be/src/gutil/strings/escaping.cc b/be/src/gutil/strings/escaping.cc index 2ff59104f6d5ce..c6ba8e2f9c375e 100644 --- a/be/src/gutil/strings/escaping.cc +++ b/be/src/gutil/strings/escaping.cc @@ -10,6 +10,8 @@ #include #include +#include "common/exception.h" + using std::numeric_limits; #include @@ -1084,7 +1086,8 @@ int Base64UnescapeInternal(const char* src, int szsrc, char* dest, int szdest, default: // state should have no other values at this point. - LOG(FATAL) << "This can't happen; base64 decoder state = " << state; + throw doris::Exception( + doris::Status::FatalError("This can't happen; base64 decoder state = {}", state)); } // The remainder of the string should be all whitespace, mixed with diff --git a/be/src/gutil/strings/numbers.cc b/be/src/gutil/strings/numbers.cc index f471bf31bd08bb..f044ea08d31551 100644 --- a/be/src/gutil/strings/numbers.cc +++ b/be/src/gutil/strings/numbers.cc @@ -19,6 +19,8 @@ #include #include +#include "common/exception.h" + using std::numeric_limits; #include @@ -772,8 +774,8 @@ uint64 atoi_kmgt(const char* s) { scale = GG_ULONGLONG(1) << 40; break; default: - LOG(FATAL) << "Invalid mnemonic: `" << c << "';" - << " should be one of `K', `M', `G', and `T'."; + throw doris::Exception(doris::Status::FatalError( + "Invalid mnemonic: `{}'; should be one of `K', `M', `G', and `T'.", c)); } } return n * scale; diff --git a/be/src/gutil/strings/util.cc b/be/src/gutil/strings/util.cc index 80d5d463430c77..37c09d63b24fff 100644 --- a/be/src/gutil/strings/util.cc +++ b/be/src/gutil/strings/util.cc @@ -19,6 +19,8 @@ #include #include +#include "common/exception.h" + using std::copy; using std::max; using std::min; @@ -489,8 +491,7 @@ const char* strstr_delimited(const char* haystack, const char* needle, char deli ++haystack; } } - LOG(FATAL) << "Unreachable statement"; - return nullptr; + 
throw doris::Exception(doris::Status::FatalError("Unreachable statement")); } // ---------------------------------------------------------------------- diff --git a/be/src/gutil/threading/thread_collision_warner.cc b/be/src/gutil/threading/thread_collision_warner.cc index d2f1e47f8e02d9..fd51a9195d629e 100644 --- a/be/src/gutil/threading/thread_collision_warner.cc +++ b/be/src/gutil/threading/thread_collision_warner.cc @@ -4,6 +4,9 @@ #include "gutil/threading/thread_collision_warner.h" +#include "common/exception.h" +#include "common/status.h" + #ifdef __linux__ #include #else @@ -19,8 +22,9 @@ namespace base { void DCheckAsserter::warn(int64_t previous_thread_id, int64_t current_thread_id) { - LOG(FATAL) << "Thread Collision! Previous thread id: " << previous_thread_id - << ", current thread id: " << current_thread_id; + throw doris::Exception(doris::Status::FatalError( + "Thread Collision! Previous thread id: {}, current thread id: {}", previous_thread_id, + current_thread_id)); } static subtle::Atomic64 CurrentThread() { diff --git a/be/src/http/action/download_binlog_action.cpp b/be/src/http/action/download_binlog_action.cpp index 372f840401c4ad..4bb8b8b70dd722 100644 --- a/be/src/http/action/download_binlog_action.cpp +++ b/be/src/http/action/download_binlog_action.cpp @@ -144,8 +144,19 @@ void handle_get_segment_index_file(StorageEngine& engine, HttpRequest* req, const auto& rowset_id = get_http_param(req, kRowsetIdParameter); const auto& segment_index = get_http_param(req, kSegmentIndexParameter); const auto& segment_index_id = req->param(kSegmentIndexIdParameter); - segment_index_file_path = - tablet->get_segment_index_filepath(rowset_id, segment_index, segment_index_id); + auto segment_file_path = tablet->get_segment_filepath(rowset_id, segment_index); + if (tablet->tablet_schema()->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { + // now CCR not support for variant + index v1 + constexpr std::string_view index_suffix = ""; + 
segment_index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path), + std::stoll(segment_index_id), index_suffix); + } else { + DCHECK(segment_index_id == "-1"); + segment_index_file_path = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path)); + } is_acquire_md5 = !req->param(kAcquireMD5Parameter).empty(); } catch (const std::exception& e) { HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, e.what()); diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 7e71f3eb910053..e8db5cb542fb4b 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -145,7 +145,8 @@ void StreamLoadAction::handle(HttpRequest* req) { << ctx->commit_and_publish_txn_cost_nanos / 1000000 << ", number_total_rows=" << ctx->number_total_rows << ", number_loaded_rows=" << ctx->number_loaded_rows - << ", receive_bytes=" << ctx->receive_bytes << ", loaded_bytes=" << ctx->loaded_bytes; + << ", receive_bytes=" << ctx->receive_bytes << ", loaded_bytes=" << ctx->loaded_bytes + << ", error_url=" << ctx->error_url; // update statistics streaming_load_requests_total->increment(1); diff --git a/be/src/http/http_channel.cpp b/be/src/http/http_channel.cpp index 312f1ab9286909..598330ff7cbcfb 100644 --- a/be/src/http/http_channel.cpp +++ b/be/src/http/http_channel.cpp @@ -123,7 +123,8 @@ void HttpChannel::send_files(HttpRequest* request, const std::string& root_dir, VLOG_DEBUG << "http channel send file " << file_path << ", size: " << file_size; evbuffer_add_printf(evb.get(), "File-Name: %s\r\n", file.c_str()); - evbuffer_add_printf(evb.get(), "Content-Length: %ld\r\n", file_size); + evbuffer_add_printf(evb.get(), "Content-Length: %" PRIi64 "\r\n", file_size); + evbuffer_add_printf(evb.get(), "\r\n"); if (file_size > 0) { evbuffer_add_file(evb.get(), fd, 0, file_size); diff --git 
a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 2a59a5158e46c2..b5f48d09648bc6 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -40,6 +40,8 @@ #include "io/cache/file_cache_common.h" #include "io/cache/fs_file_cache_storage.h" #include "io/cache/mem_file_cache_storage.h" +#include "util/runtime_profile.h" +#include "util/stopwatch.hpp" #include "util/time.h" #include "vec/common/sip_hash.h" #include "vec/common/uint128.h" @@ -770,7 +772,13 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o CacheContext& context) { FileBlock::Range range(offset, offset + size - 1); - SCOPED_CACHE_LOCK(_mutex); + ReadStatistics* stats = context.stats; + DCHECK(stats != nullptr); + MonotonicStopWatch sw; + sw.start(); + std::lock_guard cache_lock(_mutex); + stats->lock_wait_timer += sw.elapsed_time(); + if (auto iter = _key_to_time.find(hash); context.cache_type == FileCacheType::INDEX && iter != _key_to_time.end()) { context.cache_type = FileCacheType::TTL; @@ -778,12 +786,18 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o } /// Get all blocks which intersect with the given range. 
- auto file_blocks = get_impl(hash, context, range, cache_lock); + FileBlocks file_blocks; + { + SCOPED_RAW_TIMER(&stats->get_timer); + file_blocks = get_impl(hash, context, range, cache_lock); + } if (file_blocks.empty()) { + SCOPED_RAW_TIMER(&stats->set_timer); file_blocks = split_range_into_cells(hash, context, offset, size, FileBlock::State::EMPTY, cache_lock); } else { + SCOPED_RAW_TIMER(&stats->set_timer); fill_holes_with_empty_file_blocks(file_blocks, hash, context, range, cache_lock); } DCHECK(!file_blocks.empty()); @@ -996,7 +1010,6 @@ bool BlockFileCache::try_reserve(const UInt128Wrapper& hash, const CacheContext& if (!_async_open_done) { return try_reserve_during_async_load(size, cache_lock); } - // use this strategy in scenarios where there is insufficient disk capacity or insufficient number of inodes remaining // directly eliminate 5 times the size of the space if (_disk_resource_limit_mode) { @@ -1055,6 +1068,7 @@ bool BlockFileCache::try_reserve(const UInt128Wrapper& hash, const CacheContext& if (cell->releasable()) { auto& file_block = cell->file_block; + std::lock_guard block_lock(file_block->_mutex); DCHECK(file_block->_download_state == FileBlock::State::DOWNLOADED); to_evict.push_back(cell); diff --git a/be/src/io/cache/block_file_cache_profile.h b/be/src/io/cache/block_file_cache_profile.h index 19d7f4139f7f15..f9d9df0939f017 100644 --- a/be/src/io/cache/block_file_cache_profile.h +++ b/be/src/io/cache/block_file_cache_profile.h @@ -75,6 +75,7 @@ struct FileCacheProfile { struct FileCacheProfileReporter { RuntimeProfile::Counter* num_local_io_total = nullptr; RuntimeProfile::Counter* num_remote_io_total = nullptr; + RuntimeProfile::Counter* num_inverted_index_remote_io_total = nullptr; RuntimeProfile::Counter* local_io_timer = nullptr; RuntimeProfile::Counter* bytes_scanned_from_cache = nullptr; RuntimeProfile::Counter* bytes_scanned_from_remote = nullptr; @@ -82,6 +83,11 @@ struct FileCacheProfileReporter { RuntimeProfile::Counter* 
write_cache_io_timer = nullptr; RuntimeProfile::Counter* bytes_write_into_cache = nullptr; RuntimeProfile::Counter* num_skip_cache_io_total = nullptr; + RuntimeProfile::Counter* read_cache_file_directly_timer = nullptr; + RuntimeProfile::Counter* cache_get_or_set_timer = nullptr; + RuntimeProfile::Counter* lock_wait_timer = nullptr; + RuntimeProfile::Counter* get_timer = nullptr; + RuntimeProfile::Counter* set_timer = nullptr; FileCacheProfileReporter(RuntimeProfile* profile) { static const char* cache_profile = "FileCache"; @@ -90,6 +96,8 @@ struct FileCacheProfileReporter { cache_profile, 1); num_remote_io_total = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "NumRemoteIOTotal", TUnit::UNIT, cache_profile, 1); + num_inverted_index_remote_io_total = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "NumInvertedIndexRemoteIOTotal", TUnit::UNIT, cache_profile, 1); local_io_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "LocalIOUseTimer", cache_profile, 1); remote_io_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "RemoteIOUseTimer", cache_profile, 1); write_cache_io_timer = @@ -102,11 +110,20 @@ struct FileCacheProfileReporter { TUnit::BYTES, cache_profile, 1); bytes_scanned_from_remote = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "BytesScannedFromRemote", TUnit::BYTES, cache_profile, 1); + read_cache_file_directly_timer = + ADD_CHILD_TIMER_WITH_LEVEL(profile, "ReadCacheFileDirectlyTimer", cache_profile, 1); + cache_get_or_set_timer = + ADD_CHILD_TIMER_WITH_LEVEL(profile, "CacheGetOrSetTimer", cache_profile, 1); + lock_wait_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "LockWaitTimer", cache_profile, 1); + get_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "GetTimer", cache_profile, 1); + set_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "SetTimer", cache_profile, 1); } void update(const FileCacheStatistics* statistics) const { COUNTER_UPDATE(num_local_io_total, statistics->num_local_io_total); COUNTER_UPDATE(num_remote_io_total, statistics->num_remote_io_total); + 
COUNTER_UPDATE(num_inverted_index_remote_io_total, + statistics->num_inverted_index_remote_io_total); COUNTER_UPDATE(local_io_timer, statistics->local_io_timer); COUNTER_UPDATE(remote_io_timer, statistics->remote_io_timer); COUNTER_UPDATE(write_cache_io_timer, statistics->write_cache_io_timer); @@ -114,6 +131,11 @@ struct FileCacheProfileReporter { COUNTER_UPDATE(num_skip_cache_io_total, statistics->num_skip_cache_io_total); COUNTER_UPDATE(bytes_scanned_from_cache, statistics->bytes_read_from_local); COUNTER_UPDATE(bytes_scanned_from_remote, statistics->bytes_read_from_remote); + COUNTER_UPDATE(read_cache_file_directly_timer, statistics->read_cache_file_directly_timer); + COUNTER_UPDATE(cache_get_or_set_timer, statistics->cache_get_or_set_timer); + COUNTER_UPDATE(lock_wait_timer, statistics->lock_wait_timer); + COUNTER_UPDATE(get_timer, statistics->get_timer); + COUNTER_UPDATE(set_timer, statistics->set_timer); } }; diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index c9a273c5d368a6..70765fa707ea87 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -126,7 +126,7 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* ReadStatistics stats; auto defer_func = [&](int*) { if (io_ctx->file_cache_stats) { - _update_state(stats, io_ctx->file_cache_stats); + _update_state(stats, io_ctx->file_cache_stats, io_ctx->is_inverted_index); io::FileCacheProfile::instance().update(io_ctx->file_cache_stats); } }; @@ -134,6 +134,7 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* stats.bytes_read += bytes_req; if (config::enable_read_cache_file_directly) { // read directly + SCOPED_RAW_TIMER(&stats.read_cache_file_directly_timer); size_t need_read_size = bytes_req; std::shared_lock lock(_mtx); if (!_cache_file_readers.empty()) { @@ -174,8 +175,12 @@ Status CachedRemoteFileReader::read_at_impl(size_t 
offset, Slice result, size_t* // read from cache or remote auto [align_left, align_size] = s_align_size(offset, bytes_req, size()); CacheContext cache_context(io_ctx); + cache_context.stats = &stats; + MonotonicStopWatch sw; + sw.start(); FileBlocksHolder holder = _cache->get_or_set(_cache_hash, align_left, align_size, cache_context); + stats.cache_get_or_set_timer += sw.elapsed_time(); std::vector empty_blocks; for (auto& block : holder.file_blocks) { switch (block->state()) { @@ -312,7 +317,8 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* } void CachedRemoteFileReader::_update_state(const ReadStatistics& read_stats, - FileCacheStatistics* statis) const { + FileCacheStatistics* statis, + bool is_inverted_index) const { if (statis == nullptr) { return; } @@ -320,6 +326,9 @@ void CachedRemoteFileReader::_update_state(const ReadStatistics& read_stats, statis->num_local_io_total++; statis->bytes_read_from_local += read_stats.bytes_read; } else { + if (is_inverted_index) { + statis->num_inverted_index_remote_io_total++; + } statis->num_remote_io_total++; statis->bytes_read_from_remote += read_stats.bytes_read; } @@ -329,6 +338,12 @@ void CachedRemoteFileReader::_update_state(const ReadStatistics& read_stats, statis->bytes_write_into_cache += read_stats.bytes_write_into_file_cache; statis->write_cache_io_timer += read_stats.local_write_timer; + statis->read_cache_file_directly_timer += read_stats.read_cache_file_directly_timer; + statis->cache_get_or_set_timer += read_stats.cache_get_or_set_timer; + statis->lock_wait_timer += read_stats.lock_wait_timer; + statis->get_timer += read_stats.get_timer; + statis->set_timer += read_stats.set_timer; + g_skip_cache_num << read_stats.skip_cache; g_skip_cache_sum << read_stats.skip_cache; } diff --git a/be/src/io/cache/cached_remote_file_reader.h b/be/src/io/cache/cached_remote_file_reader.h index b3efb83c0803c8..735e652f94cadc 100644 --- a/be/src/io/cache/cached_remote_file_reader.h +++ 
b/be/src/io/cache/cached_remote_file_reader.h @@ -67,16 +67,8 @@ class CachedRemoteFileReader final : public FileReader { std::shared_mutex _mtx; std::map _cache_file_readers; - struct ReadStatistics { - bool hit_cache = true; - bool skip_cache = false; - int64_t bytes_read = 0; - int64_t bytes_write_into_file_cache = 0; - int64_t remote_read_timer = 0; - int64_t local_read_timer = 0; - int64_t local_write_timer = 0; - }; - void _update_state(const ReadStatistics& stats, FileCacheStatistics* state) const; + void _update_state(const ReadStatistics& stats, FileCacheStatistics* state, + bool is_inverted_index) const; }; } // namespace doris::io diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 19041938a08346..56525425f758d4 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -83,6 +83,8 @@ FileBlocksHolderPtr FileCacheAllocatorBuilder::allocate_cache_holder(size_t offs ctx.cache_type = _expiration_time == 0 ? 
FileCacheType::NORMAL : FileCacheType::TTL; ctx.expiration_time = _expiration_time; ctx.is_cold_data = _is_cold_data; + ReadStatistics stats; + ctx.stats = &stats; auto holder = _cache->get_or_set(_cache_hash, offset, size, ctx); return std::make_unique(std::move(holder)); } diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 0d700d9303191f..25df07b5ddff20 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -50,6 +50,21 @@ struct UInt128Wrapper { bool operator==(const UInt128Wrapper& other) const { return value_ == other.value_; } }; +struct ReadStatistics { + bool hit_cache = true; + bool skip_cache = false; + int64_t bytes_read = 0; + int64_t bytes_write_into_file_cache = 0; + int64_t remote_read_timer = 0; + int64_t local_read_timer = 0; + int64_t local_write_timer = 0; + int64_t read_cache_file_directly_timer = 0; + int64_t cache_get_or_set_timer = 0; + int64_t lock_wait_timer = 0; + int64_t get_timer = 0; + int64_t set_timer = 0; +}; + class BlockFileCache; struct FileBlocksHolder; using FileBlocksHolderPtr = std::unique_ptr; @@ -134,6 +149,7 @@ struct CacheContext { FileCacheType cache_type; int64_t expiration_time {0}; bool is_cold_data {false}; + ReadStatistics* stats; }; } // namespace doris::io diff --git a/be/src/io/file_factory.h b/be/src/io/file_factory.h index 9d9d714812ffe9..afa54e221664c9 100644 --- a/be/src/io/file_factory.h +++ b/be/src/io/file_factory.h @@ -118,10 +118,9 @@ class FileFactory { case TStorageBackendType::HDFS: return TFileType::FILE_HDFS; default: - LOG(FATAL) << "not match type to convert, from type:" << type; + throw Exception(Status::FatalError("not match type to convert, from type:{}", type)); } - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); + throw Exception(Status::FatalError("__builtin_unreachable")); } }; diff --git a/be/src/io/fs/err_utils.cpp b/be/src/io/fs/err_utils.cpp index 6552d454824796..e9bed7f5887dc3 100644 --- 
a/be/src/io/fs/err_utils.cpp +++ b/be/src/io/fs/err_utils.cpp @@ -122,13 +122,13 @@ Status s3fs_error(const Aws::S3::S3Error& err, std::string_view msg) { using namespace Aws::Http; switch (err.GetResponseCode()) { case HttpResponseCode::NOT_FOUND: - return Status::Error("{}: {} {} type={}, request_id={}", msg, - err.GetExceptionName(), err.GetMessage(), + return Status::Error("{}: {} {} code=NOT_FOUND, type={}, request_id={}", + msg, err.GetExceptionName(), err.GetMessage(), err.GetErrorType(), err.GetRequestId()); case HttpResponseCode::FORBIDDEN: - return Status::Error("{}: {} {} type={}, request_id={}", msg, - err.GetExceptionName(), err.GetMessage(), - err.GetErrorType(), err.GetRequestId()); + return Status::Error( + "{}: {} {} code=FORBIDDEN, type={}, request_id={}", msg, err.GetExceptionName(), + err.GetMessage(), err.GetErrorType(), err.GetRequestId()); default: return Status::Error( "{}: {} {} code={} type={}, request_id={}", msg, err.GetExceptionName(), diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 80a594473dc376..be56d0d63abcfd 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -38,6 +38,7 @@ namespace io { struct FileCacheStatistics { int64_t num_local_io_total = 0; int64_t num_remote_io_total = 0; + int64_t num_inverted_index_remote_io_total = 0; int64_t local_io_timer = 0; int64_t bytes_read_from_local = 0; int64_t bytes_read_from_remote = 0; @@ -45,6 +46,11 @@ struct FileCacheStatistics { int64_t write_cache_io_timer = 0; int64_t bytes_write_into_cache = 0; int64_t num_skip_cache_io_total = 0; + int64_t read_cache_file_directly_timer = 0; + int64_t cache_get_or_set_timer = 0; + int64_t lock_wait_timer = 0; + int64_t get_timer = 0; + int64_t set_timer = 0; }; struct IOContext { @@ -60,6 +66,7 @@ struct IOContext { int64_t expiration_time = 0; const TUniqueId* query_id = nullptr; // Ref FileCacheStatistics* file_cache_stats = nullptr; // Ref + bool is_inverted_index = false; }; } // namespace io diff --git 
a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index 82dc122e19f5ef..a4720f89d19be6 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -28,6 +28,7 @@ #include "common/status.h" #include "olap/calc_delete_bitmap_executor.h" #include "olap/delete_bitmap_calculator.h" +#include "olap/iterators.h" #include "olap/memtable.h" #include "olap/partial_update_info.h" #include "olap/primary_key_index.h" @@ -81,7 +82,9 @@ Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t rowset->rowset_id().to_string(), segid)); } segment_v2::SegmentSharedPtr segment = *it; - RETURN_IF_ERROR(segment->new_column_iterator(target_column, column_iterator, nullptr)); + StorageReadOptions opts; + opts.stats = stats; + RETURN_IF_ERROR(segment->new_column_iterator(target_column, column_iterator, &opts)); segment_v2::ColumnIteratorOptions opt { .use_page_cache = !config::disable_storage_page_cache, .file_reader = segment->file_reader().get(), @@ -496,7 +499,7 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest for (auto id : picked_segments) { Status s = segments[id]->lookup_row_key(encoded_key, schema, with_seq_col, with_rowid, - &loc, encoded_seq_value, stats); + &loc, stats, encoded_seq_value); if (s.is()) { continue; } @@ -612,7 +615,7 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, vectorized::Block ordered_block = block.clone_empty(); uint32_t pos = 0; - RETURN_IF_ERROR(seg->load_pk_index_and_bf()); // We need index blocks to iterate + RETURN_IF_ERROR(seg->load_pk_index_and_bf(nullptr)); // We need index blocks to iterate const auto* pk_idx = seg->get_primary_key_index(); int total = pk_idx->num_rows(); uint32_t row_id = 0; @@ -626,7 +629,7 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, std::vector> segment_caches(specified_rowsets.size()); while (remaining > 0) { std::unique_ptr iter; - RETURN_IF_ERROR(pk_idx->new_iterator(&iter)); + 
RETURN_IF_ERROR(pk_idx->new_iterator(&iter, nullptr)); size_t num_to_read = std::min(batch_size, remaining); auto index_type = vectorized::DataTypeFactory::instance().create_data_type( diff --git a/be/src/olap/block_column_predicate.h b/be/src/olap/block_column_predicate.h index eed5e18329acf7..b6ff115c34c72d 100644 --- a/be/src/olap/block_column_predicate.h +++ b/be/src/olap/block_column_predicate.h @@ -74,25 +74,21 @@ class BlockColumnPredicate { } virtual bool can_do_apply_safely(PrimitiveType input_type, bool is_null) const { - LOG(FATAL) << "should not reach here"; - return true; + throw Exception(Status::FatalError("should not reach here")); } virtual bool support_zonemap() const { return true; } virtual bool evaluate_and(const std::pair& statistic) const { - LOG(FATAL) << "should not reach here"; - return true; + throw Exception(Status::FatalError("should not reach here")); } virtual bool evaluate_and(const segment_v2::BloomFilter* bf) const { - LOG(FATAL) << "should not reach here"; - return true; + throw Exception(Status::FatalError("should not reach here")); } virtual bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const { - LOG(FATAL) << "should not reach here"; - return true; + throw Exception(Status::FatalError("should not reach here")); } virtual bool can_do_bloom_filter(bool ngram) const { return false; } diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 7b2bb5d4ee3cfb..aec38699e014a2 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -495,10 +495,35 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { Status Compaction::do_inverted_index_compaction() { const auto& ctx = _output_rs_writer->context(); if (!config::inverted_index_compaction_enable || _input_row_num <= 0 || - !_stats.rowid_conversion || ctx.columns_to_do_index_compaction.empty()) { + ctx.columns_to_do_index_compaction.empty()) { return Status::OK(); } + auto error_handler = [this](int64_t index_id, 
int64_t column_uniq_id) { + LOG(WARNING) << "failed to do index compaction" + << ". tablet=" << _tablet->tablet_id() << ". column uniq id=" << column_uniq_id + << ". index_id=" << index_id; + for (auto& rowset : _input_rowsets) { + rowset->set_skip_index_compaction(column_uniq_id); + LOG(INFO) << "mark skipping inverted index compaction next time" + << ". tablet=" << _tablet->tablet_id() << ", rowset=" << rowset->rowset_id() + << ", column uniq id=" << column_uniq_id << ", index_id=" << index_id; + } + }; + + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_rowid_conversion_null", + { _stats.rowid_conversion = nullptr; }) + if (!_stats.rowid_conversion) { + LOG(WARNING) << "failed to do index compaction, rowid conversion is null" + << ". tablet=" << _tablet->tablet_id() + << ", input row number=" << _input_row_num; + mark_skip_index_compaction(ctx, error_handler); + + return Status::Error( + "failed to do index compaction, rowid conversion is null. tablet={}", + _tablet->tablet_id()); + } + OlapStopWatch inverted_watch; // translation vec @@ -521,8 +546,7 @@ Status Compaction::do_inverted_index_compaction() { auto src_segment_num = src_seg_to_id_map.size(); auto dest_segment_num = dest_segment_num_rows.size(); - DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_dest_segment_num_is_zero", - { dest_segment_num = 0; }) + // when all the input rowsets are deleted, the output rowset will be empty and dest_segment_num will be 0. if (dest_segment_num <= 0) { LOG(INFO) << "skip doing index compaction due to no output segments" << ". 
tablet=" << _tablet->tablet_id() << ", input row number=" << _input_row_num @@ -600,27 +624,62 @@ Status Compaction::do_inverted_index_compaction() { DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_find_rowset_error", { find_it = rs_id_to_rowset_map.end(); }) if (find_it == rs_id_to_rowset_map.end()) [[unlikely]] { - // DCHECK(false) << _tablet->tablet_id() << ' ' << rowset_id; - return Status::InternalError("cannot find rowset. tablet_id={} rowset_id={}", - _tablet->tablet_id(), rowset_id.to_string()); + LOG(WARNING) << "failed to do index compaction, cannot find rowset. tablet_id=" + << _tablet->tablet_id() << " rowset_id=" << rowset_id.to_string(); + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "failed to do index compaction, cannot find rowset. tablet_id={} rowset_id={}", + _tablet->tablet_id(), rowset_id.to_string()); } auto* rowset = find_it->second; auto fs = rowset->rowset_meta()->fs(); DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_get_fs_error", { fs = nullptr; }) if (!fs) { - return Status::InternalError("get fs failed, resource_id={}", - rowset->rowset_meta()->resource_id()); + LOG(WARNING) << "failed to do index compaction, get fs failed. resource_id=" + << rowset->rowset_meta()->resource_id(); + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "get fs failed, resource_id={}", rowset->rowset_meta()->resource_id()); } - auto seg_path = DORIS_TRY(rowset->segment_path(seg_id)); + auto seg_path = rowset->segment_path(seg_id); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_seg_path_nullptr", { + seg_path = ResultError(Status::Error( + "do_inverted_index_compaction_seg_path_nullptr")); + }) + if (!seg_path.has_value()) { + LOG(WARNING) << "failed to do index compaction, get segment path failed. 
tablet_id=" + << _tablet->tablet_id() << " rowset_id=" << rowset_id.to_string() + << " seg_id=" << seg_id; + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "get segment path failed. tablet_id={} rowset_id={} seg_id={}", + _tablet->tablet_id(), rowset_id.to_string(), seg_id); + } auto inverted_index_file_reader = std::make_unique( - fs, std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path)}, + fs, + std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path.value())}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(seg_id)); - RETURN_NOT_OK_STATUS_WITH_WARN( - inverted_index_file_reader->init(config::inverted_index_read_buffer_size), - "inverted_index_file_reader init faiqled"); + auto st = inverted_index_file_reader->init(config::inverted_index_read_buffer_size); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_init_inverted_index_file_reader", + { + st = Status::Error( + "debug point: " + "Compaction::do_inverted_index_compaction_init_inverted_index_" + "file_reader error"); + }) + if (!st.ok()) { + LOG(WARNING) << "failed to do index compaction, init inverted index file reader " + "failed. tablet_id=" + << _tablet->tablet_id() << " rowset_id=" << rowset_id.to_string() + << " seg_id=" << seg_id; + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "init inverted index file reader failed. 
tablet_id={} rowset_id={} seg_id={}", + _tablet->tablet_id(), rowset_id.to_string(), seg_id); + } inverted_index_file_readers[m.second] = std::move(inverted_index_file_reader); } @@ -628,7 +687,20 @@ Status Compaction::do_inverted_index_compaction() { // format: rowsetId_segmentId auto& inverted_index_file_writers = dynamic_cast(_output_rs_writer.get()) ->inverted_index_file_writers(); - DCHECK_EQ(inverted_index_file_writers.size(), dest_segment_num); + DBUG_EXECUTE_IF( + "Compaction::do_inverted_index_compaction_inverted_index_file_writers_size_error", + { inverted_index_file_writers.clear(); }) + if (inverted_index_file_writers.size() != dest_segment_num) { + LOG(WARNING) << "failed to do index compaction, dest segment num not match. tablet_id=" + << _tablet->tablet_id() << " dest_segment_num=" << dest_segment_num + << " inverted_index_file_writers.size()=" + << inverted_index_file_writers.size(); + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "dest segment num not match. tablet_id={} dest_segment_num={} " + "inverted_index_file_writers.size()={}", + _tablet->tablet_id(), dest_segment_num, inverted_index_file_writers.size()); + } // use tmp file dir to store index files auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); @@ -637,18 +709,6 @@ Status Compaction::do_inverted_index_compaction() { << ". tablet=" << _tablet->tablet_id() << ", source index size=" << src_segment_num << ", destination index size=" << dest_segment_num << "."; - auto error_handler = [this](int64_t index_id, int64_t column_uniq_id) { - LOG(WARNING) << "failed to do index compaction" - << ". tablet=" << _tablet->tablet_id() << ". column uniq id=" << column_uniq_id - << ". index_id=" << index_id; - for (auto& rowset : _input_rowsets) { - rowset->set_skip_index_compaction(column_uniq_id); - LOG(INFO) << "mark skipping inverted index compaction next time" - << ". 
tablet=" << _tablet->tablet_id() << ", rowset=" << rowset->rowset_id() - << ", column uniq id=" << column_uniq_id << ", index_id=" << index_id; - } - }; - Status status = Status::OK(); for (auto&& column_uniq_id : ctx.columns_to_do_index_compaction) { auto col = _cur_tablet_schema->column_by_uid(column_uniq_id); @@ -658,6 +718,10 @@ Status Compaction::do_inverted_index_compaction() { if (index_meta == nullptr) { status = Status::Error( fmt::format("Can not find index_meta for col {}", col.name())); + LOG(WARNING) << "failed to do index compaction, can not find index_meta for column" + << ". tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id; + error_handler(-1, column_uniq_id); break; } @@ -671,6 +735,11 @@ Status Compaction::do_inverted_index_compaction() { "debug point: Compaction::open_index_file_reader error")); }) if (!res.has_value()) { + LOG(WARNING) << "failed to do index compaction, open inverted index file " + "reader failed" + << ". tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id + << ", src_segment_id=" << src_segment_id; throw Exception(ErrorCode::INVERTED_INDEX_COMPACTION_ERROR, res.error().msg()); } src_idx_dirs[src_segment_id] = std::move(res.value()); @@ -682,6 +751,11 @@ Status Compaction::do_inverted_index_compaction() { "debug point: Compaction::open_inverted_index_file_writer error")); }) if (!res.has_value()) { + LOG(WARNING) << "failed to do index compaction, open inverted index file " + "writer failed" + << ". 
tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id + << ", dest_segment_id=" << dest_segment_id; throw Exception(ErrorCode::INVERTED_INDEX_COMPACTION_ERROR, res.error().msg()); } // Destination directories in dest_index_dirs do not need to be deconstructed, @@ -714,6 +788,23 @@ Status Compaction::do_inverted_index_compaction() { return Status::OK(); } +void Compaction::mark_skip_index_compaction( + const RowsetWriterContext& context, + const std::function& error_handler) { + for (auto&& column_uniq_id : context.columns_to_do_index_compaction) { + auto col = _cur_tablet_schema->column_by_uid(column_uniq_id); + const auto* index_meta = _cur_tablet_schema->inverted_index(col); + if (index_meta == nullptr) { + LOG(WARNING) << "mark skip index compaction, can not find index_meta for column" + << ". tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id; + error_handler(-1, column_uniq_id); + continue; + } + error_handler(index_meta->index_id(), column_uniq_id); + } +} + void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { for (const auto& index : _cur_tablet_schema->inverted_indexes()) { auto col_unique_ids = index->col_unique_ids(); @@ -789,7 +880,8 @@ void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { // TODO: inverted_index_path auto seg_path = rowset->segment_path(i); DBUG_EXECUTE_IF("Compaction::construct_skip_inverted_index_seg_path_nullptr", { - seg_path = ResultError(Status::Error("error")); + seg_path = ResultError(Status::Error( + "construct_skip_inverted_index_seg_path_nullptr")); }) if (!seg_path) { LOG(WARNING) << seg_path.error(); @@ -800,8 +892,8 @@ void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { try { auto inverted_index_file_reader = std::make_unique( fs, - std::string { - InvertedIndexDescriptor::get_index_file_path_prefix(*seg_path)}, + std::string {InvertedIndexDescriptor::get_index_file_path_prefix( + 
seg_path.value())}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(i)); auto st = inverted_index_file_reader->init( @@ -1013,8 +1105,31 @@ Status CompactionMixin::modify_rowsets() { if (!_tablet->tablet_meta()->tablet_schema()->cluster_key_uids().empty()) { merged_missed_rows_size += _stats.filtered_rows; } + + // Suppose a heavy schema change process on BE converting tablet A to tablet B. + // 1. during schema change double write, new loads write [X-Y] on tablet B. + // 2. rowsets with version [a],[a+1],...,[b-1],[b] on tablet B are picked for cumu compaction(X<=aget_header_lock()); + need_to_check_missed_rows = + std::all_of(_input_rowsets.begin(), _input_rowsets.end(), + [&](const RowsetSharedPtr& rowset) { + return tablet()->rowset_exists_unlocked(rowset); + }); + } + if (_tablet->tablet_state() == TABLET_RUNNING && - merged_missed_rows_size != missed_rows_size) { + merged_missed_rows_size != missed_rows_size && need_to_check_missed_rows) { std::stringstream ss; ss << "cumulative compaction: the merged rows(" << _stats.merged_rows << "), filtered rows(" << _stats.filtered_rows diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index ccabf7dadb4733..057f4084b068b3 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -70,6 +70,10 @@ class Compaction { // merge inverted index files Status do_inverted_index_compaction(); + // mark all columns in columns_to_do_index_compaction to skip index compaction next time. 
+ void mark_skip_index_compaction(const RowsetWriterContext& context, + const std::function& error_handler); + void construct_index_compaction_columns(RowsetWriterContext& ctx); virtual Status construct_output_rowset_writer(RowsetWriterContext& ctx) = 0; diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index 2dfd30fb86ed9a..a9509a005763f6 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -100,6 +100,20 @@ Status CumulativeCompaction::prepare_compact() { } Status CumulativeCompaction::execute_compact() { + DBUG_EXECUTE_IF("CumulativeCompaction::execute_compact.block", { + auto target_tablet_id = dp->param("tablet_id", -1); + if (target_tablet_id == _tablet->tablet_id()) { + LOG(INFO) << "start debug block " + << "CumulativeCompaction::execute_compact.block"; + while (DebugPoints::instance()->is_enable( + "CumulativeCompaction::execute_compact.block")) { + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } + LOG(INFO) << "end debug block " + << "CumulativeCompaction::execute_compact.block"; + } + }) + std::unique_lock lock(tablet()->get_cumulative_compaction_lock(), std::try_to_lock); if (!lock.owns_lock()) { return Status::Error( diff --git a/be/src/olap/cumulative_compaction_policy.cpp b/be/src/olap/cumulative_compaction_policy.cpp index ee7a2b1812a0ae..c812a12b656580 100644 --- a/be/src/olap/cumulative_compaction_policy.cpp +++ b/be/src/olap/cumulative_compaction_policy.cpp @@ -28,6 +28,7 @@ #include "olap/olap_common.h" #include "olap/tablet.h" #include "olap/tablet_meta.h" +#include "util/debug_points.h" namespace doris { @@ -246,6 +247,21 @@ int SizeBasedCumulativeCompactionPolicy::pick_input_rowsets( const int64_t max_compaction_score, const int64_t min_compaction_score, std::vector* input_rowsets, Version* last_delete_version, size_t* compaction_score, bool allow_delete) { + 
DBUG_EXECUTE_IF("SizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", { + auto target_tablet_id = dp->param("tablet_id", -1); + if (target_tablet_id == tablet->tablet_id()) { + auto start_version = dp->param("start_version", -1); + auto end_version = dp->param("end_version", -1); + for (auto& rowset : candidate_rowsets) { + if (rowset->start_version() >= start_version && + rowset->end_version() <= end_version) { + input_rowsets->push_back(rowset); + } + } + } + return input_rowsets->size(); + }) + size_t promotion_size = tablet->cumulative_promotion_size(); auto max_version = tablet->max_version().first; int transient_size = 0; diff --git a/be/src/olap/cumulative_compaction_time_series_policy.cpp b/be/src/olap/cumulative_compaction_time_series_policy.cpp index 6fa4b8d014313f..64e51c77641311 100644 --- a/be/src/olap/cumulative_compaction_time_series_policy.cpp +++ b/be/src/olap/cumulative_compaction_time_series_policy.cpp @@ -27,11 +27,14 @@ namespace doris { uint32_t TimeSeriesCumulativeCompactionPolicy::calc_cumulative_compaction_score(Tablet* tablet) { uint32_t score = 0; + uint32_t level0_score = 0; bool base_rowset_exist = false; const int64_t point = tablet->cumulative_layer_point(); + int64_t level0_total_size = 0; RowsetMetaSharedPtr first_meta; int64_t first_version = INT64_MAX; + std::list checked_rs_metas; // NOTE: tablet._meta_lock is hold auto& rs_metas = tablet->tablet_meta()->all_rs_metas(); // check the base rowset and collect the rowsets of cumulative part @@ -50,6 +53,12 @@ uint32_t TimeSeriesCumulativeCompactionPolicy::calc_cumulative_compaction_score( } else { // collect the rowsets of cumulative part score += rs_meta->get_compaction_score(); + if (rs_meta->compaction_level() == 0) { + level0_total_size += rs_meta->total_disk_size(); + level0_score += rs_meta->get_compaction_score(); + } else { + checked_rs_metas.push_back(rs_meta); + } } } @@ -64,7 +73,64 @@ uint32_t 
TimeSeriesCumulativeCompactionPolicy::calc_cumulative_compaction_score( return 0; } - return score; + // Condition 1: the size of input files for compaction meets the requirement of parameter compaction_goal_size + int64_t compaction_goal_size_mbytes = + tablet->tablet_meta()->time_series_compaction_goal_size_mbytes(); + if (level0_total_size >= compaction_goal_size_mbytes * 1024 * 1024) { + return score; + } + + // Condition 2: the number of input files reaches the threshold specified by parameter compaction_file_count_threshold + if (level0_score >= tablet->tablet_meta()->time_series_compaction_file_count_threshold()) { + return score; + } + + // Condition 3: level1 achieve compaction_goal_size + if (tablet->tablet_meta()->time_series_compaction_level_threshold() >= 2) { + checked_rs_metas.sort([](const RowsetMetaSharedPtr& a, const RowsetMetaSharedPtr& b) { + return a->version().first < b->version().first; + }); + int32_t rs_meta_count = 0; + int64_t continuous_size = 0; + for (const auto& rs_meta : checked_rs_metas) { + rs_meta_count++; + continuous_size += rs_meta->total_disk_size(); + if (rs_meta_count >= 2) { + if (continuous_size >= compaction_goal_size_mbytes * 1024 * 1024) { + return score; + } + } + } + } + + int64_t now = UnixMillis(); + int64_t last_cumu = tablet->last_cumu_compaction_success_time(); + if (last_cumu != 0) { + int64_t cumu_interval = now - last_cumu; + + // Condition 4: the time interval between compactions exceeds the value specified by parameter _compaction_time_threshold_second + if (cumu_interval > + (tablet->tablet_meta()->time_series_compaction_time_threshold_seconds() * 1000)) { + return score; + } + } else if (score > 0) { + // If the compaction process has not been successfully executed, + // the condition for triggering compaction based on the last successful compaction time (condition 3) will never be met + tablet->set_last_cumu_compaction_success_time(now); + } + + // Condition 5: If there is a continuous set of empty 
rowsets, prioritize merging. + std::vector input_rowsets; + std::vector candidate_rowsets = + tablet->pick_candidate_rowsets_to_cumulative_compaction(); + tablet->calc_consecutive_empty_rowsets( + &input_rowsets, candidate_rowsets, + tablet->tablet_meta()->time_series_compaction_empty_rowsets_threshold()); + if (!input_rowsets.empty()) { + return score; + } + + return 0; } void TimeSeriesCumulativeCompactionPolicy::calculate_cumulative_point( diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp index 4070bd1dd4340e..4aa215e0c2eb16 100644 --- a/be/src/olap/data_dir.cpp +++ b/be/src/olap/data_dir.cpp @@ -316,10 +316,10 @@ Status DataDir::_check_incompatible_old_format_tablet() { std::string_view value) -> bool { // if strict check incompatible old format, then log fatal if (config::storage_strict_check_incompatible_old_format) { - LOG(FATAL) - << "There are incompatible old format metas, current version does not support " - << "and it may lead to data missing!!! " - << "tablet_id = " << tablet_id << " schema_hash = " << schema_hash; + throw Exception(Status::FatalError( + "There are incompatible old format metas, current version does not support and " + "it may lead to data missing!!! tablet_id = {} schema_hash = {}", + tablet_id, schema_hash)); } else { LOG(WARNING) << "There are incompatible old format metas, current version does not support " @@ -451,7 +451,8 @@ Status DataDir::load() { << ", loaded tablet: " << tablet_ids.size() << ", error tablet: " << failed_tablet_ids.size() << ", path: " << _path; if (!config::ignore_load_tablet_failure) { - LOG(FATAL) << "load tablets encounter failure. stop BE process. path: " << _path; + throw Exception(Status::FatalError( + "load tablets encounter failure. stop BE process. 
path: {}", _path)); } } if (!load_tablet_status) { @@ -495,10 +496,9 @@ Status DataDir::load() { } } if (rowset_partition_id_eq_0_num > config::ignore_invalid_partition_id_rowset_num) { - LOG(FATAL) << fmt::format( + throw Exception(Status::FatalError( "roswet partition id eq 0 is {} bigger than config {}, be exit, plz check be.INFO", - rowset_partition_id_eq_0_num, config::ignore_invalid_partition_id_rowset_num); - exit(-1); + rowset_partition_id_eq_0_num, config::ignore_invalid_partition_id_rowset_num)); } // traverse rowset diff --git a/be/src/olap/delete_bitmap_calculator.cpp b/be/src/olap/delete_bitmap_calculator.cpp index 017e3cff3d0489..8ac05a1e393043 100644 --- a/be/src/olap/delete_bitmap_calculator.cpp +++ b/be/src/olap/delete_bitmap_calculator.cpp @@ -145,12 +145,11 @@ Status MergeIndexDeleteBitmapCalculator::init(RowsetId rowset_id, MergeIndexDeleteBitmapCalculatorContext::Comparator(seq_col_length, _rowid_length); _contexts.reserve(segments.size()); _heap = std::make_unique(_comparator); - for (auto& segment : segments) { - RETURN_IF_ERROR(segment->load_index()); + RETURN_IF_ERROR(segment->load_index(nullptr)); auto pk_idx = segment->get_primary_key_index(); std::unique_ptr index; - RETURN_IF_ERROR(pk_idx->new_iterator(&index)); + RETURN_IF_ERROR(pk_idx->new_iterator(&index, nullptr)); auto index_type = vectorized::DataTypeFactory::instance().create_data_type( pk_idx->type_info()->type(), 1, 0); _contexts.emplace_back(std::move(index), index_type, segment->id(), pk_idx->num_rows()); diff --git a/be/src/olap/key_coder.h b/be/src/olap/key_coder.h index 6885a0d96f251b..549ac53656b647 100644 --- a/be/src/olap/key_coder.h +++ b/be/src/olap/key_coder.h @@ -109,8 +109,8 @@ class KeyCoderTraits< case 16: return BigEndian::FromHost128(val); default: - LOG(FATAL) << "Invalid type to big endian, type=" << int(field_type) - << ", size=" << sizeof(UnsignedCppType); + throw Exception(Status::FatalError("Invalid type to big endian, type={}, size={}", + 
int(field_type), sizeof(UnsignedCppType))); } } } @@ -300,8 +300,7 @@ class KeyCoderTraits { } static Status decode_ascending(Slice* encoded_key, size_t index_size, uint8_t* cell_ptr) { - LOG(FATAL) << "decode_ascending is not implemented"; - return Status::OK(); + throw Exception(Status::FatalError("decode_ascending is not implemented")); } }; @@ -320,8 +319,7 @@ class KeyCoderTraits { } static Status decode_ascending(Slice* encoded_key, size_t index_size, uint8_t* cell_ptr) { - LOG(FATAL) << "decode_ascending is not implemented"; - return Status::OK(); + throw Exception(Status::FatalError("decode_ascending is not implemented")); } }; @@ -340,8 +338,7 @@ class KeyCoderTraits { } static Status decode_ascending(Slice* encoded_key, size_t index_size, uint8_t* cell_ptr) { - LOG(FATAL) << "decode_ascending is not implemented"; - return Status::OK(); + throw Exception(Status::FatalError("decode_ascending is not implemented")); } }; diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h index 31763d45f7edc7..e0d185c7bd3e98 100644 --- a/be/src/olap/like_column_predicate.h +++ b/be/src/olap/like_column_predicate.h @@ -128,8 +128,8 @@ class LikeColumnPredicate : public ColumnPredicate { } } } else { - LOG(FATAL) << "vectorized (not) like predicates should be dict column"; - __builtin_unreachable(); + throw Exception(Status::FatalError( + "vectorized (not) like predicates should be dict column")); } } else { if (column.is_column_dictionary()) { @@ -153,8 +153,8 @@ class LikeColumnPredicate : public ColumnPredicate { } } } else { - LOG(FATAL) << "vectorized (not) like predicates should be dict column"; - __builtin_unreachable(); + throw Exception(Status::FatalError( + "vectorized (not) like predicates should be dict column")); } } } diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index e539f4a440ab0c..a442a1c1ffe800 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -4,21 +4,21 @@ #include 
"olap/lru_cache.h" -#include - +#include #include #include #include #include #include "gutil/bits.h" -#include "util/doris_metrics.h" +#include "util/metrics.h" #include "util/time.h" using std::string; using std::stringstream; namespace doris { +#include "common/compile_check_begin.h" DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_capacity, MetricUnit::BYTES); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_usage, MetricUnit::BYTES); @@ -35,7 +35,7 @@ uint32_t CacheKey::hash(const char* data, size_t n, uint32_t seed) const { const uint32_t m = 0xc6a4a793; const uint32_t r = 24; const char* limit = data + n; - uint32_t h = seed ^ (n * m); + uint32_t h = seed ^ (static_cast(n) * m); // Pick up four bytes at a time while (data + 4 <= limit) { @@ -69,8 +69,6 @@ uint32_t CacheKey::hash(const char* data, size_t n, uint32_t seed) const { return h; } -Cache::~Cache() {} - HandleTable::~HandleTable() { delete[] _list; } @@ -140,7 +138,7 @@ void HandleTable::_resize() { new_length *= 2; } - LRUHandle** new_list = new (std::nothrow) LRUHandle*[new_length]; + auto** new_list = new (std::nothrow) LRUHandle*[new_length]; memset(new_list, 0, sizeof(new_list[0]) * new_length); uint32_t count = 0; @@ -167,7 +165,7 @@ uint32_t HandleTable::element_count() const { return _elems; } -LRUCache::LRUCache(LRUCacheType type) : _type(type) { +LRUCache::LRUCache(LRUCacheType type, bool is_lru_k) : _type(type), _is_lru_k(is_lru_k) { // Make empty circular linked list _lru_normal.next = &_lru_normal; _lru_normal.prev = &_lru_normal; @@ -305,6 +303,17 @@ Cache::Handle* LRUCache::lookup(const CacheKey& key, uint32_t hash) { } else { ++_miss_count; } + + // If key not exist in cache, and is lru k cache, and key in visits list, + // then move the key to beginning of the visits list. + // key in visits list indicates that the key has been inserted once after the cache is full. 
+ if (e == nullptr && _is_lru_k) { + auto it = _visits_lru_cache_map.find(hash); + if (it != _visits_lru_cache_map.end()) { + _visits_lru_cache_list.splice(_visits_lru_cache_list.begin(), _visits_lru_cache_list, + it->second); + } + } return reinterpret_cast(e); } @@ -312,14 +321,14 @@ void LRUCache::release(Cache::Handle* handle) { if (handle == nullptr) { return; } - LRUHandle* e = reinterpret_cast(handle); + auto* e = reinterpret_cast(handle); bool last_ref = false; { std::lock_guard l(_mutex); + // if last_ref is true, key may have been evict from the cache, + // or if it is lru k, first insert of key may have failed. last_ref = _unref(e); - if (last_ref) { - _usage -= e->total_size; - } else if (e->in_cache && e->refs == 1) { + if (e->in_cache && e->refs == 1) { // only exists in cache if (_usage > _capacity) { // take this opportunity and remove the item @@ -327,6 +336,8 @@ void LRUCache::release(Cache::Handle* handle) { DCHECK(removed); e->in_cache = false; _unref(e); + // `entry->in_cache = false` and `_usage -= entry->total_size;` and `_unref(entry)` should appear together. + // see the comment for old entry in `LRUCache::insert`. _usage -= e->total_size; last_ref = true; } else { @@ -401,6 +412,8 @@ void LRUCache::_evict_one_entry(LRUHandle* e) { DCHECK(removed); e->in_cache = false; _unref(e); + // `entry->in_cache = false` and `_usage -= entry->total_size;` and `_unref(entry)` should appear together. + // see the comment for old entry in `LRUCache::insert`. _usage -= e->total_size; } @@ -408,6 +421,42 @@ bool LRUCache::_check_element_count_limit() { return _element_count_capacity != 0 && _table.element_count() >= _element_count_capacity; } +// After cache is full, +// 1.Return false. If key has been inserted into the visits list before, +// key is allowed to be inserted into cache this time (this will trigger cache evict), +// and key is removed from the visits list. +// 2. Return true. If key not in visits list, insert it into visits list. 
+bool LRUCache::_lru_k_insert_visits_list(size_t total_size, visits_lru_cache_key visits_key) { + if (_usage + total_size > _capacity || + _check_element_count_limit()) { // this line no lock required + auto it = _visits_lru_cache_map.find(visits_key); + if (it != _visits_lru_cache_map.end()) { + _visits_lru_cache_usage -= it->second->second; + _visits_lru_cache_list.erase(it->second); + _visits_lru_cache_map.erase(it); + } else { + // _visits_lru_cache_list capacity is same as the cache itself. + // If _visits_lru_cache_list is full, some keys will also be evict. + while (_visits_lru_cache_usage + total_size > _capacity && + _visits_lru_cache_usage != 0) { + DCHECK(!_visits_lru_cache_map.empty()); + _visits_lru_cache_usage -= _visits_lru_cache_list.back().second; + _visits_lru_cache_map.erase(_visits_lru_cache_list.back().first); + _visits_lru_cache_list.pop_back(); + } + // 1. If true, insert key at the beginning of _visits_lru_cache_list. + // 2. If false, it means total_size > cache _capacity, preventing this insert. + if (_visits_lru_cache_usage + total_size <= _capacity) { + _visits_lru_cache_list.emplace_front(visits_key, total_size); + _visits_lru_cache_map[visits_key] = _visits_lru_cache_list.begin(); + _visits_lru_cache_usage += total_size; + } + return true; + } + } + return false; +} + Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, size_t charge, CachePriority priority) { size_t handle_size = sizeof(LRUHandle) - 1 + key.size(); @@ -419,17 +468,22 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // because charge at this time is no longer the memory size, but an weight. e->total_size = (_type == LRUCacheType::SIZE ? handle_size + charge : charge); e->hash = hash; - e->refs = 2; // one for the returned handle, one for LRUCache. + e->refs = 1; // only one for the returned handle. 
e->next = e->prev = nullptr; - e->in_cache = true; + e->in_cache = false; e->priority = priority; e->type = _type; memcpy(e->key_data, key.data(), key.size()); e->last_visit_time = UnixMillis(); + LRUHandle* to_remove_head = nullptr; { std::lock_guard l(_mutex); + if (_is_lru_k && _lru_k_insert_visits_list(e->total_size, hash)) { + return reinterpret_cast(e); + } + // Free the space following strict LRU policy until enough space // is freed or the lru list is empty if (_cache_value_check_timestamp) { @@ -441,13 +495,22 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // insert into the cache // note that the cache might get larger than its capacity if not enough // space was freed - auto old = _table.insert(e); + auto* old = _table.insert(e); + e->in_cache = true; _usage += e->total_size; + e->refs++; // one for the returned handle, one for LRUCache. if (old != nullptr) { _stampede_count++; old->in_cache = false; + // `entry->in_cache = false` and `_usage -= entry->total_size;` and `_unref(entry)` should appear together. + // Whether the reference of the old entry is 0, the cache usage is subtracted here, + // because the old entry has been removed from the cache and should not be counted in the cache capacity, + // but the memory of the old entry is still tracked by the cache memory_tracker. + // After all the old handles are released, the old entry will be freed and the memory of the old entry + // will be released from the cache memory_tracker. 
+ _usage -= old->total_size; + // if false, old entry is being used externally, just ref-- and sub _usage, if (_unref(old)) { - _usage -= old->total_size; // old is on LRU because it's in cache and its reference count // was just 1 (Unref returned 0) _lru_remove(old); @@ -476,14 +539,15 @@ void LRUCache::erase(const CacheKey& key, uint32_t hash) { e = _table.remove(key, hash); if (e != nullptr) { last_ref = _unref(e); - if (last_ref) { - _usage -= e->total_size; - if (e->in_cache) { - // locate in free list - _lru_remove(e); - } + // if last_ref is false or in_cache is false, e must not be in lru + if (last_ref && e->in_cache) { + // locate in free list + _lru_remove(e); } e->in_cache = false; + // `entry->in_cache = false` and `_usage -= entry->total_size;` and `_unref(entry)` should appear together. + // see the comment for old entry in `LRUCache::insert`. + _usage -= e->total_size; } } // free handle out of mutex, when last_ref is true, e must not be nullptr @@ -576,11 +640,11 @@ inline uint32_t ShardedLRUCache::_hash_slice(const CacheKey& s) { } ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, - uint32_t num_shards, uint32_t total_element_count_capacity) + uint32_t num_shards, uint32_t total_element_count_capacity, + bool is_lru_k) : _name(name), _num_shard_bits(Bits::FindLSBSetNonZero(num_shards)), _num_shards(num_shards), - _shards(nullptr), _last_id(1), _capacity(capacity) { CHECK(num_shards > 0) << "num_shards cannot be 0"; @@ -588,11 +652,11 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCa << "num_shards should be power of two, but got " << num_shards; const size_t per_shard = (capacity + (_num_shards - 1)) / _num_shards; - const size_t per_shard_element_count_capacity = + const uint32_t per_shard_element_count_capacity = (total_element_count_capacity + (_num_shards - 1)) / _num_shards; - LRUCache** shards = new (std::nothrow) LRUCache*[_num_shards]; + auto** shards = new 
(std::nothrow) LRUCache*[_num_shards]; for (int s = 0; s < _num_shards; s++) { - shards[s] = new LRUCache(type); + shards[s] = new LRUCache(type, is_lru_k); shards[s]->set_capacity(per_shard); shards[s]->set_element_count_capacity(per_shard_element_count_capacity); } @@ -604,12 +668,12 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCa INT_GAUGE_METRIC_REGISTER(_entity, cache_capacity); INT_GAUGE_METRIC_REGISTER(_entity, cache_usage); INT_GAUGE_METRIC_REGISTER(_entity, cache_element_count); - INT_DOUBLE_METRIC_REGISTER(_entity, cache_usage_ratio); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_lookup_count); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_hit_count); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_stampede_count); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_miss_count); - INT_DOUBLE_METRIC_REGISTER(_entity, cache_hit_ratio); + DOUBLE_GAUGE_METRIC_REGISTER(_entity, cache_usage_ratio); + INT_COUNTER_METRIC_REGISTER(_entity, cache_lookup_count); + INT_COUNTER_METRIC_REGISTER(_entity, cache_hit_count); + INT_COUNTER_METRIC_REGISTER(_entity, cache_stampede_count); + INT_COUNTER_METRIC_REGISTER(_entity, cache_miss_count); + DOUBLE_GAUGE_METRIC_REGISTER(_entity, cache_hit_ratio); _hit_count_bvar.reset(new bvar::Adder("doris_cache", _name)); _hit_count_per_second.reset(new bvar::PerSecond>( @@ -623,8 +687,9 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCa uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, - uint32_t total_element_count_capacity) - : ShardedLRUCache(name, capacity, type, num_shards, total_element_count_capacity) { + uint32_t total_element_count_capacity, bool is_lru_k) + : ShardedLRUCache(name, capacity, type, num_shards, total_element_count_capacity, + is_lru_k) { for (int s = 0; s < _num_shards; s++) { _shards[s]->set_cache_value_time_extractor(cache_value_time_extractor); 
_shards[s]->set_cache_value_check_timestamp(cache_value_check_timestamp); @@ -672,7 +737,7 @@ Cache::Handle* ShardedLRUCache::lookup(const CacheKey& key) { } void ShardedLRUCache::release(Handle* handle) { - LRUHandle* h = reinterpret_cast(handle); + auto* h = reinterpret_cast(handle); _shards[_shard(h->hash)]->release(handle); } @@ -751,9 +816,11 @@ void ShardedLRUCache::update_cache_metrics() const { cache_hit_count->set_value(total_hit_count); cache_miss_count->set_value(total_miss_count); cache_stampede_count->set_value(total_stampede_count); - cache_usage_ratio->set_value(capacity == 0 ? 0 : ((double)total_usage / capacity)); - cache_hit_ratio->set_value( - total_lookup_count == 0 ? 0 : ((double)total_hit_count / total_lookup_count)); + cache_usage_ratio->set_value( + capacity == 0 ? 0 : (static_cast(total_usage) / static_cast(capacity))); + cache_hit_ratio->set_value(total_lookup_count == 0 ? 0 + : (static_cast(total_hit_count) / + static_cast(total_lookup_count))); } Cache::Handle* DummyLRUCache::insert(const CacheKey& key, void* value, size_t charge, diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 303a4cf2065ef9..d4f8b905faa982 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -4,16 +4,16 @@ #pragma once -#include #include #include #include #include -#include -#include -#include #include +#include +#include +#include +#include #include #include #include @@ -25,30 +25,7 @@ #include "util/metrics.h" namespace doris { - -#define OLAP_CACHE_STRING_TO_BUF(cur, str, r_len) \ - do { \ - if (r_len > str.size()) { \ - memcpy(cur, str.c_str(), str.size()); \ - r_len -= str.size(); \ - cur += str.size(); \ - } else { \ - LOG(WARNING) << "construct cache key buf not enough."; \ - return CacheKey(nullptr, 0); \ - } \ - } while (0) - -#define OLAP_CACHE_NUMERIC_TO_BUF(cur, numeric, r_len) \ - do { \ - if (r_len > sizeof(numeric)) { \ - memcpy(cur, &numeric, sizeof(numeric)); \ - r_len -= sizeof(numeric); \ - cur += 
sizeof(numeric); \ - } else { \ - LOG(WARNING) << "construct cache key buf not enough."; \ - return CacheKey(nullptr, 0); \ - } \ - } while (0) +#include "common/compile_check_begin.h" class Cache; class LRUCachePolicy; @@ -62,10 +39,11 @@ enum LRUCacheType { static constexpr LRUCacheType DEFAULT_LRU_CACHE_TYPE = LRUCacheType::SIZE; static constexpr uint32_t DEFAULT_LRU_CACHE_NUM_SHARDS = 32; static constexpr size_t DEFAULT_LRU_CACHE_ELEMENT_COUNT_CAPACITY = 0; +static constexpr bool DEFAULT_LRU_CACHE_IS_LRU_K = false; class CacheKey { public: - CacheKey() : _data(nullptr), _size(0) {} + CacheKey() : _size(0) {} // Create a slice that refers to d[0,n-1]. CacheKey(const char* d, size_t n) : _data(d), _size(n) {} @@ -75,7 +53,7 @@ class CacheKey { // Create a slice that refers to s[0,strlen(s)-1] CacheKey(const char* s) : _data(s), _size(strlen(s)) {} - ~CacheKey() {} + ~CacheKey() = default; // Return a pointer to the beginning of the referenced data const char* data() const { return _data; } @@ -107,7 +85,7 @@ class CacheKey { } // Return a string that contains the copy of the referenced data. - std::string to_string() const { return std::string(_data, _size); } + std::string to_string() const { return {_data, _size}; } bool operator==(const CacheKey& other) const { return ((size() == other.size()) && (memcmp(data(), other.data(), size()) == 0)); @@ -162,11 +140,11 @@ struct PrunedInfo { class Cache { public: - Cache() {} + Cache() = default; // Destroys all existing entries by calling the "deleter" // function that was passed to the constructor. - virtual ~Cache(); + virtual ~Cache() = default; // Opaque handle to an entry stored in the cache. struct Handle {}; @@ -180,6 +158,10 @@ class Cache { // // When the inserted entry is no longer needed, the key and // value will be passed to "deleter". + // + // if cache is lru k and cache is full, first insert of key will not succeed. 
+ // + // Note: if is ShardedLRUCache, cache capacity = ShardedLRUCache_capacity / num_shards. virtual Handle* insert(const CacheKey& key, void* value, size_t charge, CachePriority priority = CachePriority::NORMAL) = 0; @@ -263,7 +245,7 @@ struct LRUHandle { if (next == this) { return *(reinterpret_cast(value)); } else { - return CacheKey(key_data, key_length); + return {key_data, key_length}; } } @@ -283,7 +265,7 @@ struct LRUHandle { class HandleTable { public: - HandleTable() : _length(0), _elems(0), _list(nullptr) { _resize(); } + HandleTable() { _resize(); } ~HandleTable(); @@ -306,8 +288,8 @@ class HandleTable { // The tablet consists of an array of buckets where each bucket is // a linked list of cache entries that hash into the bucket. - uint32_t _length; - uint32_t _elems; + uint32_t _length {}; + uint32_t _elems {}; LRUHandle** _list = nullptr; // Return a pointer to slot that points to a cache entry that @@ -326,9 +308,16 @@ using LRUHandleSortedSet = std::set>; // A single shard of sharded cache. class LRUCache { public: - LRUCache(LRUCacheType type); + LRUCache(LRUCacheType type, bool is_lru_k = DEFAULT_LRU_CACHE_IS_LRU_K); ~LRUCache(); + // visits_lru_cache_key is the hash value of CacheKey. + // If there is a hash conflict, a cache entry may be inserted early + // and another cache entry with the same key hash may be inserted later. + // Otherwise, this does not affect the correctness of the cache. 
+ using visits_lru_cache_key = uint32_t; + using visits_lru_cache_pair = std::pair; + // Separate from constructor so caller can easily make an array of LRUCache PrunedInfo set_capacity(size_t capacity); void set_element_count_capacity(uint32_t element_count_capacity) { @@ -365,6 +354,7 @@ class LRUCache { void _evict_from_lru_with_time(size_t total_size, LRUHandle** to_remove_head); void _evict_one_entry(LRUHandle* e); bool _check_element_count_limit(); + bool _lru_k_insert_visits_list(size_t total_size, visits_lru_cache_key visits_key); private: LRUCacheType _type; @@ -396,18 +386,24 @@ class LRUCache { LRUHandleSortedSet _sorted_durable_entries_with_timestamp; uint32_t _element_count_capacity = 0; + + bool _is_lru_k = false; // LRU-K algorithm, K=2 + std::list _visits_lru_cache_list; + std::unordered_map::iterator> + _visits_lru_cache_map; + size_t _visits_lru_cache_usage = 0; }; class ShardedLRUCache : public Cache { public: - virtual ~ShardedLRUCache(); - virtual Handle* insert(const CacheKey& key, void* value, size_t charge, - CachePriority priority = CachePriority::NORMAL) override; - virtual Handle* lookup(const CacheKey& key) override; - virtual void release(Handle* handle) override; - virtual void erase(const CacheKey& key) override; - virtual void* value(Handle* handle) override; - virtual uint64_t new_id() override; + ~ShardedLRUCache() override; + Handle* insert(const CacheKey& key, void* value, size_t charge, + CachePriority priority = CachePriority::NORMAL) override; + Handle* lookup(const CacheKey& key) override; + void release(Handle* handle) override; + void erase(const CacheKey& key) override; + void* value(Handle* handle) override; + uint64_t new_id() override; PrunedInfo prune() override; PrunedInfo prune_if(CachePrunePredicate pred, bool lazy_mode = false) override; int64_t get_usage() override; @@ -420,17 +416,18 @@ class ShardedLRUCache : public Cache { friend class LRUCachePolicy; explicit ShardedLRUCache(const std::string& name, size_t 
capacity, LRUCacheType type, - uint32_t num_shards, uint32_t element_count_capacity); + uint32_t num_shards, uint32_t element_count_capacity, bool is_lru_k); explicit ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, - bool cache_value_check_timestamp, uint32_t element_count_capacity); + bool cache_value_check_timestamp, uint32_t element_count_capacity, + bool is_lru_k); void update_cache_metrics() const; private: static uint32_t _hash_slice(const CacheKey& s); - uint32_t _shard(uint32_t hash) { + uint32_t _shard(uint32_t hash) const { return _num_shard_bits > 0 ? (hash >> (32 - _num_shard_bits)) : 0; } @@ -447,10 +444,10 @@ class ShardedLRUCache : public Cache { IntGauge* cache_usage = nullptr; IntGauge* cache_element_count = nullptr; DoubleGauge* cache_usage_ratio = nullptr; - IntAtomicCounter* cache_lookup_count = nullptr; - IntAtomicCounter* cache_hit_count = nullptr; - IntAtomicCounter* cache_miss_count = nullptr; - IntAtomicCounter* cache_stampede_count = nullptr; + IntCounter* cache_lookup_count = nullptr; + IntCounter* cache_hit_count = nullptr; + IntCounter* cache_miss_count = nullptr; + IntCounter* cache_stampede_count = nullptr; DoubleGauge* cache_hit_ratio = nullptr; // bvars std::unique_ptr> _hit_count_bvar; @@ -481,3 +478,4 @@ class DummyLRUCache : public Cache { }; } // namespace doris +#include "common/compile_check_end.h" diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index ad202b7b2427cf..3ff1775fd8882a 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -55,8 +55,7 @@ class MatchPredicate : public ColumnPredicate { //evaluate predicate on Bitmap Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* roaring) const override { - LOG(FATAL) << "Not Implemented MatchPredicate::evaluate"; - __builtin_unreachable(); + throw Exception(Status::FatalError("Not Implemented 
MatchPredicate::evaluate")); } //evaluate predicate on inverted diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 765f67a07c7884..f8cc79b205535f 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -34,6 +34,7 @@ #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/thread_context.h" +#include "util/debug_points.h" #include "util/runtime_profile.h" #include "util/stopwatch.hpp" #include "vec/aggregate_functions/aggregate_function_reader.h" @@ -589,6 +590,7 @@ void MemTable::shrink_memtable_by_agg() { } bool MemTable::need_flush() const { + DBUG_EXECUTE_IF("MemTable.need_flush", { return true; }); auto max_size = config::write_buffer_size; if (_partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { auto update_columns_size = _num_columns; diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h index 59480264b46103..8e3fef1ff27695 100644 --- a/be/src/olap/null_predicate.h +++ b/be/src/olap/null_predicate.h @@ -87,8 +87,8 @@ class NullPredicate : public ColumnPredicate { if (_is_null) { return bf->test_bytes(nullptr, 0); } else { - LOG(FATAL) << "Bloom filter is not supported by predicate type: is_null=" << _is_null; - return true; + throw Exception(Status::FatalError( + "Bloom filter is not supported by predicate type: is_null=")); } } diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index 11249bafb1e3c0..a83e6a6df63e1a 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -36,6 +36,7 @@ #include #include "common/config.h" +#include "common/exception.h" #include "io/io_common.h" #include "olap/olap_define.h" #include "olap/rowset/rowset_fwd.h" @@ -388,6 +389,30 @@ struct OlapReaderStatistics { int64_t collect_iterator_merge_next_timer = 0; int64_t collect_iterator_normal_next_timer = 0; int64_t delete_bitmap_get_agg_ns = 0; + + int64_t tablet_reader_init_timer_ns = 0; + int64_t tablet_reader_capture_rs_readers_timer_ns = 0; + 
int64_t tablet_reader_init_return_columns_timer_ns = 0; + int64_t tablet_reader_init_keys_param_timer_ns = 0; + int64_t tablet_reader_init_orderby_keys_param_timer_ns = 0; + int64_t tablet_reader_init_conditions_param_timer_ns = 0; + int64_t tablet_reader_init_delete_condition_param_timer_ns = 0; + int64_t block_reader_vcollect_iter_init_timer_ns = 0; + int64_t block_reader_rs_readers_init_timer_ns = 0; + int64_t block_reader_build_heap_init_timer_ns = 0; + + int64_t rowset_reader_get_segment_iterators_timer_ns = 0; + int64_t rowset_reader_create_iterators_timer_ns = 0; + int64_t rowset_reader_init_iterators_timer_ns = 0; + int64_t rowset_reader_load_segments_timer_ns = 0; + + int64_t segment_iterator_init_timer_ns = 0; + int64_t segment_iterator_init_return_column_iterators_timer_ns = 0; + int64_t segment_iterator_init_bitmap_index_iterators_timer_ns = 0; + int64_t segment_iterator_init_inverted_index_iterators_timer_ns = 0; + + int64_t segment_create_column_readers_timer_ns = 0; + int64_t segment_load_index_timer_ns = 0; }; using ColumnId = uint32_t; @@ -419,7 +444,8 @@ struct RowsetId { LOG(WARNING) << "failed to init rowset id: " << rowset_id_str; high = next_rowset_id().hi; } else { - LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + throw Exception( + Status::FatalError("failed to init rowset id: {}", rowset_id_str)); } } init(1, high, 0, 0); @@ -440,7 +466,7 @@ struct RowsetId { void init(int64_t id_version, int64_t high, int64_t middle, int64_t low) { version = id_version; if (UNLIKELY(high >= MAX_ROWSET_ID)) { - LOG(FATAL) << "inc rowsetid is too large:" << high; + throw Exception(Status::FatalError("inc rowsetid is too large:{}", high)); } hi = (id_version << 56) + (high & LOW_56_BITS); mi = middle; diff --git a/be/src/olap/page_cache.h b/be/src/olap/page_cache.h index 32b6683e7823b0..64212e8c06efb9 100644 --- a/be/src/olap/page_cache.h +++ b/be/src/olap/page_cache.h @@ -97,7 +97,8 @@ class StoragePageCache { DataPageCache(size_t capacity, 
uint32_t num_shards) : LRUCachePolicy(CachePolicy::CacheType::DATA_PAGE_CACHE, capacity, LRUCacheType::SIZE, config::data_page_cache_stale_sweep_time_sec, - num_shards) {} + num_shards, DEFAULT_LRU_CACHE_ELEMENT_COUNT_CAPACITY, true, true) { + } }; class IndexPageCache : public LRUCachePolicy { @@ -176,11 +177,9 @@ class StoragePageCache { return _pk_index_page_cache.get(); } default: - LOG(FATAL) << "get error type page cache"; - __builtin_unreachable(); + throw Exception(Status::FatalError("get error type page cache")); } - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); + throw Exception(Status::FatalError("__builtin_unreachable")); } }; diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index 5f7bedb01fc8de..00b72832ee60e0 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -95,27 +95,29 @@ Status PrimaryKeyIndexBuilder::finalize(segment_v2::PrimaryKeyIndexMetaPB* meta) } Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader, - const segment_v2::PrimaryKeyIndexMetaPB& meta) { + const segment_v2::PrimaryKeyIndexMetaPB& meta, + OlapReaderStatistics* pk_index_load_stats) { // parse primary key index _index_reader.reset(new segment_v2::IndexedColumnReader(file_reader, meta.primary_key_index())); _index_reader->set_is_pk_index(true); RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false, - _pk_index_load_stats)); + pk_index_load_stats)); _index_parsed = true; return Status::OK(); } Status PrimaryKeyIndexReader::parse_bf(io::FileReaderSPtr file_reader, - const segment_v2::PrimaryKeyIndexMetaPB& meta) { + const segment_v2::PrimaryKeyIndexMetaPB& meta, + OlapReaderStatistics* pk_index_load_stats) { // parse bloom filter segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index(); segment_v2::BloomFilterIndexReader bf_index_reader(std::move(file_reader), column_index_meta.bloom_filter_index()); 
RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false, - _pk_index_load_stats)); + pk_index_load_stats)); std::unique_ptr bf_iter; - RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter)); + RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter, pk_index_load_stats)); RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf)); segment_v2::g_pk_total_bloom_filter_num << 1; segment_v2::g_pk_total_bloom_filter_total_bytes << _bf->size(); diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h index dcbbc5f30625f4..f74d3e42030f2f 100644 --- a/be/src/olap/primary_key_index.h +++ b/be/src/olap/primary_key_index.h @@ -98,8 +98,7 @@ class PrimaryKeyIndexBuilder { class PrimaryKeyIndexReader { public: - PrimaryKeyIndexReader(OlapReaderStatistics* pk_index_load_stats = nullptr) - : _index_parsed(false), _bf_parsed(false), _pk_index_load_stats(pk_index_load_stats) {} + PrimaryKeyIndexReader() : _index_parsed(false), _bf_parsed(false) {} ~PrimaryKeyIndexReader() { segment_v2::g_pk_total_bloom_filter_num << -static_cast(_bf_num); @@ -109,12 +108,14 @@ class PrimaryKeyIndexReader { } Status parse_index(io::FileReaderSPtr file_reader, - const segment_v2::PrimaryKeyIndexMetaPB& meta); + const segment_v2::PrimaryKeyIndexMetaPB& meta, + OlapReaderStatistics* pk_index_load_stats); - Status parse_bf(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta); + Status parse_bf(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta, + OlapReaderStatistics* pk_index_load_stats); Status new_iterator(std::unique_ptr* index_iterator, - OlapReaderStatistics* stats = nullptr) const { + OlapReaderStatistics* stats) const { DCHECK(_index_parsed); index_iterator->reset(new segment_v2::IndexedColumnIterator(_index_reader.get(), stats)); return Status::OK(); @@ -155,7 +156,6 @@ class PrimaryKeyIndexReader { std::unique_ptr _bf; size_t _bf_num = 0; uint64 _bf_bytes = 0; - OlapReaderStatistics* 
_pk_index_load_stats = nullptr; }; } // namespace doris diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 56d167459f5be7..6f24e015fbb771 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -34,15 +34,16 @@ #include #include #include +#include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" #include "common/status.h" +#include "io/hdfs_builder.h" #include "olap/delete_handler.h" #include "olap/olap_define.h" #include "olap/rowset/pending_rowset_helper.h" -#include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" #include "olap/schema.h" @@ -53,10 +54,11 @@ #include "olap/txn_manager.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "util/runtime_profile.h" #include "util/time.h" #include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" #include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/data_type_nullable.h" #include "vec/exec/format/parquet/vparquet_reader.h" #include "vec/exprs/vexpr_context.h" #include "vec/functions/simple_function_factory.h" @@ -352,8 +354,12 @@ PushBrokerReader::PushBrokerReader(const Schema* schema, const TBrokerScanRange& _file_params.expr_of_dest_slot = _params.expr_of_dest_slot; _file_params.dest_sid_to_src_sid_without_trans = _params.dest_sid_to_src_sid_without_trans; _file_params.strict_mode = _params.strict_mode; - _file_params.__isset.broker_addresses = true; - _file_params.broker_addresses = t_scan_range.broker_addresses; + if (_ranges[0].file_type == TFileType::FILE_HDFS) { + _file_params.hdfs_params = parse_properties(_params.properties); + } else { + _file_params.__isset.broker_addresses = true; + _file_params.broker_addresses = t_scan_range.broker_addresses; + } for (const auto& range : _ranges) { TFileRangeDesc file_range; @@ -482,17 +488,36 @@ Status 
PushBrokerReader::_cast_to_input_block() { auto& arg = _src_block_ptr->get_by_name(slot_desc->col_name()); // remove nullable here, let the get_function decide whether nullable auto return_type = slot_desc->get_data_type_ptr(); - vectorized::ColumnsWithTypeAndName arguments { - arg, - {vectorized::DataTypeString().create_column_const( - arg.column->size(), remove_nullable(return_type)->get_family_name()), - std::make_shared(), ""}}; - auto func_cast = vectorized::SimpleFunctionFactory::instance().get_function( - "CAST", arguments, return_type); idx = _src_block_name_to_idx[slot_desc->col_name()]; - RETURN_IF_ERROR( - func_cast->execute(nullptr, *_src_block_ptr, {idx}, idx, arg.column->size())); - _src_block_ptr->get_by_position(idx).type = std::move(return_type); + // bitmap convert:src -> to_base64 -> bitmap_from_base64 + if (slot_desc->type().is_bitmap_type()) { + auto base64_return_type = vectorized::DataTypeFactory::instance().create_data_type( + vectorized::DataTypeString().get_type_as_type_descriptor(), + slot_desc->is_nullable()); + auto func_to_base64 = vectorized::SimpleFunctionFactory::instance().get_function( + "to_base64", {arg}, base64_return_type); + RETURN_IF_ERROR(func_to_base64->execute(nullptr, *_src_block_ptr, {idx}, idx, + arg.column->size())); + _src_block_ptr->get_by_position(idx).type = std::move(base64_return_type); + auto& arg_base64 = _src_block_ptr->get_by_name(slot_desc->col_name()); + auto func_bitmap_from_base64 = + vectorized::SimpleFunctionFactory::instance().get_function( + "bitmap_from_base64", {arg_base64}, return_type); + RETURN_IF_ERROR(func_bitmap_from_base64->execute(nullptr, *_src_block_ptr, {idx}, idx, + arg_base64.column->size())); + _src_block_ptr->get_by_position(idx).type = std::move(return_type); + } else { + vectorized::ColumnsWithTypeAndName arguments { + arg, + {vectorized::DataTypeString().create_column_const( + arg.column->size(), remove_nullable(return_type)->get_family_name()), + std::make_shared(), ""}}; + auto 
func_cast = vectorized::SimpleFunctionFactory::instance().get_function( + "CAST", arguments, return_type); + RETURN_IF_ERROR( + func_cast->execute(nullptr, *_src_block_ptr, {idx}, idx, arg.column->size())); + _src_block_ptr->get_by_position(idx).type = std::move(return_type); + } } return Status::OK(); } @@ -518,7 +543,7 @@ Status PushBrokerReader::_convert_to_output_block(vectorized::Block* block) { column_ptr = _src_block.get_by_position(result_column_id).column; // column_ptr maybe a ColumnConst, convert it to a normal column column_ptr = column_ptr->convert_to_full_column_if_const(); - DCHECK(column_ptr != nullptr); + DCHECK(column_ptr); // because of src_slot_desc is always be nullable, so the column_ptr after do dest_expr // is likely to be nullable diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index cd52deed0c8a4d..a328b1b9e8b90e 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -18,6 +18,7 @@ #include "olap/rowset/beta_rowset.h" #include +#include #include #include @@ -557,10 +558,6 @@ Status BetaRowset::add_to_binlog() { } const auto& fs = io::global_local_filesystem(); - - // all segments are in the same directory, so cache binlog_dir without multi times check - std::string binlog_dir; - auto segments_num = num_segments(); VLOG_DEBUG << fmt::format("add rowset to binlog. 
rowset_id={}, segments_num={}", rowset_id().to_string(), segments_num); @@ -569,17 +566,25 @@ Status BetaRowset::add_to_binlog() { std::vector linked_success_files; Defer remove_linked_files {[&]() { // clear linked files if errors happen if (!status.ok()) { - LOG(WARNING) << "will delete linked success files due to error " << status; + LOG(WARNING) << "will delete linked success files due to error " + << status.to_string_no_stack(); std::vector paths; for (auto& file : linked_success_files) { paths.emplace_back(file); LOG(WARNING) << "will delete linked success file " << file << " due to error"; } static_cast(fs->batch_delete(paths)); - LOG(WARNING) << "done delete linked success files due to error " << status; + LOG(WARNING) << "done delete linked success files due to error " + << status.to_string_no_stack(); } }}; + // The publish_txn might fail even if the add_to_binlog success, so we need to check + // whether a file already exists before linking. + auto errno_is_file_exists = []() { return Errno::no() == EEXIST; }; + + // all segments are in the same directory, so cache binlog_dir without multi times check + std::string binlog_dir; for (int i = 0; i < segments_num; ++i) { auto seg_file = local_segment_path(_tablet_path, rowset_id().to_string(), i); @@ -597,7 +602,7 @@ Status BetaRowset::add_to_binlog() { (std::filesystem::path(binlog_dir) / std::filesystem::path(seg_file).filename()) .string(); VLOG_DEBUG << "link " << seg_file << " to " << binlog_file; - if (!fs->link_file(seg_file, binlog_file).ok()) { + if (!fs->link_file(seg_file, binlog_file).ok() && !errno_is_file_exists()) { status = Status::Error("fail to create hard link. 
from={}, to={}, errno={}", seg_file, binlog_file, Errno::no()); return status; @@ -614,7 +619,12 @@ Status BetaRowset::add_to_binlog() { std::filesystem::path(index_file).filename()) .string(); VLOG_DEBUG << "link " << index_file << " to " << binlog_index_file; - RETURN_IF_ERROR(fs->link_file(index_file, binlog_index_file)); + if (!fs->link_file(index_file, binlog_index_file).ok() && !errno_is_file_exists()) { + status = Status::Error( + "fail to create hard link. from={}, to={}, errno={}", index_file, + binlog_index_file, Errno::no()); + return status; + } linked_success_files.push_back(binlog_index_file); } } else { @@ -625,7 +635,12 @@ Status BetaRowset::add_to_binlog() { std::filesystem::path(index_file).filename()) .string(); VLOG_DEBUG << "link " << index_file << " to " << binlog_index_file; - RETURN_IF_ERROR(fs->link_file(index_file, binlog_index_file)); + if (!fs->link_file(index_file, binlog_index_file).ok() && !errno_is_file_exists()) { + status = Status::Error( + "fail to create hard link. from={}, to={}, errno={}", index_file, + binlog_index_file, Errno::no()); + return status; + } linked_success_files.push_back(binlog_index_file); } } diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 47cf9b820e8562..9a4d71587a02c1 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -78,7 +78,6 @@ bool BetaRowsetReader::update_profile(RuntimeProfile* profile) { Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context, std::vector* out_iters, bool use_cache) { - RETURN_IF_ERROR(_rowset->load()); _read_context = read_context; // The segment iterator is created with its own statistics, // and the member variable '_stats' is initialized by '_stats(&owned_stats)'. 
@@ -92,6 +91,9 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context if (_read_context->stats != nullptr) { _stats = _read_context->stats; } + SCOPED_RAW_TIMER(&_stats->rowset_reader_get_segment_iterators_timer_ns); + + RETURN_IF_ERROR(_rowset->load()); // convert RowsetReaderContext to StorageReadOptions _read_options.block_row_max = read_context->batch_size; @@ -225,9 +227,12 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context bool should_use_cache = use_cache || (_read_context->reader_type == ReaderType::READER_QUERY && enable_segment_cache); SegmentCacheHandle segment_cache_handle; - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(_rowset, &segment_cache_handle, - should_use_cache, - /*need_load_pk_index_and_bf*/ false)); + { + SCOPED_RAW_TIMER(&_stats->rowset_reader_load_segments_timer_ns); + RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( + _rowset, &segment_cache_handle, should_use_cache, + /*need_load_pk_index_and_bf*/ false)); + } // create iterator for each segment auto& segments = segment_cache_handle.get_segments(); @@ -253,6 +258,7 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context const bool use_lazy_init_iterators = !is_merge_iterator && _read_context->reader_type == ReaderType::READER_QUERY; for (int i = seg_start; i < seg_end; i++) { + SCOPED_RAW_TIMER(&_stats->rowset_reader_create_iterators_timer_ns); auto& seg_ptr = segments[i]; std::unique_ptr iter; @@ -317,6 +323,8 @@ Status BetaRowsetReader::_init_iterator() { std::vector iterators; RETURN_IF_ERROR(get_segment_iterators(_read_context, &iterators)); + SCOPED_RAW_TIMER(&_stats->rowset_reader_init_iterators_timer_ns); + if (_read_context->merged_rows == nullptr) { _read_context->merged_rows = &_merged_rows; } @@ -352,8 +360,8 @@ Status BetaRowsetReader::_init_iterator() { } Status BetaRowsetReader::next_block(vectorized::Block* block) { - SCOPED_RAW_TIMER(&_stats->block_fetch_ns); 
RETURN_IF_ERROR(_init_iterator_once()); + SCOPED_RAW_TIMER(&_stats->block_fetch_ns); if (_empty) { return Status::Error("BetaRowsetReader is empty"); } @@ -381,9 +389,8 @@ Status BetaRowsetReader::next_block(vectorized::Block* block) { } Status BetaRowsetReader::next_block_view(vectorized::BlockView* block_view) { - SCOPED_RAW_TIMER(&_stats->block_fetch_ns); RETURN_IF_ERROR(_init_iterator_once()); - + SCOPED_RAW_TIMER(&_stats->block_fetch_ns); RuntimeState* runtime_state = nullptr; if (_read_context != nullptr) { runtime_state = _read_context->runtime_state; diff --git a/be/src/olap/rowset/beta_rowset_writer_v2.h b/be/src/olap/rowset/beta_rowset_writer_v2.h index 78ec4a7dce703c..9040003a68d0d8 100644 --- a/be/src/olap/rowset/beta_rowset_writer_v2.h +++ b/be/src/olap/rowset/beta_rowset_writer_v2.h @@ -99,8 +99,7 @@ class BetaRowsetWriterV2 : public RowsetWriter { }; RowsetSharedPtr manual_build(const RowsetMetaSharedPtr& rowset_meta) override { - LOG(FATAL) << "not implemeted"; - return nullptr; + throw Exception(Status::FatalError("not implemeted")); } PUniqueId load_id() override { return _context.load_id; } diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index f84ff964ea3051..0a0d36ea04a661 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -170,7 +170,9 @@ class RowsetWriter { virtual int32_t allocate_segment_id() = 0; - virtual void set_segment_start_id(int num_segment) { LOG(FATAL) << "not supported!"; } + virtual void set_segment_start_id(int num_segment) { + throw Exception(Status::FatalError("not supported!")); + } virtual int64_t delete_bitmap_ns() { return 0; } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.h b/be/src/olap/rowset/segment_v2/bloom_filter.h index 4f4adf0fd12283..2ef050257e16ab 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter.h @@ -167,6 +167,16 @@ class BloomFilter { return hash_code; } + static 
Result hash(const char* buf, uint32_t size, HashStrategyPB strategy) { + if (strategy == HASH_MURMUR3_X64_64) { + uint64_t hash_code; + murmur_hash3_x64_64(buf, size, DEFAULT_SEED, &hash_code); + return hash_code; + } else { + return Status::InvalidArgument("invalid strategy:{}", strategy); + } + } + virtual void add_bytes(const char* buf, uint32_t size) { if (buf == nullptr) { *_has_null = true; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp index 8c63c25d20acee..7c51f0a24c1b1d 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp @@ -34,9 +34,8 @@ namespace segment_v2 { Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats) { // TODO yyq: implement a new once flag to avoid status construct. - _index_load_stats = index_load_stats; - return _load_once.call([this, use_page_cache, kept_in_memory] { - return _load(use_page_cache, kept_in_memory); + return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats] { + return _load(use_page_cache, kept_in_memory, index_load_stats); }); } @@ -45,20 +44,22 @@ int64_t BloomFilterIndexReader::get_metadata_size() const { (_bloom_filter_index_meta ? 
_bloom_filter_index_meta->ByteSizeLong() : 0); } -Status BloomFilterIndexReader::_load(bool use_page_cache, bool kept_in_memory) { +Status BloomFilterIndexReader::_load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta->bloom_filter(); _bloom_filter_reader.reset(new IndexedColumnReader(_file_reader, bf_index_meta)); - RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory, _index_load_stats)); + RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory, index_load_stats)); update_metadata_size(); return Status::OK(); } -Status BloomFilterIndexReader::new_iterator(std::unique_ptr* iterator) { +Status BloomFilterIndexReader::new_iterator(std::unique_ptr* iterator, + OlapReaderStatistics* index_load_stats) { DBUG_EXECUTE_IF("BloomFilterIndexReader::new_iterator.fail", { return Status::InternalError("new_iterator for bloom filter index failed"); }); - iterator->reset(new BloomFilterIndexIterator(this)); + iterator->reset(new BloomFilterIndexIterator(this, index_load_stats)); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h index fcb0239a2440fa..fb53af89c0fe92 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h @@ -48,17 +48,18 @@ class BloomFilterIndexReader : public MetadataAdder { } Status load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* _bf_index_load_stats = nullptr); + OlapReaderStatistics* bf_index_load_stats); BloomFilterAlgorithmPB algorithm() { return _bloom_filter_index_meta->algorithm(); } // create a new column iterator. 
- Status new_iterator(std::unique_ptr* iterator); + Status new_iterator(std::unique_ptr* iterator, + OlapReaderStatistics* index_load_stats); const TypeInfo* type_info() const { return _type_info; } private: - Status _load(bool use_page_cache, bool kept_in_memory); + Status _load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats); int64_t get_metadata_size() const override; @@ -70,13 +71,12 @@ class BloomFilterIndexReader : public MetadataAdder { const TypeInfo* _type_info = nullptr; std::unique_ptr _bloom_filter_index_meta = nullptr; std::unique_ptr _bloom_filter_reader; - OlapReaderStatistics* _index_load_stats = nullptr; }; class BloomFilterIndexIterator { public: - explicit BloomFilterIndexIterator(BloomFilterIndexReader* reader) - : _reader(reader), _bloom_filter_iter(reader->_bloom_filter_reader.get()) {} + explicit BloomFilterIndexIterator(BloomFilterIndexReader* reader, OlapReaderStatistics* stats) + : _reader(reader), _bloom_filter_iter(reader->_bloom_filter_reader.get(), stats) {} // Read bloom filter at the given ordinal into `bf`. 
Status read_bloom_filter(rowid_t ordinal, std::unique_ptr* bf); diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 3f9fb94df0a844..0326512c3d76ca 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -78,9 +78,10 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { for (int i = 0; i < count; ++i) { if (_values.find(*v) == _values.end()) { if constexpr (_is_slice_type()) { - CppType new_value; - RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena)); - _values.insert(new_value); + const auto* s = reinterpret_cast(v); + auto hash = + DORIS_TRY(BloomFilter::hash(s->data, s->size, _bf_options.strategy)); + _hash_values.insert(hash); } else if constexpr (_is_int128()) { int128_t new_value; memcpy(&new_value, v, sizeof(PackedInt128)); @@ -99,25 +100,28 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { Status flush() override { std::unique_ptr bf; RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf)); - RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); - bf->set_has_null(_has_null); - for (auto& v : _values) { - if constexpr (_is_slice_type()) { - auto* s = (Slice*)&v; - bf->add_bytes(s->data, s->size); - } else { + if constexpr (_is_slice_type()) { + RETURN_IF_ERROR(bf->init(_hash_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (const auto& h : _hash_values) { + bf->add_hash(h); + } + } else { + RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (auto& v : _values) { bf->add_bytes((char*)&v, sizeof(CppType)); } } + bf->set_has_null(_has_null); _bf_buffer_size += bf->size(); _bfs.push_back(std::move(bf)); _values.clear(); + _hash_values.clear(); _has_null = false; return Status::OK(); } Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* 
index_meta) override { - if (_values.size() > 0) { + if (_values.size() > 0 || !_hash_values.empty()) { RETURN_IF_ERROR(flush()); } index_meta->set_type(BLOOM_FILTER_INDEX); @@ -166,6 +170,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { // distinct values ValueDict _values; std::vector> _bfs; + std::set _hash_values; }; } // namespace diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index b96cf4f7e6794e..1abb60e58507ec 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -374,10 +374,12 @@ Status ColumnReader::read_page(const ColumnIteratorOptions& iter_opts, const Pag Status ColumnReader::get_row_ranges_by_zone_map( const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, RowRanges* row_ranges) { + const std::vector* delete_predicates, RowRanges* row_ranges, + const ColumnIteratorOptions& iter_opts) { std::vector page_indexes; - RETURN_IF_ERROR(_get_filtered_pages(col_predicates, delete_predicates, &page_indexes)); - RETURN_IF_ERROR(_calculate_row_ranges(page_indexes, row_ranges)); + RETURN_IF_ERROR( + _get_filtered_pages(col_predicates, delete_predicates, &page_indexes, iter_opts)); + RETURN_IF_ERROR(_calculate_row_ranges(page_indexes, row_ranges, iter_opts)); return Status::OK(); } @@ -514,8 +516,8 @@ bool ColumnReader::_zone_map_match_condition(const ZoneMapPB& zone_map, Status ColumnReader::_get_filtered_pages( const AndBlockColumnPredicate* col_predicates, const std::vector* delete_predicates, - std::vector* page_indexes) { - RETURN_IF_ERROR(_load_zone_map_index(_use_index_page_cache, _opts.kept_in_memory)); + std::vector* page_indexes, const ColumnIteratorOptions& iter_opts) { + RETURN_IF_ERROR(_load_zone_map_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); FieldType type = _type_info->type(); const std::vector& zone_maps = _zone_map_index->page_zone_maps(); @@ 
-553,9 +555,10 @@ Status ColumnReader::_get_filtered_pages( } Status ColumnReader::_calculate_row_ranges(const std::vector& page_indexes, - RowRanges* row_ranges) { + RowRanges* row_ranges, + const ColumnIteratorOptions& iter_opts) { row_ranges->clear(); - RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory)); + RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); for (auto i : page_indexes) { ordinal_t page_first_id = _ordinal_index->get_first_ordinal(i); ordinal_t page_last_id = _ordinal_index->get_last_ordinal(i); @@ -566,12 +569,14 @@ Status ColumnReader::_calculate_row_ranges(const std::vector& page_ind } Status ColumnReader::get_row_ranges_by_bloom_filter(const AndBlockColumnPredicate* col_predicates, - RowRanges* row_ranges) { - RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory)); - RETURN_IF_ERROR(_load_bloom_filter_index(_use_index_page_cache, _opts.kept_in_memory)); + RowRanges* row_ranges, + const ColumnIteratorOptions& iter_opts) { + RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); + RETURN_IF_ERROR( + _load_bloom_filter_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); RowRanges bf_row_ranges; std::unique_ptr bf_iter; - RETURN_IF_ERROR(_bloom_filter_index->new_iterator(&bf_iter)); + RETURN_IF_ERROR(_bloom_filter_index->new_iterator(&bf_iter, iter_opts.stats)); size_t range_size = row_ranges->range_size(); // get covered page ids std::set page_ids; @@ -598,16 +603,18 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(const AndBlockColumnPredicat return Status::OK(); } -Status ColumnReader::_load_ordinal_index(bool use_page_cache, bool kept_in_memory) { +Status ColumnReader::_load_ordinal_index(bool use_page_cache, bool kept_in_memory, + const ColumnIteratorOptions& iter_opts) { if (!_ordinal_index) { return Status::InternalError("ordinal_index not inited"); } - return 
_ordinal_index->load(use_page_cache, kept_in_memory); + return _ordinal_index->load(use_page_cache, kept_in_memory, iter_opts.stats); } -Status ColumnReader::_load_zone_map_index(bool use_page_cache, bool kept_in_memory) { +Status ColumnReader::_load_zone_map_index(bool use_page_cache, bool kept_in_memory, + const ColumnIteratorOptions& iter_opts) { if (_zone_map_index != nullptr) { - return _zone_map_index->load(use_page_cache, kept_in_memory); + return _zone_map_index->load(use_page_cache, kept_in_memory, iter_opts.stats); } return Status::OK(); } @@ -681,15 +688,17 @@ bool ColumnReader::has_bloom_filter_index(bool ngram) const { } } -Status ColumnReader::_load_bloom_filter_index(bool use_page_cache, bool kept_in_memory) { +Status ColumnReader::_load_bloom_filter_index(bool use_page_cache, bool kept_in_memory, + const ColumnIteratorOptions& iter_opts) { if (_bloom_filter_index != nullptr) { - return _bloom_filter_index->load(use_page_cache, kept_in_memory); + return _bloom_filter_index->load(use_page_cache, kept_in_memory, iter_opts.stats); } return Status::OK(); } -Status ColumnReader::seek_to_first(OrdinalPageIndexIterator* iter) { - RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory)); +Status ColumnReader::seek_to_first(OrdinalPageIndexIterator* iter, + const ColumnIteratorOptions& iter_opts) { + RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); *iter = _ordinal_index->begin(); if (!iter->valid()) { return Status::NotFound("Failed to seek to first rowid"); @@ -697,8 +706,9 @@ Status ColumnReader::seek_to_first(OrdinalPageIndexIterator* iter) { return Status::OK(); } -Status ColumnReader::seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter) { - RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory)); +Status ColumnReader::seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter, + const ColumnIteratorOptions& iter_opts) { + 
RETURN_IF_ERROR(_load_ordinal_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); *iter = _ordinal_index->seek_at_or_before(ordinal); if (!iter->valid()) { return Status::NotFound("Failed to seek to ordinal {}, ", ordinal); @@ -871,8 +881,18 @@ Status MapFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr size_t num_read = *n; auto null_map_ptr = static_cast(*dst).get_null_map_column_ptr(); - bool null_signs_has_null = false; - RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + // in not-null to null linked-schemachange mode, + // actually we do not change dat data include meta in footer, + // so may dst from changed meta which is nullable but old data is not nullable, + // if so, we should set null_map to all null by default + if (_null_iterator) { + bool null_signs_has_null = false; + RETURN_IF_ERROR( + _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + } else { + auto& null_map = assert_cast(*null_map_ptr); + null_map.insert_many_vals(0, num_read); + } DCHECK(num_read == *n); } return Status::OK(); @@ -932,8 +952,18 @@ Status StructFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumn size_t num_read = *n; auto null_map_ptr = static_cast(*dst).get_null_map_column_ptr(); - bool null_signs_has_null = false; - RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + // in not-null to null linked-schemachange mode, + // actually we do not change dat data include meta in footer, + // so may dst from changed meta which is nullable but old data is not nullable, + // if so, we should set null_map to all null by default + if (_null_iterator) { + bool null_signs_has_null = false; + RETURN_IF_ERROR( + _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + } else { + auto& null_map = assert_cast(*null_map_ptr); + null_map.insert_many_vals(0, num_read); + } DCHECK(num_read == *n); } @@ -1086,8 +1116,18 
@@ Status ArrayFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnP auto null_map_ptr = static_cast(*dst).get_null_map_column_ptr(); size_t num_read = *n; - bool null_signs_has_null = false; - RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + // in not-null to null linked-schemachange mode, + // actually we do not change dat data include meta in footer, + // so may dst from changed meta which is nullable but old data is not nullable, + // if so, we should set null_map to all null by default + if (_null_iterator) { + bool null_signs_has_null = false; + RETURN_IF_ERROR( + _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + } else { + auto& null_map = assert_cast(*null_map_ptr); + null_map.insert_many_vals(0, num_read); + } DCHECK(num_read == *n); } @@ -1142,7 +1182,7 @@ Status FileColumnIterator::init(const ColumnIteratorOptions& opts) { FileColumnIterator::~FileColumnIterator() = default; Status FileColumnIterator::seek_to_first() { - RETURN_IF_ERROR(_reader->seek_to_first(&_page_iter)); + RETURN_IF_ERROR(_reader->seek_to_first(&_page_iter, _opts)); RETURN_IF_ERROR(_read_data_page(_page_iter)); _seek_to_pos_in_page(&_page, 0); @@ -1153,7 +1193,7 @@ Status FileColumnIterator::seek_to_first() { Status FileColumnIterator::seek_to_ordinal(ordinal_t ord) { // if current page contains this row, we don't need to seek if (!_page || !_page.contains(ord) || !_page_iter.valid()) { - RETURN_IF_ERROR(_reader->seek_at_or_before(ord, &_page_iter)); + RETURN_IF_ERROR(_reader->seek_at_or_before(ord, &_page_iter, _opts)); RETURN_IF_ERROR(_read_data_page(_page_iter)); } _seek_to_pos_in_page(&_page, ord - _page.first_ordinal); @@ -1227,8 +1267,8 @@ Status FileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& d DCHECK_EQ(this_run, num_rows); } else { *has_null = true; - auto* null_col = - vectorized::check_and_get_column(dst); + const auto* null_col = + 
vectorized::check_and_get_column(dst.get()); if (null_col != nullptr) { const_cast(null_col)->insert_null_elements( this_run); @@ -1288,8 +1328,9 @@ Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t co auto origin_index = _page.data_decoder->current_index(); if (this_read_count > 0) { if (is_null) { - auto* null_col = - vectorized::check_and_get_column(dst); + const auto* null_col = + vectorized::check_and_get_column( + dst.get()); if (UNLIKELY(null_col == nullptr)) { return Status::InternalError("unexpected column type in column reader"); } @@ -1401,8 +1442,8 @@ Status FileColumnIterator::get_row_ranges_by_zone_map( const AndBlockColumnPredicate* col_predicates, const std::vector* delete_predicates, RowRanges* row_ranges) { if (_reader->has_zone_map()) { - RETURN_IF_ERROR( - _reader->get_row_ranges_by_zone_map(col_predicates, delete_predicates, row_ranges)); + RETURN_IF_ERROR(_reader->get_row_ranges_by_zone_map(col_predicates, delete_predicates, + row_ranges, _opts)); } return Status::OK(); } @@ -1411,7 +1452,7 @@ Status FileColumnIterator::get_row_ranges_by_bloom_filter( const AndBlockColumnPredicate* col_predicates, RowRanges* row_ranges) { if ((col_predicates->can_do_bloom_filter(false) && _reader->has_bloom_filter_index(false)) || (col_predicates->can_do_bloom_filter(true) && _reader->has_bloom_filter_index(true))) { - RETURN_IF_ERROR(_reader->get_row_ranges_by_bloom_filter(col_predicates, row_ranges)); + RETURN_IF_ERROR(_reader->get_row_ranges_by_bloom_filter(col_predicates, row_ranges, _opts)); } return Status::OK(); } @@ -1670,9 +1711,9 @@ Status DefaultNestedColumnIterator::next_batch(size_t* n, vectorized::MutableCol static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst, vectorized::MutableColumnPtr& sibling_column, size_t nrows) { const auto* sibling_array = vectorized::check_and_get_column( - remove_nullable(sibling_column->get_ptr())); + remove_nullable(sibling_column->get_ptr()).get()); const auto* 
dst_array = vectorized::check_and_get_column( - remove_nullable(dst->get_ptr())); + remove_nullable(dst->get_ptr()).get()); if (!dst_array || !sibling_array) { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Expected array column, but met %s and %s", dst->get_name(), diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index d72d802f97769b..7e32b3a09b34da 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -148,8 +148,9 @@ class ColumnReader : public MetadataAdder { std::unique_ptr* iterator); // Seek to the first entry in the column. - Status seek_to_first(OrdinalPageIndexIterator* iter); - Status seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter); + Status seek_to_first(OrdinalPageIndexIterator* iter, const ColumnIteratorOptions& iter_opts); + Status seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter, + const ColumnIteratorOptions& iter_opts); // read a page from file into a page handle Status read_page(const ColumnIteratorOptions& iter_opts, const PagePointer& pp, @@ -175,11 +176,13 @@ class ColumnReader : public MetadataAdder { // - delete_condition is a delete predicate of one version Status get_row_ranges_by_zone_map(const AndBlockColumnPredicate* col_predicates, const std::vector* delete_predicates, - RowRanges* row_ranges); + RowRanges* row_ranges, + const ColumnIteratorOptions& iter_opts); // get row ranges with bloom filter index Status get_row_ranges_by_bloom_filter(const AndBlockColumnPredicate* col_predicates, - RowRanges* row_ranges); + RowRanges* row_ranges, + const ColumnIteratorOptions& iter_opts); PagePointer get_dict_page_pointer() const { return _meta_dict_page; } @@ -219,13 +222,16 @@ class ColumnReader : public MetadataAdder { return Status::OK(); } - [[nodiscard]] Status _load_zone_map_index(bool use_page_cache, bool kept_in_memory); - [[nodiscard]] Status _load_ordinal_index(bool 
use_page_cache, bool kept_in_memory); + [[nodiscard]] Status _load_zone_map_index(bool use_page_cache, bool kept_in_memory, + const ColumnIteratorOptions& iter_opts); + [[nodiscard]] Status _load_ordinal_index(bool use_page_cache, bool kept_in_memory, + const ColumnIteratorOptions& iter_opts); [[nodiscard]] Status _load_bitmap_index(bool use_page_cache, bool kept_in_memory); [[nodiscard]] Status _load_inverted_index_index( std::shared_ptr index_file_reader, const TabletIndex* index_meta); - [[nodiscard]] Status _load_bloom_filter_index(bool use_page_cache, bool kept_in_memory); + [[nodiscard]] Status _load_bloom_filter_index(bool use_page_cache, bool kept_in_memory, + const ColumnIteratorOptions& iter_opts); bool _zone_map_match_condition(const ZoneMapPB& zone_map, WrapperField* min_value_container, WrapperField* max_value_container, @@ -239,9 +245,11 @@ class ColumnReader : public MetadataAdder { Status _get_filtered_pages(const AndBlockColumnPredicate* col_predicates, const std::vector* delete_predicates, - std::vector* page_indexes); + std::vector* page_indexes, + const ColumnIteratorOptions& iter_opts); - Status _calculate_row_ranges(const std::vector& page_indexes, RowRanges* row_ranges); + Status _calculate_row_ranges(const std::vector& page_indexes, RowRanges* row_ranges, + const ColumnIteratorOptions& iter_opts); int64_t get_metadata_size() const override; diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp index db6bac6b8b4c09..fe7167e9444a76 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp @@ -80,8 +80,7 @@ Status HierarchicalDataReader::init(const ColumnIteratorOptions& opts) { } Status HierarchicalDataReader::seek_to_first() { - LOG(FATAL) << "Not implemented"; - __builtin_unreachable(); + throw Exception(Status::FatalError("Not implemented")); } Status 
HierarchicalDataReader::seek_to_ordinal(ordinal_t ord) { @@ -159,8 +158,7 @@ Status ExtractReader::init(const ColumnIteratorOptions& opts) { } Status ExtractReader::seek_to_first() { - LOG(FATAL) << "Not implemented"; - __builtin_unreachable(); + throw Exception(Status::FatalError("Not implemented")); } Status ExtractReader::seek_to_ordinal(ordinal_t ord) { diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h index f85038713cadb7..bd5de7484740a8 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h @@ -165,8 +165,8 @@ class HierarchicalDataReader : public ColumnIterator { // will type the type of ColumnObject::NESTED_TYPE, whih is Nullable>. for (auto& entry : nested_subcolumns) { MutableColumnPtr nested_object = ColumnObject::create(true, false); - const auto* base_array = - check_and_get_column(remove_nullable(entry.second[0].column)); + const auto* base_array = check_and_get_column( + remove_nullable(entry.second[0].column).get()); MutableColumnPtr offset = base_array->get_offsets_ptr()->assume_mutable(); auto* nested_object_ptr = assert_cast(nested_object.get()); // flatten nested arrays diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp index da6beff5d8d6a2..3f582293ee4d7f 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp @@ -66,7 +66,6 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats) { _use_page_cache = use_page_cache; _kept_in_memory = kept_in_memory; - _index_load_stats = index_load_stats; _type_info = get_scalar_type_info((FieldType)_meta.data_type()); if (_type_info == nullptr) { @@ -82,7 +81,7 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, } else { 
RETURN_IF_ERROR(load_index_page(_meta.ordinal_index_meta().root_page(), &_ordinal_index_page_handle, - _ordinal_index_reader.get())); + _ordinal_index_reader.get(), index_load_stats)); _has_index_page = true; } } @@ -93,7 +92,8 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, _sole_data_page = PagePointer(_meta.value_index_meta().root_page()); } else { RETURN_IF_ERROR(load_index_page(_meta.value_index_meta().root_page(), - &_value_index_page_handle, _value_index_reader.get())); + &_value_index_page_handle, _value_index_reader.get(), + index_load_stats)); _has_index_page = true; } } @@ -104,13 +104,14 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, } Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* handle, - IndexPageReader* reader) { + IndexPageReader* reader, + OlapReaderStatistics* index_load_stats) { Slice body; PageFooterPB footer; BlockCompressionCodec* local_compress_codec; RETURN_IF_ERROR(get_block_compression_codec(_meta.compression(), &local_compress_codec)); RETURN_IF_ERROR(read_page(PagePointer(pp), handle, &body, &footer, INDEX_PAGE, - local_compress_codec, false, _index_load_stats)); + local_compress_codec, false, index_load_stats)); RETURN_IF_ERROR(reader->parse(body, footer.index_page_footer())); _mem_size += body.get_size(); return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.h b/be/src/olap/rowset/segment_v2/indexed_column_reader.h index c9640c0007c153..6e62feaafdcdd1 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.h @@ -76,7 +76,8 @@ class IndexedColumnReader : public MetadataAdder { void set_is_pk_index(bool is_pk) { _is_pk_index = is_pk; } private: - Status load_index_page(const PagePointerPB& pp, PageHandle* handle, IndexPageReader* reader); + Status load_index_page(const PagePointerPB& pp, PageHandle* handle, IndexPageReader* reader, + 
OlapReaderStatistics* index_load_stats); int64_t get_metadata_size() const override; @@ -103,7 +104,6 @@ class IndexedColumnReader : public MetadataAdder { const KeyCoder* _value_key_coder = nullptr; uint64_t _mem_size = 0; bool _is_pk_index = false; - OlapReaderStatistics* _index_load_stats = nullptr; }; class IndexedColumnIterator { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index f988c46c027c26..dcbdca921ab8e8 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -79,7 +79,16 @@ Status compact_column(int64_t index_id, // delete temporary segment_path, only when inverted_index_ram_dir_enable is false if (!config::inverted_index_ram_dir_enable) { - std::ignore = io::global_local_filesystem()->delete_directory(tmp_path.data()); + auto st = io::global_local_filesystem()->delete_directory(tmp_path.data()); + DBUG_EXECUTE_IF("compact_column_delete_tmp_path_error", { + st = Status::Error( + "debug point: compact_column_delete_tmp_path_error in index compaction"); + }) + if (!st.ok()) { + LOG(WARNING) << "compact column failed to delete tmp path: " << tmp_path + << ", error: " << st.to_string(); + return st; + } } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp index 60006ea84550a2..c30017cc8fe737 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp @@ -96,12 +96,19 @@ void CSIndexInput::readInternal(uint8_t* b, const int32_t len) { if (start + len > _length) { _CLTHROWA(CL_ERR_IO, "read past EOF"); } - base->setIoContext(_io_ctx); + + if (_io_ctx) { + base->setIoContext(_io_ctx); + } + base->setIndexFile(_is_index_file); base->seek(fileOffset + start); bool read_from_buffer = 
true; base->readBytes(b, len, read_from_buffer); - base->setIoContext(nullptr); + + if (_io_ctx) { + base->setIoContext(nullptr); + } } CSIndexInput::~CSIndexInput() = default; @@ -231,6 +238,9 @@ const char* DorisCompoundReader::getObjectName() const { } bool DorisCompoundReader::list(std::vector* names) const { + if (_closed || _entries == nullptr) { + _CLTHROWA(CL_ERR_IO, "DorisCompoundReader is already closed"); + } for (EntriesType::const_iterator i = _entries->begin(); i != _entries->end(); i++) { names->push_back(i->first); } @@ -238,6 +248,9 @@ bool DorisCompoundReader::list(std::vector* names) const { } bool DorisCompoundReader::fileExists(const char* name) const { + if (_closed || _entries == nullptr) { + _CLTHROWA(CL_ERR_IO, "DorisCompoundReader is already closed"); + } return _entries->exists((char*)name); } @@ -246,6 +259,9 @@ int64_t DorisCompoundReader::fileModified(const char* name) const { } int64_t DorisCompoundReader::fileLength(const char* name) const { + if (_closed || _entries == nullptr) { + _CLTHROWA(CL_ERR_IO, "DorisCompoundReader is already closed"); + } ReaderFileEntry* e = _entries->get((char*)name); if (e == nullptr) { char buf[CL_MAX_PATH + 30]; @@ -260,6 +276,10 @@ int64_t DorisCompoundReader::fileLength(const char* name) const { bool DorisCompoundReader::openInput(const char* name, std::unique_ptr& ret, CLuceneError& error, int32_t bufferSize) { + if (_closed || _entries == nullptr) { + error.set(CL_ERR_IO, "DorisCompoundReader is already closed"); + return false; + } lucene::store::IndexInput* tmp; bool success = openInput(name, tmp, error, bufferSize); if (success) { @@ -303,6 +323,10 @@ void DorisCompoundReader::close() { _CLDELETE(_stream) } if (_entries != nullptr) { + // The life cycle of _entries should be consistent with that of the DorisCompoundReader. + // DO NOT DELETE _entries here, it will be deleted in the destructor + // When directory is closed, all _entries are cleared. 
But the directory may be called in other places. + // If we delete the _entries object here, it will cause core dump. _entries->clear(); } if (_ram_dir) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h index a30c39f8a2ffdd..1c7bc159b9ca09 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h @@ -67,6 +67,7 @@ class CLUCENE_EXPORT DorisCompoundReader : public lucene::store::Directory { private: lucene::store::RAMDirectory* _ram_dir = nullptr; CL_NS(store)::IndexInput* _stream = nullptr; + // The life cycle of _entries should be consistent with that of the DorisCompoundReader. EntriesType* _entries = nullptr; std::mutex _this_lock; bool _closed = false; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp index 8d480829a0cd37..813a78f2a3fa86 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp @@ -27,21 +27,27 @@ namespace doris::segment_v2 { -Status InvertedIndexFileReader::init(int32_t read_buffer_size) { +Status InvertedIndexFileReader::init(int32_t read_buffer_size, const io::IOContext* io_ctx) { if (!_inited) { _read_buffer_size = read_buffer_size; if (_storage_format >= InvertedIndexStorageFormatPB::V2) { - auto st = _init_from(read_buffer_size); + auto st = _init_from(read_buffer_size, io_ctx); if (!st.ok()) { return st; } } _inited = true; + } else { + if (_storage_format == InvertedIndexStorageFormatPB::V2) { + if (_stream) { + _stream->setIoContext(io_ctx); + } + } } return Status::OK(); } -Status InvertedIndexFileReader::_init_from(int32_t read_buffer_size) { +Status InvertedIndexFileReader::_init_from(int32_t read_buffer_size, const io::IOContext* io_ctx) { auto index_file_full_path = 
InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); std::unique_lock lock(_mutex); // Lock for writing @@ -76,6 +82,7 @@ Status InvertedIndexFileReader::_init_from(int32_t read_buffer_size) { err.what()); } _stream = std::unique_ptr(index_input); + _stream->setIoContext(io_ctx); // 3. read file int32_t version = _stream->readInt(); // Read version number diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h index 443d40cfaf0d4f..ed6ee85e7d7bf1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h @@ -58,7 +58,8 @@ class InvertedIndexFileReader { _storage_format(storage_format), _idx_file_info(idx_file_info) {} - Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size); + Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size, + const io::IOContext* io_ctx = nullptr); Result> open(const TabletIndex* index_meta) const; void debug_file_entries(); std::string get_index_file_cache_key(const TabletIndex* index_meta) const; @@ -70,7 +71,7 @@ class InvertedIndexFileReader { int64_t get_inverted_file_size() const { return _stream == nullptr ? 
0 : _stream->length(); } private: - Status _init_from(int32_t read_buffer_size); + Status _init_from(int32_t read_buffer_size, const io::IOContext* io_ctx); Result> _open(int64_t index_id, const std::string& index_suffix) const; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp index fe0a81c41a6970..2d29f09f00e659 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp @@ -812,10 +812,8 @@ bool DorisRAMFSDirectory::doDeleteFile(const char* name) { SCOPED_LOCK_MUTEX(this->THIS_LOCK); sizeInBytes -= itr->second->sizeInBytes; filesMap->removeitr(itr); - return true; - } else { - return false; } + return true; } bool DorisRAMFSDirectory::deleteDirectory() { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h index dde436054cd35b..41d9fb48356299 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h @@ -180,6 +180,7 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput : BufferedIndexInput(buffer_size) { this->_pos = 0; this->_handle = std::move(handle); + _io_ctx.is_inverted_index = true; } protected: diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 889fee1fc87ef9..9790d7273e1bff 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -121,7 +121,8 @@ Status InvertedIndexReader::read_null_bitmap(const io::IOContext* io_ctx, if (!dir) { // TODO: ugly code here, try to refact. 
- auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size); + auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, + io_ctx); if (!st.ok()) { LOG(WARNING) << st; return st; @@ -137,7 +138,6 @@ Status InvertedIndexReader::read_null_bitmap(const io::IOContext* io_ctx, InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); if (dir->fileExists(null_bitmap_file_name)) { null_bitmap_in = dir->openInput(null_bitmap_file_name); - null_bitmap_in->setIoContext(io_ctx); size_t null_bitmap_size = null_bitmap_in->length(); faststring buf; buf.resize(null_bitmap_size); @@ -164,23 +164,56 @@ Status InvertedIndexReader::read_null_bitmap(const io::IOContext* io_ctx, return Status::OK(); } +Status InvertedIndexReader::handle_query_cache(RuntimeState* runtime_state, + InvertedIndexQueryCache* cache, + const InvertedIndexQueryCache::CacheKey& cache_key, + InvertedIndexQueryCacheHandle* cache_handler, + OlapReaderStatistics* stats, + std::shared_ptr& bit_map) { + const auto& query_options = runtime_state->query_options(); + if (query_options.enable_inverted_index_query_cache && + cache->lookup(cache_key, cache_handler)) { + DBUG_EXECUTE_IF("InvertedIndexReader.handle_query_cache_hit", { + return Status::Error("handle query cache hit"); + }); + stats->inverted_index_query_cache_hit++; + SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_copy_timer); + bit_map = cache_handler->get_bitmap(); + return Status::OK(); + } + DBUG_EXECUTE_IF("InvertedIndexReader.handle_query_cache_miss", { + return Status::Error("handle query cache miss"); + }); + stats->inverted_index_query_cache_miss++; + return Status::Error("cache miss"); +} + Status InvertedIndexReader::handle_searcher_cache( - InvertedIndexCacheHandle* inverted_index_cache_handle, const io::IOContext* io_ctx, - OlapReaderStatistics* stats) { + RuntimeState* runtime_state, InvertedIndexCacheHandle* inverted_index_cache_handle, + const io::IOContext* io_ctx, 
OlapReaderStatistics* stats) { auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); - if (InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, + const auto& query_options = runtime_state->query_options(); + if (query_options.enable_inverted_index_searcher_cache && + InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, inverted_index_cache_handle)) { + DBUG_EXECUTE_IF("InvertedIndexReader.handle_searcher_cache_hit", { + return Status::Error("handle searcher cache hit"); + }); stats->inverted_index_searcher_cache_hit++; return Status::OK(); } else { + DBUG_EXECUTE_IF("InvertedIndexReader.handle_searcher_cache_miss", { + return Status::Error("handle searcher cache miss"); + }); // searcher cache miss stats->inverted_index_searcher_cache_miss++; auto mem_tracker = std::make_unique("InvertedIndexSearcherCacheWithRead"); SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); IndexSearcherPtr searcher; - auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size); + auto st = + _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, io_ctx); if (!st.ok()) { LOG(WARNING) << st; return st; @@ -211,6 +244,9 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, auto searcher_result = DORIS_TRY(index_searcher_builder->get_index_searcher(dir)); *searcher = searcher_result; + // When the meta information has been read, the ioContext needs to be reset to prevent it from being used by other queries. + static_cast(dir)->getDorisIndexInput()->setIoContext(nullptr); + // NOTE: before mem_tracker hook becomes active, we caculate reader memory size by hand. 
mem_tracker->consume(index_searcher_builder->get_reader_size()); return Status::OK(); @@ -307,14 +343,16 @@ Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatist InvertedIndexQueryCacheHandle cache_handler; std::shared_ptr term_match_bitmap = nullptr; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); + auto cache_status = + handle_query_cache(runtime_state, cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { return Status::OK(); } FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); + RETURN_IF_ERROR( + handle_searcher_cache(runtime_state, &inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) { @@ -375,7 +413,8 @@ Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, search_str}; auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); + auto cache_status = + handle_query_cache(runtime_state, cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { return Status::OK(); } @@ -389,7 +428,8 @@ Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); + RETURN_IF_ERROR( + handle_searcher_cache(runtime_state, &inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) 
{ @@ -605,11 +645,12 @@ Status BkdIndexReader::invoke_bkd_query(const void* query_value, InvertedIndexQu } Status BkdIndexReader::try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, uint32_t* count) { + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + uint32_t* count) { try { std::shared_ptr r; - auto st = get_bkd_reader(r, io_ctx, stats); + auto st = get_bkd_reader(r, io_ctx, stats, runtime_state); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -625,7 +666,8 @@ Status BkdIndexReader::try_query(const io::IOContext* io_ctx, OlapReaderStatisti auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; std::shared_ptr bit_map; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); + auto cache_status = + handle_query_cache(runtime_state, cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { *count = bit_map->cardinality(); return Status::OK(); @@ -649,7 +691,7 @@ Status BkdIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* try { std::shared_ptr r; - auto st = get_bkd_reader(r, io_ctx, stats); + auto st = get_bkd_reader(r, io_ctx, stats, runtime_state); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -664,7 +706,8 @@ Status BkdIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* query_str}; auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); + auto cache_status = + handle_query_cache(runtime_state, cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { 
return Status::OK(); } @@ -686,10 +729,11 @@ Status BkdIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* } Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, const io::IOContext* io_ctx, - OlapReaderStatistics* stats) { + OlapReaderStatistics* stats, RuntimeState* runtime_state) { BKDIndexSearcherPtr* bkd_searcher = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); + RETURN_IF_ERROR( + handle_searcher_cache(runtime_state, &inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); bkd_searcher = std::get_if(&searcher_variant); if (bkd_searcher) { @@ -1134,8 +1178,8 @@ Status InvertedIndexIterator::try_read_from_inverted_index(const std::string& co query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY || query_type == InvertedIndexQueryType::LESS_THAN_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { - RETURN_IF_ERROR( - _reader->try_query(&_io_ctx, _stats, column_name, query_value, query_type, count)); + RETURN_IF_ERROR(_reader->try_query(&_io_ctx, _stats, _runtime_state, column_name, + query_value, query_type, count)); } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index a1445603286619..bbd148fae5250d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -190,8 +190,9 @@ class InvertedIndexReader : public std::enable_shared_from_this& bit_map) = 0; virtual Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, uint32_t* count) = 0; + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + uint32_t* 
count) = 0; Status read_null_bitmap(const io::IOContext* io_ctx, OlapReaderStatistics* stats, InvertedIndexQueryCacheHandle* cache_handle, @@ -208,22 +209,14 @@ class InvertedIndexReader : public std::enable_shared_from_this& bit_map) { - if (cache->lookup(cache_key, cache_handler)) { - stats->inverted_index_query_cache_hit++; - SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_copy_timer); - bit_map = cache_handler->get_bitmap(); - return Status::OK(); - } - stats->inverted_index_query_cache_miss++; - return Status::Error("cache miss"); - } + std::shared_ptr& bit_map); - virtual Status handle_searcher_cache(InvertedIndexCacheHandle* inverted_index_cache_handle, + virtual Status handle_searcher_cache(RuntimeState* runtime_state, + InvertedIndexCacheHandle* inverted_index_cache_handle, const io::IOContext* io_ctx, OlapReaderStatistics* stats); std::string get_index_file_path(); static Status create_index_searcher(lucene::store::Directory* dir, IndexSearcherPtr* searcher, @@ -262,8 +255,9 @@ class FullTextIndexReader : public InvertedIndexReader { const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, uint32_t* count) override { + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + uint32_t* count) override { return Status::Error( "FullTextIndexReader not support try_query"); } @@ -289,8 +283,9 @@ class StringTypeInvertedIndexReader : public InvertedIndexReader { const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, uint32_t* count) override { + RuntimeState* runtime_state, 
const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + uint32_t* count) override { return Status::Error( "StringTypeInvertedIndexReader not support try_query"); } @@ -350,8 +345,9 @@ class BkdIndexReader : public InvertedIndexReader { const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, uint32_t* count) override; + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + uint32_t* count) override; Status invoke_bkd_try_query(const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr r, uint32_t* count); Status invoke_bkd_query(const void* query_value, InvertedIndexQueryType query_type, @@ -364,7 +360,7 @@ class BkdIndexReader : public InvertedIndexReader { InvertedIndexReaderType type() override; Status get_bkd_reader(BKDIndexSearcherPtr& reader, const io::IOContext* io_ctx, - OlapReaderStatistics* stats); + OlapReaderStatistics* stats, RuntimeState* runtime_state); private: const TypeInfo* _type_info {}; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 02edf2f1976e3e..4136adab64a132 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -138,12 +138,14 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { try { DBUG_EXECUTE_IF("InvertedIndexColumnWriter::close_on_error_throw_exception", { _CLTHROWA(CL_ERR_IO, "debug point: close on error"); }) - if (_index_writer) { - _index_writer->close(); - } + // delete directory must be done before index_writer close + // because index_writer will close the directory if (_dir) { _dir->deleteDirectory(); } + if 
(_index_writer) { + _index_writer->close(); + } } catch (CLuceneError& e) { LOG(ERROR) << "InvertedIndexWriter close_on_error failure: " << e.what(); } @@ -714,12 +716,14 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { std::unique_ptr _doc = nullptr; lucene::document::Field* _field = nullptr; bool _single_field = true; + // Since _index_writer's write.lock is created by _dir.lockFactory, + // _dir must destruct after _index_writer, so _dir must be defined before _index_writer. + std::shared_ptr _dir = nullptr; std::unique_ptr _index_writer = nullptr; std::unique_ptr _analyzer = nullptr; std::unique_ptr _char_string_reader = nullptr; std::shared_ptr _bkd_writer = nullptr; InvertedIndexCtxSPtr _inverted_index_ctx = nullptr; - std::shared_ptr _dir = nullptr; const KeyCoder* _value_key_coder; const TabletIndex* _index_meta; InvertedIndexParserType _parser_type; diff --git a/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp b/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp index 9ee82bacdd73d2..4995e779892646 100644 --- a/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp +++ b/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp @@ -69,15 +69,17 @@ Status OrdinalIndexWriter::finish(io::FileWriter* file_writer, ColumnIndexMetaPB return Status::OK(); } -Status OrdinalIndexReader::load(bool use_page_cache, bool kept_in_memory) { +Status OrdinalIndexReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { // TODO yyq: implement a new once flag to avoid status construct. 
- return _load_once.call([this, use_page_cache, kept_in_memory] { - return _load(use_page_cache, kept_in_memory, std::move(_meta_pb)); + return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats] { + return _load(use_page_cache, kept_in_memory, std::move(_meta_pb), index_load_stats); }); } Status OrdinalIndexReader::_load(bool use_page_cache, bool kept_in_memory, - std::unique_ptr index_meta) { + std::unique_ptr index_meta, + OlapReaderStatistics* stats) { if (index_meta->root_page().is_root_data_page()) { // only one data page, no index page _num_pages = 1; @@ -88,6 +90,7 @@ Status OrdinalIndexReader::_load(bool use_page_cache, bool kept_in_memory, } // need to read index page OlapReaderStatistics tmp_stats; + OlapReaderStatistics* stats_ptr = stats != nullptr ? stats : &tmp_stats; PageReadOptions opts { .use_page_cache = use_page_cache, .kept_in_memory = kept_in_memory, @@ -96,8 +99,9 @@ Status OrdinalIndexReader::_load(bool use_page_cache, bool kept_in_memory, .page_pointer = PagePointer(index_meta->root_page().root_page()), // ordinal index page uses NO_COMPRESSION right now .codec = nullptr, - .stats = &tmp_stats, - .io_ctx = io::IOContext {.is_index_data = true}, + .stats = stats_ptr, + .io_ctx = io::IOContext {.is_index_data = true, + .file_cache_stats = &stats_ptr->file_cache_stats}, }; // read index page diff --git a/be/src/olap/rowset/segment_v2/ordinal_page_index.h b/be/src/olap/rowset/segment_v2/ordinal_page_index.h index 1d74cf989520aa..df60edb12d1481 100644 --- a/be/src/olap/rowset/segment_v2/ordinal_page_index.h +++ b/be/src/olap/rowset/segment_v2/ordinal_page_index.h @@ -75,7 +75,7 @@ class OrdinalIndexReader : public MetadataAdder { virtual ~OrdinalIndexReader(); // load and parse the index page into memory - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats); // the returned iter points to the largest element which is less than 
`ordinal`, // or points to the first element if all elements are greater than `ordinal`, @@ -94,7 +94,8 @@ class OrdinalIndexReader : public MetadataAdder { private: Status _load(bool use_page_cache, bool kept_in_memory, - std::unique_ptr index_meta); + std::unique_ptr index_meta, + OlapReaderStatistics* index_load_stats); int64_t get_metadata_size() const override; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 513c0be4f8cd14..b5ab3f0e873549 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -228,7 +228,7 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o if (read_options.runtime_state != nullptr) { _be_exec_version = read_options.runtime_state->be_exec_version(); } - RETURN_IF_ERROR(_create_column_readers_once()); + RETURN_IF_ERROR(_create_column_readers_once(read_options.stats)); read_options.stats->total_segment_number++; // trying to prune the current segment by segment-level zone map @@ -288,7 +288,11 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o } } - RETURN_IF_ERROR(load_index()); + { + SCOPED_RAW_TIMER(&read_options.stats->segment_load_index_timer_ns); + RETURN_IF_ERROR(load_index(read_options.stats)); + } + if (read_options.delete_condition_predicates->num_of_column_predicate() == 0 && read_options.push_down_agg_type_opt != TPushAggOp::NONE && read_options.push_down_agg_type_opt != TPushAggOp::COUNT_ON_INDEX) { @@ -471,7 +475,7 @@ Status Segment::_parse_footer(SegmentFooterPB* footer) { return Status::OK(); } -Status Segment::_load_pk_bloom_filter() { +Status Segment::_load_pk_bloom_filter(OlapReaderStatistics* stats) { #ifdef BE_TEST if (_pk_index_meta == nullptr) { // for BE UT "segment_cache_test" @@ -486,30 +490,30 @@ Status Segment::_load_pk_bloom_filter() { DCHECK(_pk_index_meta != nullptr); DCHECK(_pk_index_reader != nullptr); - return _load_pk_bf_once.call([this] { - 
RETURN_IF_ERROR(_pk_index_reader->parse_bf(_file_reader, *_pk_index_meta)); + return _load_pk_bf_once.call([this, stats] { + RETURN_IF_ERROR(_pk_index_reader->parse_bf(_file_reader, *_pk_index_meta, stats)); // _meta_mem_usage += _pk_index_reader->get_bf_memory_size(); return Status::OK(); }); } Status Segment::load_pk_index_and_bf(OlapReaderStatistics* index_load_stats) { - _pk_index_load_stats = index_load_stats; - RETURN_IF_ERROR(load_index()); - RETURN_IF_ERROR(_load_pk_bloom_filter()); + RETURN_IF_ERROR(load_index(index_load_stats)); + RETURN_IF_ERROR(_load_pk_bloom_filter(index_load_stats)); return Status::OK(); } -Status Segment::load_index() { - return _load_index_once.call([this] { +Status Segment::load_index(OlapReaderStatistics* stats) { + return _load_index_once.call([this, stats] { if (_tablet_schema->keys_type() == UNIQUE_KEYS && _pk_index_meta != nullptr) { - _pk_index_reader = std::make_unique(_pk_index_load_stats); - RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta)); + _pk_index_reader = std::make_unique(); + RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta, stats)); // _meta_mem_usage += _pk_index_reader->get_memory_size(); return Status::OK(); } else { // read and parse short key index page OlapReaderStatistics tmp_stats; + OlapReaderStatistics* stats_ptr = stats != nullptr ? 
stats : &tmp_stats; PageReadOptions opts { .use_page_cache = true, .type = INDEX_PAGE, @@ -518,7 +522,8 @@ Status Segment::load_index() { // short key index page uses NO_COMPRESSION for now .codec = nullptr, .stats = &tmp_stats, - .io_ctx = io::IOContext {.is_index_data = true}, + .io_ctx = io::IOContext {.is_index_data = true, + .file_cache_stats = &stats_ptr->file_cache_stats}, }; Slice body; PageFooterPB footer; @@ -594,7 +599,8 @@ vectorized::DataTypePtr Segment::get_data_type_of(const ColumnIdentifier& identi return nullptr; } -Status Segment::_create_column_readers_once() { +Status Segment::_create_column_readers_once(OlapReaderStatistics* stats) { + SCOPED_RAW_TIMER(&stats->segment_create_column_readers_timer_ns); return _create_column_readers_once_call.call([&] { DCHECK(_footer_pb); Defer defer([&]() { _footer_pb.reset(); }); @@ -868,10 +874,10 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, Status Segment::new_column_iterator(const TabletColumn& tablet_column, std::unique_ptr* iter, const StorageReadOptions* opt) { - if (opt != nullptr && opt->runtime_state != nullptr) { + if (opt->runtime_state != nullptr) { _be_exec_version = opt->runtime_state->be_exec_version(); } - RETURN_IF_ERROR(_create_column_readers_once()); + RETURN_IF_ERROR(_create_column_readers_once(opt->stats)); // init column iterator by path info if (tablet_column.has_path_info() || tablet_column.is_variant_type()) { @@ -899,8 +905,9 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column, return Status::OK(); } -Status Segment::new_column_iterator(int32_t unique_id, std::unique_ptr* iter) { - RETURN_IF_ERROR(_create_column_readers_once()); +Status Segment::new_column_iterator(int32_t unique_id, const StorageReadOptions* opt, + std::unique_ptr* iter) { + RETURN_IF_ERROR(_create_column_readers_once(opt->stats)); ColumnIterator* it; RETURN_IF_ERROR(_column_readers.at(unique_id)->new_iterator(&it)); iter->reset(it); @@ -928,8 +935,9 @@ 
ColumnReader* Segment::_get_column_reader(const TabletColumn& col) { } Status Segment::new_bitmap_index_iterator(const TabletColumn& tablet_column, + const StorageReadOptions& read_options, std::unique_ptr* iter) { - RETURN_IF_ERROR(_create_column_readers_once()); + RETURN_IF_ERROR(_create_column_readers_once(read_options.stats)); ColumnReader* reader = _get_column_reader(tablet_column); if (reader != nullptr && reader->has_bitmap_index()) { BitmapIndexIterator* it; @@ -947,7 +955,7 @@ Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column, if (read_options.runtime_state != nullptr) { _be_exec_version = read_options.runtime_state->be_exec_version(); } - RETURN_IF_ERROR(_create_column_readers_once()); + RETURN_IF_ERROR(_create_column_readers_once(read_options.stats)); ColumnReader* reader = _get_column_reader(tablet_column); if (reader != nullptr && index_meta) { if (_inverted_index_file_reader == nullptr) { @@ -963,8 +971,8 @@ Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column, Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_schema, bool with_seq_col, bool with_rowid, RowLocation* row_location, - std::string* encoded_seq_value, OlapReaderStatistics* stats) { - RETURN_IF_ERROR(load_pk_index_and_bf()); + OlapReaderStatistics* stats, std::string* encoded_seq_value) { + RETURN_IF_ERROR(load_pk_index_and_bf(stats)); bool has_seq_col = latest_schema->has_sequence_col(); bool has_rowid = !latest_schema->cluster_key_uids().empty(); size_t seq_col_length = 0; @@ -1064,9 +1072,10 @@ Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_sche } Status Segment::read_key_by_rowid(uint32_t row_id, std::string* key) { - RETURN_IF_ERROR(load_pk_index_and_bf()); + OlapReaderStatistics* null_stat = nullptr; + RETURN_IF_ERROR(load_pk_index_and_bf(null_stat)); std::unique_ptr iter; - RETURN_IF_ERROR(_pk_index_reader->new_iterator(&iter)); + 
RETURN_IF_ERROR(_pk_index_reader->new_iterator(&iter, null_stat)); auto index_type = vectorized::DataTypeFactory::instance().create_data_type( _pk_index_reader->type_info()->type(), 1, 0); @@ -1116,12 +1125,14 @@ Status Segment::seek_and_read_by_rowid(const TabletSchema& schema, SlotDescripto OlapReaderStatistics& stats, std::unique_ptr& iterator_hint) { StorageReadOptions storage_read_opt; + storage_read_opt.stats = &stats; storage_read_opt.io_ctx.reader_type = ReaderType::READER_QUERY; segment_v2::ColumnIteratorOptions opt { .use_page_cache = !config::disable_storage_page_cache, .file_reader = file_reader().get(), .stats = &stats, - .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY}, + .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY, + .file_cache_stats = &stats.file_cache_stats}, }; std::vector single_row_loc {row_id}; if (!slot->column_paths().empty()) { diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 1b20c1f066bdf9..441ae3e85e9b3f 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -111,9 +111,11 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd std::unique_ptr* iter, const StorageReadOptions* opt); - Status new_column_iterator(int32_t unique_id, std::unique_ptr* iter); + Status new_column_iterator(int32_t unique_id, const StorageReadOptions* opt, + std::unique_ptr* iter); Status new_bitmap_index_iterator(const TabletColumn& tablet_column, + const StorageReadOptions& read_options, std::unique_ptr* iter); Status new_inverted_index_iterator(const TabletColumn& tablet_column, @@ -132,9 +134,8 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd } Status lookup_row_key(const Slice& key, const TabletSchema* latest_schema, bool with_seq_col, - bool with_rowid, RowLocation* row_location, - std::string* encoded_seq_value = nullptr, - OlapReaderStatistics* stats = nullptr); + bool 
with_rowid, RowLocation* row_location, OlapReaderStatistics* stats, + std::string* encoded_seq_value = nullptr); Status read_key_by_rowid(uint32_t row_id, std::string* key); @@ -142,9 +143,9 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd vectorized::MutableColumnPtr& result, OlapReaderStatistics& stats, std::unique_ptr& iterator_hint); - Status load_index(); + Status load_index(OlapReaderStatistics* stats); - Status load_pk_index_and_bf(OlapReaderStatistics* index_load_stats = nullptr); + Status load_pk_index_and_bf(OlapReaderStatistics* stats); void update_healthy_status(Status new_status) { _healthy_status.update(new_status); } // The segment is loaded into SegmentCache and then will load indices, if there are something wrong @@ -225,7 +226,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd Status _open(); Status _parse_footer(SegmentFooterPB* footer); Status _create_column_readers(const SegmentFooterPB& footer); - Status _load_pk_bloom_filter(); + Status _load_pk_bloom_filter(OlapReaderStatistics* stats); ColumnReader* _get_column_reader(const TabletColumn& col); // Get Iterator which will read variant root column and extract with paths and types info @@ -238,7 +239,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd Status _open_inverted_index(); - Status _create_column_readers_once(); + Status _create_column_readers_once(OlapReaderStatistics* stats); private: friend class SegmentIterator; @@ -303,7 +304,6 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd InvertedIndexFileInfo _idx_file_info; int _be_exec_version = BeExecVersionManager::get_newest_version(); - OlapReaderStatistics* _pk_index_load_stats = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index abdf9116756f0e..ec0f9104e050e7 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -281,9 +281,10 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { if (_inited) { return Status::OK(); } + _opts = opts; + SCOPED_RAW_TIMER(&_opts.stats->segment_iterator_init_timer_ns); _inited = true; _file_reader = _segment->_file_reader; - _opts = opts; _col_predicates.clear(); for (const auto& predicate : opts.column_predicates) { @@ -838,7 +839,13 @@ bool SegmentIterator::_downgrade_without_index(Status res, bool need_remaining) // such as when index segment files are not generated // above case can downgrade without index query _opts.stats->inverted_index_downgrade_count++; - LOG(INFO) << "will downgrade without index to evaluate predicate, because of res: " << res; + if (!res.is()) { + LOG(INFO) << "will downgrade without index to evaluate predicate, because of res: " + << res; + } else { + VLOG_DEBUG << "will downgrade without index to evaluate predicate, because of res: " + << res; + } return true; } return false; @@ -1005,6 +1012,7 @@ bool SegmentIterator::_check_all_conditions_passed_inverted_index_for_column(Col } Status SegmentIterator::_init_return_column_iterators() { + SCOPED_RAW_TIMER(&_opts.stats->segment_iterator_init_return_column_iterators_timer_ns); if (_cur_rowid >= num_rows()) { return Status::OK(); } @@ -1047,19 +1055,21 @@ Status SegmentIterator::_init_return_column_iterators() { } Status SegmentIterator::_init_bitmap_index_iterators() { + SCOPED_RAW_TIMER(&_opts.stats->segment_iterator_init_bitmap_index_iterators_timer_ns); if (_cur_rowid >= num_rows()) { return Status::OK(); } for (auto cid : _schema->column_ids()) { if (_bitmap_index_iterators[cid] == nullptr) { - RETURN_IF_ERROR(_segment->new_bitmap_index_iterator(_opts.tablet_schema->column(cid), - &_bitmap_index_iterators[cid])); + RETURN_IF_ERROR(_segment->new_bitmap_index_iterator( + _opts.tablet_schema->column(cid), _opts, &_bitmap_index_iterators[cid])); } } return Status::OK(); } Status 
SegmentIterator::_init_inverted_index_iterators() { + SCOPED_RAW_TIMER(&_opts.stats->segment_iterator_init_inverted_index_iterators_timer_ns); if (_cur_rowid >= num_rows()) { return Status::OK(); } @@ -1177,7 +1187,7 @@ Status SegmentIterator::_lookup_ordinal_from_pk_index(const RowCursor& key, bool bool exact_match = false; std::unique_ptr index_iterator; - RETURN_IF_ERROR(pk_index_reader->new_iterator(&index_iterator)); + RETURN_IF_ERROR(pk_index_reader->new_iterator(&index_iterator, _opts.stats)); Status status = index_iterator->seek_at_or_after(&index_key, &exact_match); if (UNLIKELY(!status.ok())) { @@ -1951,8 +1961,7 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { Status SegmentIterator::_convert_to_expected_type(const std::vector& col_ids) { for (ColumnId i : col_ids) { - if (_current_return_columns[i] == nullptr || _converted_column_ids[i] || - _is_pred_column[i]) { + if (!_current_return_columns[i] || _converted_column_ids[i] || _is_pred_column[i]) { continue; } if (!_segment->same_with_storage_type( @@ -1995,7 +2004,7 @@ Status SegmentIterator::copy_column_data_by_selector(vectorized::IColumn* input_ return Status::RuntimeError("copy_column_data_by_selector nullable mismatch"); } - return input_col_ptr->filter_by_selector(sel_rowid_idx, select_size, output_col); + return input_col_ptr->filter_by_selector(sel_rowid_idx, select_size, output_col.get()); } void SegmentIterator::_clear_iterators() { diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index fe465f98a2aad2..2457a44de39e10 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -363,7 +363,9 @@ Status SegmentWriter::append_block_with_variant_subcolumns(vectorized::Block& da continue; } if (_flush_schema == nullptr) { - _flush_schema = std::make_shared(*_tablet_schema); + _flush_schema = std::make_shared(); + // deep copy + 
_flush_schema->copy_from(*_tablet_schema); } auto column_ref = data.get_by_position(i).column; const vectorized::ColumnObject& object_column = assert_cast( diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.cpp b/be/src/olap/rowset/segment_v2/zone_map_index.cpp index c2139ff0899090..9249c82aedfdc3 100644 --- a/be/src/olap/rowset/segment_v2/zone_map_index.cpp +++ b/be/src/olap/rowset/segment_v2/zone_map_index.cpp @@ -140,18 +140,21 @@ Status TypedZoneMapIndexWriter::finish(io::FileWriter* file_writer, return writer.finish(meta->mutable_page_zone_maps()); } -Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory) { +Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { // TODO yyq: implement a new once flag to avoid status construct. - return _load_once.call([this, use_page_cache, kept_in_memory] { - return _load(use_page_cache, kept_in_memory, std::move(_page_zone_maps_meta)); + return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats] { + return _load(use_page_cache, kept_in_memory, std::move(_page_zone_maps_meta), + index_load_stats); }); } Status ZoneMapIndexReader::_load(bool use_page_cache, bool kept_in_memory, - std::unique_ptr page_zone_maps_meta) { + std::unique_ptr page_zone_maps_meta, + OlapReaderStatistics* index_load_stats) { IndexedColumnReader reader(_file_reader, *page_zone_maps_meta); - RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory)); - IndexedColumnIterator iter(&reader); + RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory, index_load_stats)); + IndexedColumnIterator iter(&reader, index_load_stats); _page_zone_maps.resize(reader.num_values()); diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.h b/be/src/olap/rowset/segment_v2/zone_map_index.h index 34869bbbfeea62..04cae12975c5fa 100644 --- a/be/src/olap/rowset/segment_v2/zone_map_index.h +++ b/be/src/olap/rowset/segment_v2/zone_map_index.h @@ -154,14 +154,16 @@ 
class ZoneMapIndexReader : public MetadataAdder { virtual ~ZoneMapIndexReader(); // load all page zone maps into memory - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats = nullptr); const std::vector& page_zone_maps() const { return _page_zone_maps; } int32_t num_pages() const { return _page_zone_maps.size(); } private: - Status _load(bool use_page_cache, bool kept_in_memory, std::unique_ptr); + Status _load(bool use_page_cache, bool kept_in_memory, std::unique_ptr, + OlapReaderStatistics* index_load_stats); int64_t get_metadata_size() const override; diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index ec7463d5b9d75d..ccc006e1f040a6 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -346,21 +346,22 @@ Status RowsetBuilder::commit_txn() { SCOPED_TIMER(_commit_txn_timer); const RowsetWriterContext& rw_ctx = _rowset_writer->context(); - if (rw_ctx.tablet_schema->num_variant_columns() > 0) { + if (rw_ctx.tablet_schema->num_variant_columns() > 0 && _rowset->num_rows() > 0) { // Need to merge schema with `rw_ctx.merged_tablet_schema` in prior, // merged schema keeps the newest merged schema for the rowset, which is updated and merged // during flushing segments. if (rw_ctx.merged_tablet_schema != nullptr) { RETURN_IF_ERROR(tablet()->update_by_least_common_schema(rw_ctx.merged_tablet_schema)); + } else { + // We should merge rowset schema further, in case that the merged_tablet_schema maybe null + // when enable_memtable_on_sink_node is true, the merged_tablet_schema will not be passed to + // the destination backend. + // update tablet schema when meet variant columns, before commit_txn + // Eg. 
rowset schema: A(int), B(float), C(int), D(int) + // _tablet->tablet_schema: A(bigint), B(double) + // => update_schema: A(bigint), B(double), C(int), D(int) + RETURN_IF_ERROR(tablet()->update_by_least_common_schema(rw_ctx.tablet_schema)); } - // We should merge rowset schema further, in case that the merged_tablet_schema maybe null - // when enable_memtable_on_sink_node is true, the merged_tablet_schema will not be passed to - // the destination backend. - // update tablet schema when meet variant columns, before commit_txn - // Eg. rowset schema: A(int), B(float), C(int), D(int) - // _tabelt->tablet_schema: A(bigint), B(double) - // => update_schema: A(bigint), B(double), C(int), D(int) - RETURN_IF_ERROR(tablet()->update_by_least_common_schema(rw_ctx.tablet_schema)); } // Transfer ownership of `PendingRowsetGuard` to `TxnManager` @@ -398,7 +399,6 @@ Status BaseRowsetBuilder::cancel() { void BaseRowsetBuilder::_build_current_tablet_schema(int64_t index_id, const OlapTableSchemaParam* table_schema_param, const TabletSchema& ori_tablet_schema) { - _tablet_schema->copy_from(ori_tablet_schema); // find the right index id int i = 0; auto indexes = table_schema_param->indexes(); @@ -407,11 +407,13 @@ void BaseRowsetBuilder::_build_current_tablet_schema(int64_t index_id, break; } } - if (!indexes.empty() && !indexes[i]->columns.empty() && indexes[i]->columns[0]->unique_id() >= 0) { + _tablet_schema->shawdow_copy_without_columns(ori_tablet_schema); _tablet_schema->build_current_tablet_schema(index_id, table_schema_param->version(), indexes[i], ori_tablet_schema); + } else { + _tablet_schema->copy_from(ori_tablet_schema); } if (_tablet_schema->schema_version() > ori_tablet_schema.schema_version()) { // After schema change, should include extracted column diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index cdb637b1c42647..658ff05b67f0d6 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -337,7 +337,7 @@ Status
BlockChanger::change_block(vectorized::Block* ref_block, int result_tmp_column_idx = -1; RETURN_IF_ERROR(ctx->execute(ref_block, &result_tmp_column_idx)); auto& result_tmp_column_def = ref_block->get_by_position(result_tmp_column_idx); - if (result_tmp_column_def.column == nullptr) { + if (!result_tmp_column_def.column) { return Status::Error( "result column={} is nullptr, input expr={}", result_tmp_column_def.name, apache::thrift::ThriftDebugString(*expr)); @@ -430,7 +430,7 @@ Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr input_column, if (input_column->is_nullable() != output_column->is_nullable()) { if (input_column->is_nullable()) { const auto* ref_null_map = - vectorized::check_and_get_column(input_column) + vectorized::check_and_get_column(input_column.get()) ->get_null_map_column() .get_data() .data(); @@ -446,10 +446,12 @@ Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr input_column, } } else { const auto& null_map_column = - vectorized::check_and_get_column(output_column) + vectorized::check_and_get_column( + output_column.get()) ->get_null_map_column(); const auto& nested_column = - vectorized::check_and_get_column(output_column) + vectorized::check_and_get_column( + output_column.get()) ->get_nested_column(); const auto* new_null_map = null_map_column.get_data().data(); @@ -481,12 +483,12 @@ Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr input_column, if (input_column->is_nullable() && output_column->is_nullable()) { const auto* ref_null_map = - vectorized::check_and_get_column(input_column) + vectorized::check_and_get_column(input_column.get()) ->get_null_map_column() .get_data() .data(); const auto* new_null_map = - vectorized::check_and_get_column(output_column) + vectorized::check_and_get_column(output_column.get()) ->get_null_map_column() .get_data() .data(); @@ -866,6 +868,9 @@ Status SchemaChangeJob::_do_process_alter_tablet(const TAlterTabletReqV2& reques for (int i = 0; i < num_cols; ++i) { 
return_columns[i] = i; } + std::vector cluster_key_idxes; + + DBUG_EXECUTE_IF("SchemaChangeJob::_do_process_alter_tablet.block", DBUG_BLOCK); // begin to find deltas to convert from base tablet to new tablet so that // obtain base tablet and new tablet's push lock and header write lock to prevent loading data @@ -980,6 +985,14 @@ Status SchemaChangeJob::_do_process_alter_tablet(const TAlterTabletReqV2& reques reader_context.batch_size = ALTER_TABLE_BATCH_SIZE; reader_context.delete_bitmap = &_base_tablet->tablet_meta()->delete_bitmap(); reader_context.version = Version(0, end_version); + if (!_base_tablet_schema->cluster_key_uids().empty()) { + for (const auto& uid : _base_tablet_schema->cluster_key_uids()) { + cluster_key_idxes.emplace_back(_base_tablet_schema->field_index(uid)); + } + reader_context.read_orderby_key_columns = &cluster_key_idxes; + reader_context.is_unique = false; + reader_context.sequence_id_idx = -1; + } for (auto& rs_split : rs_splits) { res = rs_split.rs_reader->init(&reader_context); if (!res) { diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp index 67205835b53947..8202feb68c65b5 100644 --- a/be/src/olap/snapshot_manager.cpp +++ b/be/src/olap/snapshot_manager.cpp @@ -700,8 +700,10 @@ Status SnapshotManager::_create_snapshot_files(const TabletSharedPtr& ref_tablet InvertedIndexStorageFormatPB::V1) { for (const auto& index : tablet_schema.inverted_indexes()) { auto index_id = index->index_id(); - auto index_file = ref_tablet->get_segment_index_filepath( - rowset_id, segment_index, index_id); + auto index_file = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix( + segment_file_path), + index_id, index->get_index_suffix()); auto snapshot_segment_index_file_path = fmt::format("{}/{}_{}_{}.binlog-index", schema_full_path, rowset_id, segment_index, index_id); diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 
e00b5b595e20dc..24cda8232f115c 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -463,6 +463,16 @@ Status StorageEngine::_check_file_descriptor_number() { << ", use default configuration instead."; return Status::OK(); } + if (getenv("SKIP_CHECK_ULIMIT") == nullptr) { + LOG(INFO) << "will check 'ulimit' value."; + } else if (std::string(getenv("SKIP_CHECK_ULIMIT")) == "true") { + LOG(INFO) << "the 'ulimit' value check is skipped" + << ", the SKIP_CHECK_ULIMIT env value is " << getenv("SKIP_CHECK_ULIMIT"); + return Status::OK(); + } else { + LOG(INFO) << "the SKIP_CHECK_ULIMIT env value is " << getenv("SKIP_CHECK_ULIMIT") + << ", will check ulimit value."; + } if (l.rlim_cur < config::min_file_descriptor_number) { LOG(ERROR) << "File descriptor number is less than " << config::min_file_descriptor_number << ". Please use (ulimit -n) to set a value equal or greater than " diff --git a/be/src/olap/storage_policy.cpp b/be/src/olap/storage_policy.cpp index 837e9bed178e3a..3b4a1f1a185678 100644 --- a/be/src/olap/storage_policy.cpp +++ b/be/src/olap/storage_policy.cpp @@ -141,8 +141,10 @@ std::vector> get_storage_resource_ids() { namespace { [[noreturn]] void exit_at_unknown_path_version(std::string_view resource_id, int64_t path_version) { - LOG(FATAL) << "unknown path version, please upgrade BE or drop this storage vault. resource_id=" - << resource_id << " path_version=" << path_version; + throw Exception( + Status::FatalError("unknown path version, please upgrade BE or drop this storage " + "vault. 
resource_id={} path_version={}", + resource_id, path_version)); } } // namespace diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 644ca9133eb885..1758166e76edee 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -512,6 +512,15 @@ Status Tablet::add_rowset(RowsetSharedPtr rowset) { return Status::OK(); } +bool Tablet::rowset_exists_unlocked(const RowsetSharedPtr& rowset) { + if (auto it = _rs_version_map.find(rowset->version()); it == _rs_version_map.end()) { + return false; + } else if (rowset->rowset_id() != it->second->rowset_id()) { + return false; + } + return true; +} + Status Tablet::modify_rowsets(std::vector& to_add, std::vector& to_delete, bool check_delete) { // the compaction process allow to compact the single version, eg: version[4-4]. @@ -1741,8 +1750,13 @@ Status Tablet::prepare_compaction_and_calculate_permits( } if (!res.ok()) { - tablet->set_last_cumu_compaction_failure_time(UnixMillis()); permits = 0; + // if we meet a delete version, should increase the cumulative point to let base compaction handle the delete version. + // no need to wait 5s. + if (!(res.msg() == "_last_delete_version.first not equal to -1") || + config::enable_sleep_between_delete_cumu_compaction) { + tablet->set_last_cumu_compaction_failure_time(UnixMillis()); + } if (!res.is()) { DorisMetrics::instance()->cumulative_compaction_request_failed->increment(1); return Status::InternalError("prepare cumulative compaction with err: {}", res); @@ -1750,6 +1764,12 @@ Status Tablet::prepare_compaction_and_calculate_permits( // return OK if OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSION, so that we don't need to // print too much useless logs. // And because we set permits to 0, so even if we return OK here, nothing will be done. 
+ LOG_INFO( + "cumulative compaction meet delete rowset, increase cumu point without other " + "operation.") + .tag("tablet id:", tablet->tablet_id()) + .tag("after cumulative compaction, cumu point:", + tablet->cumulative_layer_point()); return Status::OK(); } } else if (compaction_type == CompactionType::BASE_COMPACTION) { @@ -2595,30 +2615,6 @@ std::string Tablet::get_segment_filepath(std::string_view rowset_id, int64_t seg return fmt::format("{}/_binlog/{}_{}.dat", _tablet_path, rowset_id, segment_index); } -std::string Tablet::get_segment_index_filepath(std::string_view rowset_id, - std::string_view segment_index, - std::string_view index_id) const { - auto format = _tablet_meta->tablet_schema()->get_inverted_index_storage_format(); - if (format == doris::InvertedIndexStorageFormatPB::V1) { - return fmt::format("{}/_binlog/{}_{}_{}.idx", _tablet_path, rowset_id, segment_index, - index_id); - } else { - return fmt::format("{}/_binlog/{}_{}.idx", _tablet_path, rowset_id, segment_index); - } -} - -std::string Tablet::get_segment_index_filepath(std::string_view rowset_id, int64_t segment_index, - int64_t index_id) const { - auto format = _tablet_meta->tablet_schema()->get_inverted_index_storage_format(); - if (format == doris::InvertedIndexStorageFormatPB::V1) { - return fmt::format("{}/_binlog/{}_{}_{}.idx", _tablet_path, rowset_id, segment_index, - index_id); - } else { - DCHECK(index_id == -1); - return fmt::format("{}/_binlog/{}_{}.idx", _tablet_path, rowset_id, segment_index); - } -} - std::vector Tablet::get_binlog_filepath(std::string_view binlog_version) const { const auto& [rowset_id, num_segments] = get_binlog_info(binlog_version); std::vector binlog_filepath; @@ -2663,10 +2659,25 @@ void Tablet::gc_binlogs(int64_t version) { // add binlog segment files and index files for (int64_t i = 0; i < num_segments; ++i) { - wait_for_deleted_binlog_files.emplace_back(get_segment_filepath(rowset_id, i)); - for (const auto& index : 
this->tablet_schema()->inverted_indexes()) { - wait_for_deleted_binlog_files.emplace_back( - get_segment_index_filepath(rowset_id, i, index->index_id())); + auto segment_file_path = get_segment_filepath(rowset_id, i); + wait_for_deleted_binlog_files.emplace_back(segment_file_path); + + // index files + if (tablet_schema()->has_inverted_index()) { + if (tablet_schema()->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { + for (const auto& index : tablet_schema()->inverted_indexes()) { + auto index_file = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix( + segment_file_path), + index->index_id(), index->get_index_suffix()); + wait_for_deleted_binlog_files.emplace_back(index_file); + } + } else { + auto index_file = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path)); + wait_for_deleted_binlog_files.emplace_back(index_file); + } } } }; @@ -2755,7 +2766,7 @@ void Tablet::check_table_size_correctness() { const std::vector& all_rs_metas = _tablet_meta->all_rs_metas(); for (const auto& rs_meta : all_rs_metas) { int64_t total_segment_size = get_segment_file_size(rs_meta); - int64_t total_inverted_index_size = get_inverted_index_file_szie(rs_meta); + int64_t total_inverted_index_size = get_inverted_index_file_size(rs_meta); if (rs_meta->data_disk_size() != total_segment_size || rs_meta->index_disk_size() != total_inverted_index_size || rs_meta->data_disk_size() + rs_meta->index_disk_size() != rs_meta->total_disk_size()) { @@ -2806,7 +2817,7 @@ int64_t Tablet::get_segment_file_size(const RowsetMetaSharedPtr& rs_meta) { return total_segment_size; } -int64_t Tablet::get_inverted_index_file_szie(const RowsetMetaSharedPtr& rs_meta) { +int64_t Tablet::get_inverted_index_file_size(const RowsetMetaSharedPtr& rs_meta) { const auto& fs = rs_meta->fs(); if (!fs) { LOG(WARNING) << "get fs failed, resource_id={}" << 
rs_meta->resource_id(); diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 0b7d758ab8fd88..afe043bf15195b 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -173,6 +173,7 @@ class Tablet final : public BaseTablet { // MUST hold EXCLUSIVE `_meta_lock`. Status modify_rowsets(std::vector& to_add, std::vector& to_delete, bool check_delete = false); + bool rowset_exists_unlocked(const RowsetSharedPtr& rowset); Status add_inc_rowset(const RowsetSharedPtr& rowset); /// Delete stale rowset by timing. This delete policy uses now() minutes @@ -213,6 +214,7 @@ class Tablet final : public BaseTablet { std::mutex& get_push_lock() { return _ingest_lock; } std::mutex& get_base_compaction_lock() { return _base_compaction_lock; } std::mutex& get_cumulative_compaction_lock() { return _cumulative_compaction_lock; } + std::shared_mutex& get_meta_store_lock() { return _meta_store_lock; } std::shared_timed_mutex& get_migration_lock() { return _migration_lock; } @@ -440,11 +442,6 @@ class Tablet final : public BaseTablet { std::string get_segment_filepath(std::string_view rowset_id, std::string_view segment_index) const; std::string get_segment_filepath(std::string_view rowset_id, int64_t segment_index) const; - std::string get_segment_index_filepath(std::string_view rowset_id, - std::string_view segment_index, - std::string_view index_id) const; - std::string get_segment_index_filepath(std::string_view rowset_id, int64_t segment_index, - int64_t index_id) const; bool can_add_binlog(uint64_t total_binlog_size) const; void gc_binlogs(int64_t version); Status ingest_binlog_metas(RowsetBinlogMetasPB* metas_pb); @@ -535,7 +532,7 @@ class Tablet final : public BaseTablet { void check_table_size_correctness(); std::string get_segment_path(const RowsetMetaSharedPtr& rs_meta, int64_t seg_id); int64_t get_segment_file_size(const RowsetMetaSharedPtr& rs_meta); - int64_t get_inverted_index_file_szie(const RowsetMetaSharedPtr& rs_meta); + int64_t 
get_inverted_index_file_size(const RowsetMetaSharedPtr& rs_meta); public: static const int64_t K_INVALID_CUMULATIVE_POINT = -1; @@ -592,7 +589,7 @@ class Tablet final : public BaseTablet { std::shared_ptr _cumulative_compaction_policy; std::string_view _cumulative_compaction_type; - // use a seperate thread to check all tablets paths existance + // use a separate thread to check all tablets paths existence std::atomic _is_tablet_path_exists; int64_t _last_missed_version; diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index 33fee7ca350900..18e317cb12d1e0 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -523,7 +523,8 @@ Status TabletManager::drop_tablet(TTabletId tablet_id, TReplicaId replica_id, Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, bool is_drop_table_or_partition, bool had_held_shard_lock) { LOG(INFO) << "begin drop tablet. tablet_id=" << tablet_id << ", replica_id=" << replica_id - << ", is_drop_table_or_partition=" << is_drop_table_or_partition; + << ", is_drop_table_or_partition=" << is_drop_table_or_partition + << ", keep_files=" << keep_files; DorisMetrics::instance()->drop_tablet_requests_total->increment(1); RETURN_IF_ERROR(register_transition_tablet(tablet_id, "drop tablet")); @@ -558,27 +559,32 @@ Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, b to_drop_tablet->clear_cache(); - if (!keep_files) { + { // drop tablet will update tablet meta, should lock std::lock_guard wrlock(to_drop_tablet->get_header_lock()); SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); - LOG(INFO) << "set tablet to shutdown state and remove it from memory. " - << "tablet_id=" << tablet_id << ", tablet_path=" << to_drop_tablet->tablet_path(); // NOTE: has to update tablet here, but must not update tablet meta directly. // because other thread may hold the tablet object, they may save meta too. 
// If update meta directly here, other thread may override the meta // and the tablet will be loaded at restart time. // To avoid this exception, we first set the state of the tablet to `SHUTDOWN`. + // + // Until now, only the restore task uses keep files. RETURN_IF_ERROR(to_drop_tablet->set_tablet_state(TABLET_SHUTDOWN)); - // We must record unused remote rowsets path info to OlapMeta before tablet state is marked as TABLET_SHUTDOWN in OlapMeta, - // otherwise if BE shutdown after saving tablet state, these remote rowsets path info will lost. - if (is_drop_table_or_partition) { - RETURN_IF_ERROR(to_drop_tablet->remove_all_remote_rowsets()); - } - to_drop_tablet->save_meta(); - { - std::lock_guard wrdlock(_shutdown_tablets_lock); - _shutdown_tablets.push_back(to_drop_tablet); + if (!keep_files) { + LOG(INFO) << "set tablet to shutdown state and remove it from memory. " + << "tablet_id=" << tablet_id + << ", tablet_path=" << to_drop_tablet->tablet_path(); + // We must record unused remote rowsets path info to OlapMeta before tablet state is marked as TABLET_SHUTDOWN in OlapMeta, + // otherwise if BE shutdown after saving tablet state, these remote rowsets path info will lost. 
+ if (is_drop_table_or_partition) { + RETURN_IF_ERROR(to_drop_tablet->remove_all_remote_rowsets()); + } + to_drop_tablet->save_meta(); + { + std::lock_guard wrdlock(_shutdown_tablets_lock); + _shutdown_tablets.push_back(to_drop_tablet); + } } } @@ -719,6 +725,11 @@ void TabletManager::get_tablet_stat(TTabletStatResult* result) { result->__set_tablet_stat_list(*local_cache); } +struct TabletScore { + TabletSharedPtr tablet_ptr; + int score; +}; + std::vector TabletManager::find_best_tablets_to_compaction( CompactionType compaction_type, DataDir* data_dir, const std::unordered_set& tablet_submitted_compaction, uint32_t* score, @@ -732,6 +743,9 @@ std::vector TabletManager::find_best_tablets_to_compaction( uint32_t single_compact_highest_score = 0; TabletSharedPtr best_tablet; TabletSharedPtr best_single_compact_tablet; + auto cmp = [](TabletScore left, TabletScore right) { return left.score > right.score; }; + std::priority_queue, decltype(cmp)> top_tablets(cmp); + auto handler = [&](const TabletSharedPtr& tablet_ptr) { if (tablet_ptr->tablet_meta()->tablet_schema()->disable_auto_compaction()) { LOG_EVERY_N(INFO, 500) << "Tablet " << tablet_ptr->tablet_id() @@ -798,13 +812,33 @@ std::vector TabletManager::find_best_tablets_to_compaction( } } - // tablet should do cumu or base compaction - if (current_compaction_score > highest_score && !tablet_ptr->should_fetch_from_peer()) { - bool ret = tablet_ptr->suitable_for_compaction(compaction_type, - cumulative_compaction_policy); - if (ret) { - highest_score = current_compaction_score; - best_tablet = tablet_ptr; + if (config::compaction_num_per_round > 1 && !tablet_ptr->should_fetch_from_peer()) { + TabletScore ts; + ts.score = current_compaction_score; + ts.tablet_ptr = tablet_ptr; + if ((top_tablets.size() >= config::compaction_num_per_round && + current_compaction_score > top_tablets.top().score) || + top_tablets.size() < config::compaction_num_per_round) { + bool ret = 
tablet_ptr->suitable_for_compaction(compaction_type, + cumulative_compaction_policy); + if (ret) { + top_tablets.push(ts); + if (top_tablets.size() > config::compaction_num_per_round) { + top_tablets.pop(); + } + if (current_compaction_score > highest_score) { + highest_score = current_compaction_score; + } + } + } + } else { + if (current_compaction_score > highest_score && !tablet_ptr->should_fetch_from_peer()) { + bool ret = tablet_ptr->suitable_for_compaction(compaction_type, + cumulative_compaction_policy); + if (ret) { + highest_score = current_compaction_score; + best_tablet = tablet_ptr; + } } } }; @@ -820,6 +854,16 @@ std::vector TabletManager::find_best_tablets_to_compaction( picked_tablet.emplace_back(std::move(best_tablet)); } + std::vector reverse_top_tablets; + while (!top_tablets.empty()) { + reverse_top_tablets.emplace_back(top_tablets.top().tablet_ptr); + top_tablets.pop(); + } + + for (auto it = reverse_top_tablets.rbegin(); it != reverse_top_tablets.rend(); ++it) { + picked_tablet.emplace_back(*it); + } + // pick single compaction tablet needs the highest score if (best_single_compact_tablet != nullptr && single_compact_highest_score >= highest_score) { VLOG_CRITICAL << "Found the best tablet for single compaction. " diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index a83e0bfdbf4c30..416d0fea476b32 100644 --- a/be/src/olap/tablet_reader.cpp +++ b/be/src/olap/tablet_reader.cpp @@ -61,7 +61,7 @@ using namespace ErrorCode; void TabletReader::ReaderParams::check_validation() const { if (UNLIKELY(version.first == -1 && is_segcompaction == false)) { - LOG(FATAL) << "version is not set. tablet=" << tablet->tablet_id(); + throw Exception(Status::FatalError("version is not set. 
tablet={}", tablet->tablet_id())); } } @@ -120,6 +120,7 @@ TabletReader::~TabletReader() { } Status TabletReader::init(const ReaderParams& read_params) { + SCOPED_RAW_TIMER(&_stats.tablet_reader_init_timer_ns); _predicate_arena = std::make_unique(); Status res = _init_params(read_params); @@ -159,6 +160,7 @@ bool TabletReader::_optimize_for_single_rowset( } Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) { + SCOPED_RAW_TIMER(&_stats.tablet_reader_capture_rs_readers_timer_ns); if (read_params.rs_splits.empty()) { return Status::InternalError("fail to acquire data sources. tablet={}", _tablet->tablet_id()); @@ -331,6 +333,7 @@ Status TabletReader::_init_params(const ReaderParams& read_params) { } Status TabletReader::_init_return_columns(const ReaderParams& read_params) { + SCOPED_RAW_TIMER(&_stats.tablet_reader_init_return_columns_timer_ns); if (read_params.reader_type == ReaderType::READER_QUERY) { _return_columns = read_params.return_columns; _tablet_columns_convert_to_null_set = read_params.tablet_columns_convert_to_null_set; @@ -387,6 +390,7 @@ Status TabletReader::_init_return_columns(const ReaderParams& read_params) { } Status TabletReader::_init_keys_param(const ReaderParams& read_params) { + SCOPED_RAW_TIMER(&_stats.tablet_reader_init_keys_param_timer_ns); if (read_params.start_key.empty()) { return Status::OK(); } @@ -461,6 +465,7 @@ Status TabletReader::_init_keys_param(const ReaderParams& read_params) { } Status TabletReader::_init_orderby_keys_param(const ReaderParams& read_params) { + SCOPED_RAW_TIMER(&_stats.tablet_reader_init_orderby_keys_param_timer_ns); // UNIQUE_KEYS will compare all keys as before if (_tablet_schema->keys_type() == DUP_KEYS || (_tablet_schema->keys_type() == UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write())) { @@ -513,6 +518,7 @@ Status TabletReader::_init_orderby_keys_param(const ReaderParams& read_params) { } Status TabletReader::_init_conditions_param(const ReaderParams& read_params) { + 
SCOPED_RAW_TIMER(&_stats.tablet_reader_init_conditions_param_timer_ns); std::vector predicates; for (const auto& condition : read_params.conditions) { TCondition tmp_cond = condition; @@ -639,6 +645,7 @@ ColumnPredicate* TabletReader::_parse_to_predicate(const FunctionFilter& functio } Status TabletReader::_init_delete_condition(const ReaderParams& read_params) { + SCOPED_RAW_TIMER(&_stats.tablet_reader_init_delete_condition_param_timer_ns); // If it's cumu and not allow do delete when cumu if (read_params.reader_type == ReaderType::READER_SEGMENT_COMPACTION || (read_params.reader_type == ReaderType::READER_CUMULATIVE_COMPACTION && diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 3ec5d22166477f..7b6b5f313c144e 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -1064,6 +1064,21 @@ void TabletSchema::copy_from(const TabletSchema& tablet_schema) { _table_id = tablet_schema.table_id(); } +void TabletSchema::shawdow_copy_without_columns(const TabletSchema& tablet_schema) { + *this = tablet_schema; + _field_path_to_index.clear(); + _field_name_to_index.clear(); + _field_id_to_index.clear(); + _num_columns = 0; + _num_variant_columns = 0; + _num_null_columns = 0; + _num_key_columns = 0; + _cols.clear(); + _vl_field_mem_size = 0; + // notice : do not ref columns + _column_cache_handlers.clear(); +} + void TabletSchema::update_index_info_from(const TabletSchema& tablet_schema) { for (auto& col : _cols) { if (col->unique_id() < 0) { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index c813d6f0ef8722..3dfe055fbf4a89 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -330,6 +330,8 @@ class TabletSchema : public MetadataAdder { // Must make sure the row column is always the last column void add_row_column(); void copy_from(const TabletSchema& tablet_schema); + // lightweight copy, take care of lifecycle of TabletColumn + void shawdow_copy_without_columns(const 
TabletSchema& tablet_schema); void update_index_info_from(const TabletSchema& tablet_schema); std::string to_key() const; // get_metadata_size is only the memory of the TabletSchema itself, not include child objects. @@ -531,6 +533,7 @@ class TabletSchema : public MetadataAdder { private: friend bool operator==(const TabletSchema& a, const TabletSchema& b); friend bool operator!=(const TabletSchema& a, const TabletSchema& b); + TabletSchema(const TabletSchema&) = default; void clear_column_cache_handlers(); diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index fa8d9b8248e3f4..ecf1bdfc6d5c7d 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -171,6 +171,16 @@ Status EngineCloneTask::_do_clone() { auto duration = std::chrono::milliseconds(dp->param("duration", 10 * 1000)); std::this_thread::sleep_for(duration); }); + + DBUG_EXECUTE_IF("EngineCloneTask.failed_clone", { + LOG_WARNING("EngineCloneTask.failed_clone") + .tag("tablet_id", _clone_req.tablet_id) + .tag("replica_id", _clone_req.replica_id) + .tag("version", _clone_req.version); + return Status::InternalError( + "in debug point, EngineCloneTask.failed_clone tablet={}, replica={}, version={}", + _clone_req.tablet_id, _clone_req.replica_id, _clone_req.version); + }); Status status = Status::OK(); string src_file_path; TBackend src_host; @@ -803,8 +813,6 @@ Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_d /// Traverse all downloaded clone files in CLONE dir. /// If it does not exist in local tablet dir, link the file to local tablet dir /// And save all linked files in linked_success_files. - /// if binlog exist in clone dir and md5sum equal, then skip link file - bool skip_link_file = false; for (const string& clone_file : clone_file_names) { if (local_file_names.find(clone_file) != local_file_names.end()) { VLOG_NOTICE << "find same file when clone, skip it. 
" @@ -812,6 +820,8 @@ Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_d continue; } + /// if binlog exist in clone dir and md5sum equal, then skip link file + bool skip_link_file = false; std::string to; if (clone_file.ends_with(".binlog") || clone_file.ends_with(".binlog-index")) { if (!contain_binlog) { diff --git a/be/src/olap/task/engine_storage_migration_task.cpp b/be/src/olap/task/engine_storage_migration_task.cpp index a300e6e0f09fa3..210aa6a8c56f08 100644 --- a/be/src/olap/task/engine_storage_migration_task.cpp +++ b/be/src/olap/task/engine_storage_migration_task.cpp @@ -409,8 +409,9 @@ Status EngineStorageMigrationTask::_copy_index_and_data_files( InvertedIndexStorageFormatPB::V1) { for (const auto& index : tablet_schema.inverted_indexes()) { auto index_id = index->index_id(); - auto index_file = - _tablet->get_segment_index_filepath(rowset_id, segment_index, index_id); + auto index_file = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path), + index_id, index->get_index_suffix()); auto snapshot_segment_index_file_path = fmt::format("{}/{}_{}_{}.binlog-index", full_path, rowset_id, segment_index, index_id); diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index d227f53053128b..c54b9c5e8f980f 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -548,8 +548,9 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, if (!status.ok()) { return Status::Error( "add rowset to binlog failed. 
when publish txn rowset_id: {}, tablet id: {}, " - "txn id: {}", - rowset->rowset_id().to_string(), tablet_id, transaction_id); + "txn id: {}, status: {}", + rowset->rowset_id().to_string(), tablet_id, transaction_id, + status.to_string_no_stack()); } } diff --git a/be/src/pipeline/common/set_utils.h b/be/src/pipeline/common/set_utils.h index 2caf5b7d0b814c..38a82a501ff534 100644 --- a/be/src/pipeline/common/set_utils.h +++ b/be/src/pipeline/common/set_utils.h @@ -20,13 +20,20 @@ #include #include -#include "pipeline/exec/join/join_op.h" #include "vec/common/hash_table/hash_map_util.h" namespace doris { +struct RowRefWithFlag { + bool visited; + uint32_t row_num = 0; + RowRefWithFlag() = default; + RowRefWithFlag(size_t row_num_count, bool is_visited = false) + : visited(is_visited), row_num(row_num_count) {} +}; + template -using SetData = PHHashMap>; +using SetData = PHHashMap>; template using SetFixedKeyHashTableContext = vectorized::MethodKeysFixed>; @@ -39,9 +46,8 @@ using SetPrimaryTypeHashTableContextNullable = vectorized::MethodSingleNullableC vectorized::MethodOneNumber>>>; using SetSerializedHashTableContext = - vectorized::MethodSerialized>; -using SetMethodOneString = - vectorized::MethodStringNoCache>; + vectorized::MethodSerialized>; +using SetMethodOneString = vectorized::MethodStringNoCache>; using SetHashTableVariants = std::variant lc(le_lock); if (exchanger->_running_source_operators.fetch_sub(1) == 1) { _set_always_ready(); - exchanger->finalize(local_state); + exchanger->finalize(); } } diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index ad018c8b4f8f3d..ecbd49a5647c2e 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -723,8 +723,7 @@ inline std::string get_exchange_type_name(ExchangeType idx) { case ExchangeType::LOCAL_MERGE_SORT: return "LOCAL_MERGE_SORT"; } - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); + throw Exception(Status::FatalError("__builtin_unreachable")); 
} struct DataDistribution { @@ -758,7 +757,7 @@ struct LocalExchangeSharedState : public BasicSharedState { } } void sub_running_sink_operators(); - void sub_running_source_operators(LocalExchangeSourceLocalState& local_state); + void sub_running_source_operators(); void _set_always_ready() { for (auto& dep : source_deps) { DCHECK(dep); diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index 3a9156f45b6758..fe0ab0b148e55a 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -352,17 +352,17 @@ Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { int64_t range_start, range_end; if (!_parent->cast()._window.__isset.window_start && _parent->cast()._window.window_end.type == - TAnalyticWindowBoundaryType:: - CURRENT_ROW) { //[preceding, current_row],[current_row, following] + TAnalyticWindowBoundaryType::CURRENT_ROW) { + // [preceding, current_row], [current_row, following] rewrite it's same + // as could reuse the previous calculate result, so don't call _reset_agg_status function + // going on calculate, add up data, no need to reset state range_start = _shared_state->current_row_position; - range_end = _shared_state->current_row_position + - 1; //going on calculate,add up data, no need to reset state + range_end = _shared_state->current_row_position + 1; } else { _reset_agg_status(); range_end = _shared_state->current_row_position + _rows_end_offset + 1; - if (!_parent->cast() - ._window.__isset - .window_start) { //[preceding, offset] --unbound: [preceding, following] + //[preceding, offset] --unbound: [preceding, following] + if (!_parent->cast()._window.__isset.window_start) { range_start = _partition_by_start.pos; } else { range_start = _shared_state->current_row_position + _rows_start_offset; diff --git a/be/src/pipeline/exec/assert_num_rows_operator.h b/be/src/pipeline/exec/assert_num_rows_operator.h index 
dcc64f57878d38..a7408d695928c5 100644 --- a/be/src/pipeline/exec/assert_num_rows_operator.h +++ b/be/src/pipeline/exec/assert_num_rows_operator.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class AssertNumRowsLocalState final : public PipelineXLocalState { public: @@ -55,4 +56,5 @@ class AssertNumRowsOperatorX final : public StreamingOperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/exec/cache_source_operator.h b/be/src/pipeline/exec/cache_source_operator.h index e764323846b153..146c984d04aa3f 100644 --- a/be/src/pipeline/exec/cache_source_operator.h +++ b/be/src/pipeline/exec/cache_source_operator.h @@ -25,6 +25,7 @@ #include "pipeline/query_cache/query_cache.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -101,4 +102,5 @@ class CacheSourceOperatorX final : public OperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/data_queue.h b/be/src/pipeline/exec/data_queue.h index f5bd84cc278d0a..d97f58c0debdb6 100644 --- a/be/src/pipeline/exec/data_queue.h +++ b/be/src/pipeline/exec/data_queue.h @@ -29,6 +29,7 @@ #include "vec/core/block.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class Dependency; @@ -108,4 +109,5 @@ class DataQueue { SpinLock _source_lock; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/datagen_operator.h b/be/src/pipeline/exec/datagen_operator.h index bada5ec4080d08..ffc2c6f946fb3a 100644 --- a/be/src/pipeline/exec/datagen_operator.h +++ b/be/src/pipeline/exec/datagen_operator.h @@ -24,6 +24,7 @@ #include "pipeline/exec/operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -70,4 +71,5 @@ class 
DataGenSourceOperatorX final : public OperatorX { std::vector _runtime_filter_descs; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/exec/empty_set_operator.cpp b/be/src/pipeline/exec/empty_set_operator.cpp index 7233e46dfd1e52..2dfe9701558da0 100644 --- a/be/src/pipeline/exec/empty_set_operator.cpp +++ b/be/src/pipeline/exec/empty_set_operator.cpp @@ -22,6 +22,7 @@ #include "pipeline/exec/operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status EmptySetSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { @@ -29,4 +30,5 @@ Status EmptySetSourceOperatorX::get_block(RuntimeState* state, vectorized::Block return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/empty_set_operator.h b/be/src/pipeline/exec/empty_set_operator.h index 6b200bfdbde249..d8e920b256494d 100644 --- a/be/src/pipeline/exec/empty_set_operator.h +++ b/be/src/pipeline/exec/empty_set_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class EmptySetLocalState final : public PipelineXLocalState { public: @@ -43,4 +44,5 @@ class EmptySetSourceOperatorX final : public OperatorX { [[nodiscard]] bool is_source() const override { return true; } }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/es_scan_operator.h b/be/src/pipeline/exec/es_scan_operator.h index 2ae562e4fc7f32..6e64110997e3af 100644 --- a/be/src/pipeline/exec/es_scan_operator.h +++ b/be/src/pipeline/exec/es_scan_operator.h @@ -26,6 +26,7 @@ #include "pipeline/exec/scan_operator.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class NewEsScanner; @@ -86,4 +87,5 @@ class EsScanOperatorX final : public ScanOperatorX { std::vector _column_names; }; +#include 
"common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/exchange_sink_buffer.cpp b/be/src/pipeline/exec/exchange_sink_buffer.cpp index 65e7698737076e..800ef6150738d6 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.cpp +++ b/be/src/pipeline/exec/exchange_sink_buffer.cpp @@ -47,6 +47,7 @@ #include "vec/sink/vdata_stream_sender.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { BroadcastPBlockHolder::~BroadcastPBlockHolder() { @@ -421,8 +422,7 @@ void ExchangeSinkBuffer::_ended(InstanceLoId id) { } LOG(INFO) << ss.str(); - LOG(FATAL) << "not find the instance id"; - __builtin_unreachable(); + throw Exception(Status::FatalError("not find the instance id")); } else { std::unique_lock lock(*_instance_to_package_queue_mutex[id]); _running_sink_count[id]--; @@ -576,4 +576,5 @@ void ExchangeSinkBuffer::update_profile(RuntimeProfile* profile) { } } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/exchange_sink_buffer.h b/be/src/pipeline/exec/exchange_sink_buffer.h index b2eb32414feca2..458c7c3f66e3ee 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.h +++ b/be/src/pipeline/exec/exchange_sink_buffer.h @@ -40,6 +40,7 @@ #include "util/ref_count_closure.h" namespace doris { +#include "common/compile_check_begin.h" class PTransmitDataParams; class TUniqueId; @@ -318,4 +319,5 @@ class ExchangeSinkBuffer : public HasTaskExecutionCtx { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index aa893fc0a26f2e..cc789f6e25b20b 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -32,7 +32,6 @@ #include "pipeline/exec/operator.h" #include "pipeline/exec/sort_source_operator.h" #include 
"pipeline/local_exchange/local_exchange_sink_operator.h" -#include "pipeline/local_exchange/local_exchange_source_operator.h" #include "pipeline/pipeline_fragment_context.h" #include "util/runtime_profile.h" #include "util/uid_util.h" @@ -112,6 +111,7 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf } } only_local_exchange = local_size == channels.size(); + _rpc_channels_num = channels.size() - local_size; if (!only_local_exchange) { _sink_buffer = p.get_sink_buffer(state->fragment_instance_id().lo); @@ -206,17 +206,12 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { std::mt19937 g(rd()); shuffle(channels.begin(), channels.end(), g); } - size_t local_size = 0; for (int i = 0; i < channels.size(); ++i) { RETURN_IF_ERROR(channels[i]->open(state)); if (channels[i]->is_local()) { - local_size++; _last_local_channel_idx = i; } } - only_local_exchange = local_size == channels.size(); - - _rpc_channels_num = channels.size() - local_size; PUniqueId id; id.set_hi(_state->query_id().hi); @@ -228,7 +223,7 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { _parent->operator_id(), _parent->node_id(), "BroadcastDependency", true); _broadcast_pb_mem_limiter = vectorized::BroadcastPBlockHolderMemLimiter::create_shared(_broadcast_dependency); - } else if (local_size > 0) { + } else if (!only_local_exchange) { size_t dep_id = 0; for (auto& channel : channels) { if (channel->is_local()) { @@ -283,6 +278,7 @@ ExchangeSinkOperatorX::ExchangeSinkOperatorX( _tablet_sink_txn_id(sink.tablet_sink_txn_id), _t_tablet_sink_exprs(&sink.tablet_sink_exprs), _enable_local_merge_sort(state->enable_local_merge_sort()), + _dest_is_merge(sink.__isset.is_merge && sink.is_merge), _fragment_instance_ids(fragment_instance_ids) { DCHECK_GT(destinations.size(), 0); DCHECK(sink.output_partition.type == TPartitionType::UNPARTITIONED || @@ -575,19 +571,13 @@ std::shared_ptr ExchangeSinkOperatorX::_create_buffer( // Therefore, a shared sink buffer is 
used here to limit the number of concurrent RPCs. // (Note: This does not reduce the total number of RPCs.) // In a merge sort scenario, there are only n RPCs, so a shared sink buffer is not needed. -/// TODO: Modify this to let FE handle the judgment instead of BE. std::shared_ptr ExchangeSinkOperatorX::get_sink_buffer( InstanceLoId sender_ins_id) { - if (!_child) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, - "ExchangeSinkOperatorX did not correctly set the child."); - } // When the child is SortSourceOperatorX or LocalExchangeSourceOperatorX, // it is an order-by scenario. // In this case, there is only one target instance, and no n * n RPC concurrency will occur. // Therefore, sharing a sink buffer is not necessary. - if (std::dynamic_pointer_cast(_child) || - std::dynamic_pointer_cast(_child)) { + if (_dest_is_merge) { return _create_buffer({sender_ins_id}); } if (_state->enable_shared_exchange_sink_buffer()) { diff --git a/be/src/pipeline/exec/exchange_sink_operator.h b/be/src/pipeline/exec/exchange_sink_operator.h index e88389b1d7bb5a..3d6eeb4b39e94f 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.h +++ b/be/src/pipeline/exec/exchange_sink_operator.h @@ -31,6 +31,7 @@ #include "vec/sink/vdata_stream_sender.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; class TDataSink; @@ -204,7 +205,6 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX get_sink_buffer(InstanceLoId sender_ins_id); vectorized::VExprContextSPtrs& tablet_sink_expr_ctxs() { return _tablet_sink_expr_ctxs; } @@ -259,8 +259,12 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX& _fragment_instance_ids; }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/exchange_source_operator.h b/be/src/pipeline/exec/exchange_source_operator.h index f938f5007d1643..ff9c5840033777 100644 --- a/be/src/pipeline/exec/exchange_source_operator.h +++ 
b/be/src/pipeline/exec/exchange_source_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class ExecNode; } // namespace doris @@ -109,4 +110,5 @@ class ExchangeSourceOperatorX final : public OperatorX { std::vector _nulls_first; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/file_scan_operator.h b/be/src/pipeline/exec/file_scan_operator.h index 2777a013d62f61..87c5bcd2e54de5 100644 --- a/be/src/pipeline/exec/file_scan_operator.h +++ b/be/src/pipeline/exec/file_scan_operator.h @@ -29,6 +29,7 @@ #include "vec/exec/scan/split_source_connector.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class VFileScanner; } // namespace vectorized @@ -86,4 +87,5 @@ class FileScanOperatorX final : public ScanOperatorX { const std::string _table_name; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.h b/be/src/pipeline/exec/group_commit_block_sink_operator.h index e469aee8df595c..5eabb280c4315d 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.h +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.h @@ -22,8 +22,9 @@ #include "runtime/group_commit_mgr.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class OlapTableBlockConvertor; -} +} // namespace doris::vectorized namespace doris::pipeline { @@ -125,4 +126,5 @@ class GroupCommitBlockSinkOperatorX final TGroupCommitMode::type _group_commit_mode; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/group_commit_scan_operator.h b/be/src/pipeline/exec/group_commit_scan_operator.h index 46f50f3772440a..d1428899ede6b9 100644 --- a/be/src/pipeline/exec/group_commit_scan_operator.h +++ b/be/src/pipeline/exec/group_commit_scan_operator.h @@ -27,6 +27,7 @@ #include "runtime/group_commit_mgr.h" 
namespace doris::pipeline { +#include "common/compile_check_begin.h" class GroupCommitOperatorX; class GroupCommitLocalState final : public ScanLocalState { @@ -60,4 +61,5 @@ class GroupCommitOperatorX final : public ScanOperatorX { const int64_t _table_id; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index b2a79a941f79e7..b71feff3ed4460 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -135,26 +135,16 @@ Status HashJoinBuildSinkLocalState::close(RuntimeState* state, Status exec_statu } }}; - if (!_runtime_filter_slots || _runtime_filters.empty() || state->is_cancelled()) { + if (!_runtime_filter_slots || _runtime_filters.empty() || state->is_cancelled() || !_eos) { return Base::close(state, exec_status); } try { - if (state->get_task()->wake_up_by_downstream()) { - if (_should_build_hash_table) { - // partitial ignore rf to make global rf work - RETURN_IF_ERROR( - _runtime_filter_slots->send_filter_size(state, 0, _finish_dependency)); - RETURN_IF_ERROR(_runtime_filter_slots->ignore_all_filters()); - } else { - // do not publish filter coz local rf not inited and useless - return Base::close(state, exec_status); - } + if (state->get_task()->wake_up_early()) { + // partitial ignore rf to make global rf work or ignore useless rf + RETURN_IF_ERROR(_runtime_filter_slots->send_filter_size(state, 0, _finish_dependency)); + RETURN_IF_ERROR(_runtime_filter_slots->ignore_all_filters()); } else if (_should_build_hash_table) { - if (p._shared_hashtable_controller && - !p._shared_hash_table_context->complete_build_stage) { - return Status::InternalError("close before sink meet eos"); - } auto* block = _shared_state->build_block.get(); uint64_t hash_table_size = block ? 
block->rows() : 0; { @@ -166,26 +156,25 @@ Status HashJoinBuildSinkLocalState::close(RuntimeState* state, Status exec_statu SCOPED_TIMER(_runtime_filter_compute_timer); _runtime_filter_slots->insert(block); } - } else if ((p._shared_hashtable_controller && !p._shared_hash_table_context->signaled) || - (p._shared_hash_table_context && - !p._shared_hash_table_context->complete_build_stage)) { - throw Exception(ErrorCode::INTERNAL_ERROR, "build_sink::close meet error state"); - } else { - RETURN_IF_ERROR( - _runtime_filter_slots->copy_from_shared_context(p._shared_hash_table_context)); } SCOPED_TIMER(_publish_runtime_filter_timer); RETURN_IF_ERROR(_runtime_filter_slots->publish(state, !_should_build_hash_table)); } catch (Exception& e) { + bool blocked_by_complete_build_stage = p._shared_hashtable_controller && + !p._shared_hash_table_context->complete_build_stage; + bool blocked_by_shared_hash_table_signal = !_should_build_hash_table && + p._shared_hashtable_controller && + !p._shared_hash_table_context->signaled; + return Status::InternalError( - "rf process meet error: {}, wake_up_by_downstream: {}, should_build_hash_table: " - "{}, _finish_dependency: {}, complete_build_stage: {}, shared_hash_table_signaled: " + "rf process meet error: {}, wake_up_early: {}, should_build_hash_table: " + "{}, _finish_dependency: {}, blocked_by_complete_build_stage: {}, " + "blocked_by_shared_hash_table_signal: " "{}", - e.to_string(), state->get_task()->wake_up_by_downstream(), _should_build_hash_table, - _finish_dependency->debug_string(), - p._shared_hash_table_context && !p._shared_hash_table_context->complete_build_stage, - p._shared_hashtable_controller && !p._shared_hash_table_context->signaled); + e.to_string(), state->get_task()->wake_up_early(), _should_build_hash_table, + _finish_dependency->debug_string(), blocked_by_complete_build_stage, + blocked_by_shared_hash_table_signal); } return Base::close(state, exec_status); } @@ -265,7 +254,7 @@ Status 
HashJoinBuildSinkLocalState::_extract_join_column( // update nulllmap and split nested out of ColumnNullable when serialize_null_into_key is false and column is nullable const auto& col_nested = nullable->get_nested_column(); const auto& col_nullmap = nullable->get_null_map_data(); - DCHECK(null_map != nullptr); + DCHECK(null_map); vectorized::VectorizedUtils::update_null_map(null_map->get_data(), col_nullmap); raw_ptrs[i] = &col_nested; } else { @@ -314,9 +303,7 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, [&](std::monostate& arg, auto join_op, auto short_circuit_for_null_in_build_side, auto with_other_conjuncts) -> Status { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - return Status::OK(); + throw Exception(Status::FatalError("FATAL: uninited hash table")); }, [&](auto&& arg, auto&& join_op, auto short_circuit_for_null_in_build_side, auto with_other_conjuncts) -> Status { @@ -479,7 +466,6 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); - local_state._eos = eos; if (local_state._should_build_hash_table) { // If eos or have already met a null value using short-circuit strategy, we do not need to pull // data from probe side. 
@@ -556,6 +542,9 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* return _shared_hash_table_context->status; } + RETURN_IF_ERROR(local_state._runtime_filter_slots->copy_from_shared_context( + _shared_hash_table_context)); + local_state.profile()->add_info_string( "SharedHashTableFrom", print_id( @@ -581,6 +570,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* } if (eos) { + local_state._eos = true; local_state.init_short_circuit_for_probe(); // Since the comparison of null values is meaningless, null aware left anti/semi join should not output null // when the build side is not empty. diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 7c663b256832ed..37ccd6206f3e0f 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -17,6 +17,8 @@ #include "hashjoin_probe_operator.h" +#include + #include #include "common/cast_set.h" @@ -240,7 +242,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc // If we use a short-circuit strategy, should return block directly by add additional null data. 
auto block_rows = local_state._probe_block.rows(); if (local_state._probe_eos && block_rows == 0) { - *eos = local_state._probe_eos; + *eos = true; return Status::OK(); } @@ -369,7 +371,7 @@ Status HashJoinProbeLocalState::_extract_join_column(vectorized::Block& block, _need_null_map_for_probe = _need_probe_null_map(block, res_col_ids); } if (_need_null_map_for_probe) { - if (_null_map_column == nullptr) { + if (!_null_map_column) { _null_map_column = vectorized::ColumnUInt8::create(); } _null_map_column->get_data().assign(block.rows(), (uint8_t)0); @@ -387,7 +389,7 @@ Status HashJoinProbeLocalState::_extract_join_column(vectorized::Block& block, // update nulllmap and split nested out of ColumnNullable when serialize_null_into_key is false and column is nullable const auto& col_nested = nullable->get_nested_column(); const auto& col_nullmap = nullable->get_null_map_data(); - DCHECK(_null_map_column != nullptr); + DCHECK(_null_map_column); vectorized::VectorizedUtils::update_null_map(_null_map_column->get_data(), col_nullmap); _probe_columns[i] = &col_nested; } else { @@ -616,21 +618,34 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { size_t idx = 0; for (const auto* slot : slots_to_check) { auto data_type = slot->get_data_type_ptr(); - auto target_data_type = idx < right_col_idx ? _left_table_data_types[idx] - : _right_table_data_types[idx - right_col_idx]; + const auto slot_on_left = idx < right_col_idx; + auto target_data_type = slot_on_left ? _left_table_data_types[idx] + : _right_table_data_types[idx - right_col_idx]; ++idx; if (data_type->equals(*target_data_type)) { continue; } - auto data_type_non_nullable = vectorized::remove_nullable(data_type); - if (data_type_non_nullable->equals(*target_data_type)) { + /// For outer join(left/right/full), the non-nullable columns may be converted to nullable. + const auto accept_nullable_not_match = + _join_op == TJoinOp::FULL_OUTER_JOIN || + (slot_on_left ? 
_join_op == TJoinOp::RIGHT_OUTER_JOIN + : _join_op == TJoinOp::LEFT_OUTER_JOIN); + + if (accept_nullable_not_match) { + auto data_type_non_nullable = vectorized::remove_nullable(data_type); + if (data_type_non_nullable->equals(*target_data_type)) { + continue; + } + } else if (data_type->equals(*target_data_type)) { continue; } - return Status::InternalError("intermediate slot({}) data type not match: '{}' vs '{}'", - slot->id(), data_type->get_name(), - _left_table_data_types[idx]->get_name()); + return Status::InternalError( + "Join node(id={}, OP={}) intermediate slot({}, #{})'s on {} table data type not " + "match: '{}' vs '{}'", + _node_id, _join_op, slot->col_name(), slot->id(), (slot_on_left ? "left" : "right"), + data_type->get_name(), target_data_type->get_name()); } _build_side_child.reset(); diff --git a/be/src/pipeline/exec/hive_table_sink_operator.h b/be/src/pipeline/exec/hive_table_sink_operator.h index 58e705fd8e46c7..8af3e5bd5e9764 100644 --- a/be/src/pipeline/exec/hive_table_sink_operator.h +++ b/be/src/pipeline/exec/hive_table_sink_operator.h @@ -21,6 +21,7 @@ #include "vec/sink/writer/vhive_table_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class HiveTableSinkOperatorX; @@ -83,4 +84,5 @@ class HiveTableSinkOperatorX final : public DataSinkOperatorX { TOdbcTableType::type _table_type; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/jdbc_table_sink_operator.h b/be/src/pipeline/exec/jdbc_table_sink_operator.h index 3ea702fd0baf0a..a0dae301a5fcad 100644 --- a/be/src/pipeline/exec/jdbc_table_sink_operator.h +++ b/be/src/pipeline/exec/jdbc_table_sink_operator.h @@ -23,6 +23,7 @@ #include "vec/sink/writer/vjdbc_table_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class JdbcTableSinkOperatorX; class JdbcTableSinkLocalState final @@ -59,4 +60,5 @@ class JdbcTableSinkOperatorX final : public DataSinkOperatorX -struct Batch { - static 
constexpr uint32_t MAX_SIZE = 7; /// Adequate values are 3, 7, 15, 31. - - uint8_t size = 0; /// It's smaller than size_t but keeps align in Arena. - Batch* next = nullptr; - RowRefType row_refs[MAX_SIZE]; - - Batch(Batch* parent) : next(parent) {} - - bool full() const { return size == MAX_SIZE; } - - Batch* insert(RowRefType&& row_ref, vectorized::Arena& pool) { - if (full()) { - auto batch = pool.alloc>(); - *batch = Batch(this); - batch->insert(std::move(row_ref), pool); - return batch; - } - - row_refs[size++] = std::move(row_ref); - return this; - } -}; - -template -class ForwardIterator { -public: - using RowRefType = typename RowRefListType::RowRefType; - ForwardIterator() : root(nullptr), first(false), batch(nullptr), position(0) {} - - ForwardIterator(RowRefListType* begin) - : root(begin), first(true), batch((&root->next)), position(0) {} - - RowRefType& operator*() { - if (first) { - return *root; - } - return batch->operator[](position); - } - - RowRefType* operator->() { return &(**this); } - - void operator++() { - if (first) { - first = false; - return; - } - - if (batch && position < batch->size()) { - ++position; - } - } - - bool ok() const { return first || (batch && position < batch->size()); } - -private: - RowRefListType* root = nullptr; - bool first; - std::vector* batch = nullptr; - size_t position; -}; - -struct RowRefList : RowRef { - using RowRefType = RowRef; - - RowRefList() = default; - RowRefList(size_t row_num_) : RowRef(row_num_) {} - - ForwardIterator begin() { return {this}; } - - /// insert element after current one - void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } - - void clear() { next.clear(); } - -private: - friend class ForwardIterator; - std::vector next; -}; - -struct RowRefListWithFlag : RowRef { - using RowRefType = RowRef; - - RowRefListWithFlag() = default; - RowRefListWithFlag(size_t row_num_) : RowRef(row_num_) {} - - ForwardIterator begin() { return {this}; } - - /// 
insert element after current one - void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } - - void clear() { next.clear(); } - - bool visited = false; - -private: - friend class ForwardIterator; - std::vector next; -}; - -struct RowRefListWithFlags : RowRefWithFlag { - using RowRefType = RowRefWithFlag; - - RowRefListWithFlags() = default; - RowRefListWithFlags(size_t row_num_) : RowRefWithFlag(row_num_) {} - - ForwardIterator begin() { return {this}; } - - /// insert element after current one - void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } - - void clear() { next.clear(); } - -private: - friend class ForwardIterator; - std::vector next; -}; - -} // namespace doris diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h b/be/src/pipeline/exec/join/process_hash_table_probe.h index 14e0edd977f57b..91fd82f0644939 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe.h @@ -19,7 +19,6 @@ #include -#include "join_op.h" #include "vec/columns/column.h" #include "vec/columns/columns_number.h" #include "vec/common/arena.h" diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index 9d79a97397ff77..2a24f6a0492f3b 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" template class JoinBuildSinkOperatorX; @@ -78,4 +79,5 @@ class JoinBuildSinkOperatorX : public DataSinkOperatorX { const std::vector _runtime_filter_descs; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/join_probe_operator.cpp b/be/src/pipeline/exec/join_probe_operator.cpp index 11b5b29c8b556b..9a50d76a48ce8c 100644 --- a/be/src/pipeline/exec/join_probe_operator.cpp +++ 
b/be/src/pipeline/exec/join_probe_operator.cpp @@ -150,7 +150,7 @@ Status JoinProbeLocalState::_build_output_block( /// TODO: maybe need a method to check if a column need to be converted to full /// column. if (is_column_const(*origin_column) || - check_column(origin_column)) { + check_column(origin_column.get())) { auto column_ptr = origin_column->convert_to_full_column_if_const(); insert_column_datas(mutable_columns[i], column_ptr, rows); } else { diff --git a/be/src/pipeline/exec/join_probe_operator.h b/be/src/pipeline/exec/join_probe_operator.h index 078806cea4fc5a..161fd18fa1dab8 100644 --- a/be/src/pipeline/exec/join_probe_operator.h +++ b/be/src/pipeline/exec/join_probe_operator.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" template class JoinProbeOperatorX; template @@ -123,4 +124,5 @@ class JoinProbeOperatorX : public StatefulOperatorX { const bool _use_specific_projections; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/memory_scratch_sink_operator.h b/be/src/pipeline/exec/memory_scratch_sink_operator.h index c74659d15b96f2..352826955fca99 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.h +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.h @@ -23,6 +23,7 @@ #include "runtime/result_queue_mgr.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class MemoryScratchSinkOperatorX; class MemoryScratchSinkLocalState final : public PipelineXSinkLocalState { @@ -67,4 +68,5 @@ class MemoryScratchSinkOperatorX final : public DataSinkOperatorX { TUserIdentity _user_identity; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/multi_cast_data_stream_sink.h b/be/src/pipeline/exec/multi_cast_data_stream_sink.h index 57b5974064b6a2..9d69b3fb5bdc9e 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_sink.h +++ 
b/be/src/pipeline/exec/multi_cast_data_stream_sink.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class MultiCastDataStreamSinkOperatorX; class MultiCastDataStreamSinkLocalState final @@ -75,4 +76,5 @@ class MultiCastDataStreamSinkOperatorX final std::atomic _num_dests; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/multi_cast_data_streamer.h b/be/src/pipeline/exec/multi_cast_data_streamer.h index 51a73cf0c2b053..380538d0ac0805 100644 --- a/be/src/pipeline/exec/multi_cast_data_streamer.h +++ b/be/src/pipeline/exec/multi_cast_data_streamer.h @@ -20,6 +20,7 @@ #include "vec/sink/vdata_stream_sender.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class Dependency; struct MultiCastBlock { @@ -84,4 +85,5 @@ class MultiCastDataStreamer { std::vector _dependencies; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h b/be/src/pipeline/exec/nested_loop_join_build_operator.h index 5c41088a7059d4..11bcba2bd8fc3a 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -23,6 +23,7 @@ #include "pipeline/exec/join_build_sink_operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class NestedLoopJoinBuildSinkOperatorX; @@ -89,4 +90,5 @@ class NestedLoopJoinBuildSinkOperatorX final RowDescriptor _row_descriptor; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 7b06e216b81bf7..b1ab62743323c6 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -40,6 +40,7 @@ #include "vec/functions/in.h" namespace doris::pipeline { +#include 
"common/compile_check_begin.h" Status OlapScanLocalState::_init_profile() { RETURN_IF_ERROR(ScanLocalState::_init_profile()); @@ -149,6 +150,47 @@ Status OlapScanLocalState::_init_profile() { _tablet_counter = ADD_COUNTER(_runtime_profile, "TabletNum", TUnit::UNIT); _key_range_counter = ADD_COUNTER(_runtime_profile, "KeyRangesNum", TUnit::UNIT); _runtime_filter_info = ADD_LABEL_COUNTER_WITH_LEVEL(_runtime_profile, "RuntimeFilterInfo", 1); + + _tablet_reader_init_timer = ADD_TIMER(_scanner_profile, "TabletReaderInitTimer"); + _tablet_reader_capture_rs_readers_timer = + ADD_TIMER(_scanner_profile, "TabletReaderCaptureRsReadersTimer"); + _tablet_reader_init_return_columns_timer = + ADD_TIMER(_scanner_profile, "TabletReaderInitReturnColumnsTimer"); + _tablet_reader_init_keys_param_timer = + ADD_TIMER(_scanner_profile, "TabletReaderInitKeysParamTimer"); + _tablet_reader_init_orderby_keys_param_timer = + ADD_TIMER(_scanner_profile, "TabletReaderInitOrderbyKeysParamTimer"); + _tablet_reader_init_conditions_param_timer = + ADD_TIMER(_scanner_profile, "TabletReaderInitConditionsParamTimer"); + _tablet_reader_init_delete_condition_param_timer = + ADD_TIMER(_scanner_profile, "TabletReaderInitDeleteConditionParamTimer"); + _block_reader_vcollect_iter_init_timer = + ADD_TIMER(_scanner_profile, "BlockReaderVcollectIterInitTimer"); + _block_reader_rs_readers_init_timer = + ADD_TIMER(_scanner_profile, "BlockReaderRsReadersInitTimer"); + _block_reader_build_heap_init_timer = + ADD_TIMER(_scanner_profile, "BlockReaderBuildHeapInitTimer"); + + _rowset_reader_get_segment_iterators_timer = + ADD_TIMER(_scanner_profile, "RowsetReaderGetSegmentIteratorsTimer"); + _rowset_reader_create_iterators_timer = + ADD_TIMER(_scanner_profile, "RowsetReaderCreateIteratorsTimer"); + _rowset_reader_init_iterators_timer = + ADD_TIMER(_scanner_profile, "RowsetReaderInitIteratorsTimer"); + _rowset_reader_load_segments_timer = + ADD_TIMER(_scanner_profile, "RowsetReaderLoadSegmentsTimer"); + + 
_segment_iterator_init_timer = ADD_TIMER(_scanner_profile, "SegmentIteratorInitTimer"); + _segment_iterator_init_return_column_iterators_timer = + ADD_TIMER(_scanner_profile, "SegmentIteratorInitReturnColumnIteratorsTimer"); + _segment_iterator_init_bitmap_index_iterators_timer = + ADD_TIMER(_scanner_profile, "SegmentIteratorInitBitmapIndexIteratorsTimer"); + _segment_iterator_init_inverted_index_iterators_timer = + ADD_TIMER(_scanner_profile, "SegmentIteratorInitInvertedIndexIteratorsTimer"); + + _segment_create_column_readers_timer = + ADD_TIMER(_scanner_profile, "SegmentCreateColumnReadersTimer"); + _segment_load_index_timer = ADD_TIMER(_scanner_profile, "SegmentLoadIndexTimer"); return Status::OK(); } @@ -204,9 +246,8 @@ Status OlapScanLocalState::_should_push_down_function_filter(vectorized::Vectori DCHECK(children[1 - i]->type().is_string_type()); std::shared_ptr const_col_wrapper; RETURN_IF_ERROR(children[1 - i]->get_const_col(expr_ctx, &const_col_wrapper)); - if (const vectorized::ColumnConst* const_column = - check_and_get_column( - const_col_wrapper->column_ptr)) { + if (const auto* const_column = check_and_get_column( + const_col_wrapper->column_ptr.get())) { *constant_str = const_column->get_data_at(0); } else { pdt = PushDownType::UNACCEPTABLE; @@ -347,13 +388,13 @@ Status OlapScanLocalState::_init_scanners(std::list* s int ranges_per_scanner = std::max(1, (int)ranges->size() / std::min(scanners_per_tablet, size_based_scanners_per_tablet)); - int num_ranges = ranges->size(); - for (int i = 0; i < num_ranges;) { + int64_t num_ranges = ranges->size(); + for (int64_t i = 0; i < num_ranges;) { std::vector scanner_ranges; scanner_ranges.push_back((*ranges)[i].get()); ++i; - for (int j = 1; i < num_ranges && j < ranges_per_scanner && - (*ranges)[i]->end_include == (*ranges)[i - 1]->end_include; + for (int64_t j = 1; i < num_ranges && j < ranges_per_scanner && + (*ranges)[i]->end_include == (*ranges)[i - 1]->end_include; ++j, ++i) { 
scanner_ranges.push_back((*ranges)[i].get()); } @@ -587,4 +628,5 @@ OlapScanOperatorX::OlapScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, i } } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index 9e8624b3a0b255..0e8e7223d4b8c5 100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -26,6 +26,7 @@ #include "pipeline/exec/scan_operator.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class NewOlapScanner; @@ -183,6 +184,33 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _runtime_filter_info = nullptr; + // timer about tablet reader + RuntimeProfile::Counter* _tablet_reader_init_timer = nullptr; + RuntimeProfile::Counter* _tablet_reader_capture_rs_readers_timer = nullptr; + RuntimeProfile::Counter* _tablet_reader_init_return_columns_timer = nullptr; + RuntimeProfile::Counter* _tablet_reader_init_keys_param_timer = nullptr; + RuntimeProfile::Counter* _tablet_reader_init_orderby_keys_param_timer = nullptr; + RuntimeProfile::Counter* _tablet_reader_init_conditions_param_timer = nullptr; + RuntimeProfile::Counter* _tablet_reader_init_delete_condition_param_timer = nullptr; + + // timer about block reader + RuntimeProfile::Counter* _block_reader_vcollect_iter_init_timer = nullptr; + RuntimeProfile::Counter* _block_reader_rs_readers_init_timer = nullptr; + RuntimeProfile::Counter* _block_reader_build_heap_init_timer = nullptr; + + RuntimeProfile::Counter* _rowset_reader_get_segment_iterators_timer = nullptr; + RuntimeProfile::Counter* _rowset_reader_create_iterators_timer = nullptr; + RuntimeProfile::Counter* _rowset_reader_init_iterators_timer = nullptr; + RuntimeProfile::Counter* _rowset_reader_load_segments_timer = nullptr; + + RuntimeProfile::Counter* _segment_iterator_init_timer = nullptr; + RuntimeProfile::Counter* 
_segment_iterator_init_return_column_iterators_timer = nullptr; + RuntimeProfile::Counter* _segment_iterator_init_bitmap_index_iterators_timer = nullptr; + RuntimeProfile::Counter* _segment_iterator_init_inverted_index_iterators_timer = nullptr; + + RuntimeProfile::Counter* _segment_create_column_readers_timer = nullptr; + RuntimeProfile::Counter* _segment_load_index_timer = nullptr; + std::mutex _profile_mtx; }; @@ -198,4 +226,5 @@ class OlapScanOperatorX final : public ScanOperatorX { TQueryCacheParam _cache_param; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/olap_table_sink_operator.h b/be/src/pipeline/exec/olap_table_sink_operator.h index 8a9ffaaf769c31..3453a57a67b9bc 100644 --- a/be/src/pipeline/exec/olap_table_sink_operator.h +++ b/be/src/pipeline/exec/olap_table_sink_operator.h @@ -21,6 +21,7 @@ #include "vec/sink/writer/vtablet_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class OlapTableSinkOperatorX; @@ -75,4 +76,5 @@ class OlapTableSinkOperatorX final : public DataSinkOperatorX DataSinkOperatorX::create_shar return nullptr; } else if constexpr (std::is_same_v) { - LOG(FATAL) << "should not reach here!"; - return nullptr; + throw Exception(Status::FatalError("should not reach here!")); } else { auto ss = LocalStateType::SharedStateType::create_shared(); ss->id = operator_id(); @@ -780,4 +780,5 @@ template class AsyncWriterSink; template class AsyncWriterSink; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index c84c4e7b43f981..df6e9c913b6b4c 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -39,6 +39,7 @@ #include "vec/runtime/vdata_stream_recvr.h" namespace doris { +#include "common/compile_check_begin.h" class RowDescriptor; class RuntimeState; class TDataSink; @@ -631,12 +632,10 @@ class OperatorXBase : public OperatorBase 
{ _limit(-1) {} virtual Status init(const TPlanNode& tnode, RuntimeState* state); Status init(const TDataSink& tsink) override { - LOG(FATAL) << "should not reach here!"; - return Status::OK(); + throw Exception(Status::FatalError("should not reach here!")); } virtual Status init(ExchangeType type) { - LOG(FATAL) << "should not reach here!"; - return Status::OK(); + throw Exception(Status::FatalError("should not reach here!")); } [[noreturn]] virtual const std::vector& runtime_filter_descs() { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, _op_name); @@ -859,4 +858,5 @@ class AsyncWriterSink : public PipelineXSinkLocalState { std::shared_ptr _finish_dependency; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.cpp b/be/src/pipeline/exec/partition_sort_sink_operator.cpp index 48b8fe9cb765a1..d0c28afe9de5ba 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_sink_operator.cpp @@ -24,6 +24,7 @@ #include "vec/common/hash_table/hash.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); @@ -66,7 +67,7 @@ PartitionSortSinkOperatorX::PartitionSortSinkOperatorX(ObjectPool* pool, int ope _pool(pool), _row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), _limit(tnode.limit), - _partition_exprs_num(tnode.partition_sort_node.partition_exprs.size()), + _partition_exprs_num(cast_set(tnode.partition_sort_node.partition_exprs.size())), _topn_phase(tnode.partition_sort_node.ptopn_phase), _has_global_limit(tnode.partition_sort_node.has_global_limit), _top_n_algorithm(tnode.partition_sort_node.top_n_algorithm), @@ -212,7 +213,7 @@ Status PartitionSortSinkOperatorX::_emplace_into_hash_table( }; SCOPED_TIMER(local_state._emplace_key_timer); - int row = 
num_rows; + int64_t row = num_rows; for (row = row - 1; row >= 0 && !local_state._is_need_passthrough; --row) { auto& mapped = *agg_method.lazy_emplace(state, row, creator, creator_for_null_key); @@ -274,4 +275,5 @@ bool PartitionSortSinkLocalState::check_whether_need_passthrough() { } // NOLINTEND(readability-simplify-boolean-expr) +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.h b/be/src/pipeline/exec/partition_sort_sink_operator.h index 6926445f18f2f4..32bbf38202713f 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.h +++ b/be/src/pipeline/exec/partition_sort_sink_operator.h @@ -24,6 +24,7 @@ #include "vec/common/sort/partition_sorter.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class PartitionSortSinkOperatorX; class PartitionSortSinkLocalState : public PipelineXSinkLocalState { @@ -110,4 +111,5 @@ class PartitionSortSinkOperatorX final : public DataSinkOperatorX { @@ -324,4 +325,5 @@ class PartitionedAggSinkOperatorX : public DataSinkOperatorX(state, _shared_state->shared_from_this(), exception_catch_func)); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h index 7e73241745e029..6fb0ecaba01e20 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -99,4 +100,5 @@ class PartitionedAggSourceOperatorX : public OperatorX std::unique_ptr _agg_source_operator; }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp 
b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp index 20b25d54ff9f16..3e7f95374f53d2 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp @@ -23,6 +23,7 @@ #include "vec/spill/spill_stream_manager.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" PartitionedHashJoinProbeLocalState::PartitionedHashJoinProbeLocalState(RuntimeState* state, OperatorXBase* parent) @@ -866,4 +867,5 @@ Status PartitionedHashJoinProbeOperatorX::get_block(RuntimeState* state, vectori return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h index f8fc0780b6fc3f..a19e88d7203e62 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h @@ -27,6 +27,7 @@ #include "pipeline/exec/spill_utils.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -213,4 +214,5 @@ class PartitionedHashJoinProbeOperatorX final }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp index 878c3870946f1c..852dccae71ca3b 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp @@ -23,6 +23,7 @@ #include "vec/spill/spill_stream_manager.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status PartitionedHashJoinSinkLocalState::init(doris::RuntimeState* state, doris::pipeline::LocalSinkStateInfo& info) { @@ -246,11 +247,11 @@ Status PartitionedHashJoinSinkLocalState::revoke_memory(RuntimeState* state) { 
return _revoke_unpartitioned_block(state); } - _spilling_streams_count = _shared_state->partitioned_build_blocks.size(); + _spilling_streams_count = cast_set(_shared_state->partitioned_build_blocks.size()); auto query_id = state->query_id(); - for (size_t i = 0; i != _shared_state->partitioned_build_blocks.size(); ++i) { + for (int i = 0; i != _shared_state->partitioned_build_blocks.size(); ++i) { vectorized::SpillStreamSPtr& spilling_stream = _shared_state->spilled_streams[i]; auto& mutable_block = _shared_state->partitioned_build_blocks[i]; @@ -555,4 +556,5 @@ Status PartitionedHashJoinSinkOperatorX::revoke_memory(RuntimeState* state) { return local_state.revoke_memory(state); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h index d1fe30e06f2dd2..e16e52dcaf9453 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h @@ -28,6 +28,7 @@ #include "vec/runtime/partitioner.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -148,4 +149,5 @@ class PartitionedHashJoinSinkOperatorX }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/repeat_operator.cpp b/be/src/pipeline/exec/repeat_operator.cpp index 5c94d43f0d1e05..48131e0d96e4c6 100644 --- a/be/src/pipeline/exec/repeat_operator.cpp +++ b/be/src/pipeline/exec/repeat_operator.cpp @@ -24,6 +24,7 @@ #include "vec/core/block.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -221,8 +222,7 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp _repeat_id_idx++; - int size = _repeat_id_list.size(); - if (_repeat_id_idx >= size) { + if (_repeat_id_idx >= _repeat_id_list.size()) { 
_intermediate_block->clear(); _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); _repeat_id_idx = 0; @@ -251,4 +251,5 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/repeat_operator.h b/be/src/pipeline/exec/repeat_operator.h index 31f88f37231aaa..2c2af32de0b0fb 100644 --- a/be/src/pipeline/exec/repeat_operator.h +++ b/be/src/pipeline/exec/repeat_operator.h @@ -23,6 +23,7 @@ #include "pipeline/exec/operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -92,4 +93,5 @@ class RepeatOperatorX final : public StatefulOperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/result_file_sink_operator.cpp b/be/src/pipeline/exec/result_file_sink_operator.cpp index c65b9dda89d0ec..f806d9533d9e4c 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.cpp +++ b/be/src/pipeline/exec/result_file_sink_operator.cpp @@ -28,6 +28,7 @@ #include "vec/sink/vdata_stream_sender.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" ResultFileSinkLocalState::ResultFileSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) @@ -143,4 +144,5 @@ Status ResultFileSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_ return local_state.sink(state, in_block, eos); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/result_file_sink_operator.h b/be/src/pipeline/exec/result_file_sink_operator.h index e9f2b8eeb9c670..c3c5e345f77e1a 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.h +++ b/be/src/pipeline/exec/result_file_sink_operator.h @@ -21,6 +21,7 @@ #include "vec/sink/writer/vfile_result_writer.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" 
class BroadcastPBlockHolder; } // namespace doris::vectorized @@ -88,4 +89,5 @@ class ResultFileSinkOperatorX final : public DataSinkOperatorX _sender = nullptr; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index f8196910021b2c..8aeecbbddc12dc 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -35,6 +35,7 @@ #include "vec/sink/vmysql_result_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -208,4 +209,5 @@ Status ResultSinkLocalState::close(RuntimeState* state, Status exec_status) { return final_status; } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/result_sink_operator.h b/be/src/pipeline/exec/result_sink_operator.h index 339c167825643b..479343ed6d5ea5 100644 --- a/be/src/pipeline/exec/result_sink_operator.h +++ b/be/src/pipeline/exec/result_sink_operator.h @@ -25,6 +25,7 @@ #include "runtime/result_writer.h" namespace doris { +#include "common/compile_check_begin.h" class BufferControlBlock; namespace pipeline { @@ -172,4 +173,5 @@ class ResultSinkOperatorX final : public DataSinkOperatorX }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index ae4396b22c7eec..a7802f33e249db 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -520,8 +520,8 @@ Status ScanLocalState::_eval_const_conjuncts(vectorized::VExpr* vexpr, if (vexpr->is_constant()) { std::shared_ptr const_col_wrapper; RETURN_IF_ERROR(vexpr->get_const_col(expr_ctx, &const_col_wrapper)); - if (const auto* const_column = - 
check_and_get_column(const_col_wrapper->column_ptr)) { + if (const auto* const_column = check_and_get_column( + const_col_wrapper->column_ptr.get())) { constant_val = const_cast(const_column->get_data_at(0).data); if (constant_val == nullptr || !*reinterpret_cast(constant_val)) { *pdt = PushDownType::ACCEPTABLE; @@ -530,7 +530,7 @@ Status ScanLocalState::_eval_const_conjuncts(vectorized::VExpr* vexpr, } } else if (const auto* bool_column = check_and_get_column>( - const_col_wrapper->column_ptr)) { + const_col_wrapper->column_ptr.get())) { // TODO: If `vexpr->is_constant()` is true, a const column is expected here. // But now we still don't cover all predicates for const expression. // For example, for query `SELECT col FROM tbl WHERE 'PROMOTION' LIKE 'AAA%'`, @@ -690,7 +690,7 @@ Status ScanLocalState::_should_push_down_binary_predicate( std::shared_ptr const_col_wrapper; RETURN_IF_ERROR(children[1 - i]->get_const_col(expr_ctx, &const_col_wrapper)); if (const auto* const_column = check_and_get_column( - const_col_wrapper->column_ptr)) { + const_col_wrapper->column_ptr.get())) { *slot_ref_child = i; *constant_val = const_column->get_data_at(0); } else { @@ -1190,9 +1190,11 @@ Status ScanOperatorX::init(const TPlanNode& tnode, RuntimeState* // is checked in previous branch. 
if (query_options.enable_adaptive_pipeline_task_serial_read_on_limit) { DCHECK(query_options.__isset.adaptive_pipeline_task_serial_read_on_limit); - if (tnode.limit > 0 && - tnode.limit <= query_options.adaptive_pipeline_task_serial_read_on_limit) { - _should_run_serial = true; + if (!tnode.__isset.conjuncts || tnode.conjuncts.empty()) { + if (tnode.limit > 0 && + tnode.limit <= query_options.adaptive_pipeline_task_serial_read_on_limit) { + _should_run_serial = true; + } } } } diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index 4519a3ca283f6f..c6c9cdf405d5a4 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -35,8 +35,9 @@ #include "vec/utils/util.hpp" namespace doris::vectorized { +#include "common/compile_check_begin.h" class ScannerDelegate; -} +} // namespace doris::vectorized namespace doris::pipeline { @@ -436,4 +437,5 @@ class ScanOperatorX : public OperatorX { std::vector topn_filter_source_node_ids; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/schema_scan_operator.cpp b/be/src/pipeline/exec/schema_scan_operator.cpp index ddc2821cac14a1..2e2f80f5e24838 100644 --- a/be/src/pipeline/exec/schema_scan_operator.cpp +++ b/be/src/pipeline/exec/schema_scan_operator.cpp @@ -26,6 +26,7 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -144,7 +145,7 @@ Status SchemaScanOperatorX::open(RuntimeState* state) { return Status::InternalError("Failed to get tuple descriptor."); } - _slot_num = _dest_tuple_desc->slots().size(); + _slot_num = cast_set(_dest_tuple_desc->slots().size()); // get src tuple desc const auto* schema_table = static_cast(_dest_tuple_desc->table_desc()); @@ -269,4 +270,5 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, vectorized::Block* bl return Status::OK(); } +#include 
"common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/schema_scan_operator.h b/be/src/pipeline/exec/schema_scan_operator.h index c8ddf885e98a0f..2d861002748163 100644 --- a/be/src/pipeline/exec/schema_scan_operator.h +++ b/be/src/pipeline/exec/schema_scan_operator.h @@ -24,6 +24,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -88,4 +89,5 @@ class SchemaScanOperatorX final : public OperatorX { std::unique_ptr _schema_scanner; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/exec/select_operator.h b/be/src/pipeline/exec/select_operator.h index 5370cd9e293c34..584a6f74308903 100644 --- a/be/src/pipeline/exec/select_operator.h +++ b/be/src/pipeline/exec/select_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class SelectOperatorX; class SelectLocalState final : public PipelineXLocalState { @@ -55,4 +56,5 @@ class SelectOperatorX final : public StreamingOperatorX { [[nodiscard]] bool is_source() const override { return false; } }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index 4c250d5603b499..db487b0f9e7252 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -25,6 +25,7 @@ #include "vec/common/hash_table/hash_table_set_probe.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -69,7 +70,7 @@ Status SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); - auto probe_rows = in_block->rows(); + uint32_t probe_rows = 
cast_set(in_block->rows()); if (probe_rows > 0) { { SCOPED_TIMER(local_state._extract_probe_data_timer); @@ -220,8 +221,8 @@ void SetProbeSinkOperatorX::_refresh_hash_table( ? (valid_element_in_hash_tbl < arg.hash_table ->size()) // When intersect, shrink as long as the element decreases - : (valid_element_in_hash_tbl < - arg.hash_table->size() * + : ((double)valid_element_in_hash_tbl < + (double)arg.hash_table->size() * need_shrink_ratio); // When except, element decreases need to within the 'need_shrink_ratio' before shrinking if (is_need_shrink) { @@ -231,7 +232,7 @@ void SetProbeSinkOperatorX::_refresh_hash_table( local_state._shared_state->valid_element_in_hash_tbl); while (iter != iter_end) { auto& mapped = iter->get_second(); - auto it = mapped.begin(); + auto* it = &mapped; if constexpr (is_intersect) { if (it->visited) { @@ -249,7 +250,7 @@ void SetProbeSinkOperatorX::_refresh_hash_table( } else if (is_intersect) { while (iter != iter_end) { auto& mapped = iter->get_second(); - auto it = mapped.begin(); + auto* it = &mapped; it->visited = false; ++iter; } @@ -269,4 +270,5 @@ template class SetProbeSinkLocalState; template class SetProbeSinkOperatorX; template class SetProbeSinkOperatorX; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/set_probe_sink_operator.h b/be/src/pipeline/exec/set_probe_sink_operator.h index 368ea812cdfe01..6b764c1e509951 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.h +++ b/be/src/pipeline/exec/set_probe_sink_operator.h @@ -23,6 +23,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -116,4 +117,5 @@ class SetProbeSinkOperatorX final : public DataSinkOperatorX::_get_data_in_hashtable( auto block_size = 0; auto add_result = [&local_state, &block_size, this](auto value) { - auto it = value.begin(); + auto* it = &value; if constexpr (is_intersect) { if (it->visited) { //intersected: have 
done probe, so visited values it's the result _add_result_columns(local_state, value, block_size); @@ -147,8 +147,8 @@ Status SetSourceOperatorX::_get_data_in_hashtable( *eos = iter == hash_table_ctx.hash_table->end(); if (*eos && hash_table_ctx.hash_table->has_null_key_data()) { - auto value = hash_table_ctx.hash_table->template get_null_key_data(); - if constexpr (std::is_same_v>) { + auto value = hash_table_ctx.hash_table->template get_null_key_data(); + if constexpr (std::is_same_v>) { add_result(value); } } @@ -168,15 +168,13 @@ Status SetSourceOperatorX::_get_data_in_hashtable( template void SetSourceOperatorX::_add_result_columns( - SetSourceLocalState& local_state, RowRefListWithFlags& value, - int& block_size) { + SetSourceLocalState& local_state, RowRefWithFlag& value, int& block_size) { auto& build_col_idx = local_state._shared_state->build_col_idx; auto& build_block = local_state._shared_state->build_block; - auto it = value.begin(); for (auto idx = build_col_idx.begin(); idx != build_col_idx.end(); ++idx) { auto& column = *build_block.get_by_position(idx->second).column; - local_state._mutable_cols[idx->first]->insert_from(column, it->row_num); + local_state._mutable_cols[idx->first]->insert_from(column, value.row_num); } block_size++; } diff --git a/be/src/pipeline/exec/set_source_operator.h b/be/src/pipeline/exec/set_source_operator.h index 976ffde3bf23ea..d881e9277fb7b6 100644 --- a/be/src/pipeline/exec/set_source_operator.h +++ b/be/src/pipeline/exec/set_source_operator.h @@ -83,8 +83,8 @@ class SetSourceOperatorX final : public OperatorX& local_state, - RowRefListWithFlags& value, int& block_size); + void _add_result_columns(SetSourceLocalState& local_state, RowRefWithFlag& value, + int& block_size); const size_t _child_quantity; }; #include "common/compile_check_end.h" diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp b/be/src/pipeline/exec/sort_sink_operator.cpp index 072f28723a36ea..6bec42ac62d192 100644 --- 
a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -25,6 +25,7 @@ #include "vec/common/sort/topn_sorter.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status SortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -176,4 +177,5 @@ void SortSinkOperatorX::reset(RuntimeState* state) { auto& local_state = get_local_state(state); local_state._shared_state->sorter->reset(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/sort_sink_operator.h b/be/src/pipeline/exec/sort_sink_operator.h index 6bf87164e71026..766c6c0ffc9a59 100644 --- a/be/src/pipeline/exec/sort_sink_operator.h +++ b/be/src/pipeline/exec/sort_sink_operator.h @@ -23,6 +23,7 @@ #include "vec/core/field.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class SortSinkOperatorX; @@ -109,4 +110,5 @@ class SortSinkOperatorX final : public DataSinkOperatorX { const bool _reuse_mem; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/sort_source_operator.cpp b/be/src/pipeline/exec/sort_source_operator.cpp index 7f801b79c0b12b..2fb09d7278fda8 100644 --- a/be/src/pipeline/exec/sort_source_operator.cpp +++ b/be/src/pipeline/exec/sort_source_operator.cpp @@ -22,6 +22,7 @@ #include "pipeline/exec/operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" SortLocalState::SortLocalState(RuntimeState* state, OperatorXBase* parent) : PipelineXLocalState(state, parent) {} @@ -79,4 +80,5 @@ Status SortSourceOperatorX::build_merger(RuntimeState* state, return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/sort_source_operator.h b/be/src/pipeline/exec/sort_source_operator.h index 20714eb44e5e60..a638b04b368eaa 100644 --- a/be/src/pipeline/exec/sort_source_operator.h 
+++ b/be/src/pipeline/exec/sort_source_operator.h @@ -23,6 +23,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -69,4 +70,5 @@ class SortSourceOperatorX final : public OperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/spill_utils.h b/be/src/pipeline/exec/spill_utils.h index 925e7df44e607e..2ba6f22a60b10c 100644 --- a/be/src/pipeline/exec/spill_utils.h +++ b/be/src/pipeline/exec/spill_utils.h @@ -26,6 +26,7 @@ #include "vec/runtime/partitioner.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" using SpillPartitionerType = vectorized::Crc32HashPartitioner; class SpillRunnable : public Runnable { @@ -70,4 +71,5 @@ class SpillRunnable : public Runnable { std::function _func; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index 1c8d2c47bc698a..b6e5788a07c626 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -29,6 +29,7 @@ #include "vec/exprs/vslot_ref.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -228,7 +229,7 @@ Status StreamingAggLocalState::_merge_with_serialized_key_helper(vectorized::Blo } } - int rows = block->rows(); + size_t rows = block->rows(); if (_places.size() < rows) { _places.resize(rows); } @@ -270,7 +271,7 @@ Status StreamingAggLocalState::_merge_with_serialized_key_helper(vectorized::Blo for (int i = 0; i < _aggregate_evaluators.size(); ++i) { if (_aggregate_evaluators[i]->is_merge() || for_spill) { - int col_id = 0; + size_t col_id = 0; if constexpr (for_spill) { col_id = _probe_expr_ctxs.size() + i; } else { @@ -403,7 +404,7 @@ Status 
StreamingAggLocalState::_execute_with_serialized_key_helper(vectorized::B } } - int rows = block->rows(); + size_t rows = block->rows(); if (_places.size() < rows) { _places.resize(rows); } @@ -542,8 +543,8 @@ bool StreamingAggLocalState::_should_expand_preagg_hash_tables() { const int64_t aggregated_input_rows = input_rows - _cur_num_rows_returned; // TODO chenhao // const int64_t expected_input_rows = estimated_input_cardinality_ - num_rows_returned_; - double current_reduction = - static_cast(aggregated_input_rows) / ht_rows; + double current_reduction = static_cast(aggregated_input_rows) / + static_cast(ht_rows); // TODO: workaround for IMPALA-2490: subplan node rows_returned counter may be // inaccurate, which could lead to a divide by zero below. @@ -615,7 +616,7 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B } } - int rows = in_block->rows(); + size_t rows = in_block->rows(); _places.resize(rows); // Stop expanding hash tables if we're not reducing the input sufficiently. 
As our @@ -739,7 +740,7 @@ Status StreamingAggLocalState::_get_with_serialized_key_result(RuntimeState* sta auto columns_with_schema = vectorized::VectorizedUtils::create_columns_with_type_and_name(p._row_descriptor); - int key_size = _probe_expr_ctxs.size(); + size_t key_size = _probe_expr_ctxs.size(); vectorized::MutableColumns key_columns; for (int i = 0; i < key_size; ++i) { @@ -750,7 +751,7 @@ Status StreamingAggLocalState::_get_with_serialized_key_result(RuntimeState* sta } } vectorized::MutableColumns value_columns; - for (int i = key_size; i < columns_with_schema.size(); ++i) { + for (size_t i = key_size; i < columns_with_schema.size(); ++i) { if (!mem_reuse) { value_columns.emplace_back(columns_with_schema[i].type->create_column()); } else { @@ -852,7 +853,7 @@ Status StreamingAggLocalState::_get_results_without_key(RuntimeState* state, block->clear(); DCHECK(_agg_data->without_key != nullptr); - int agg_size = _aggregate_evaluators.size(); + const auto agg_size = _aggregate_evaluators.size(); vectorized::MutableColumns value_columns(agg_size); std::vector data_types(agg_size); @@ -888,8 +889,8 @@ Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* st bool* eos) { SCOPED_TIMER(_get_results_timer); auto& p = _parent->cast(); - int key_size = _probe_expr_ctxs.size(); - int agg_size = _aggregate_evaluators.size(); + const auto key_size = _probe_expr_ctxs.size(); + const auto agg_size = _aggregate_evaluators.size(); vectorized::MutableColumns value_columns(agg_size); vectorized::DataTypes value_data_types(agg_size); @@ -1013,7 +1014,7 @@ Status StreamingAggLocalState::_get_without_key_result(RuntimeState* state, auto& p = _parent->cast(); *block = vectorized::VectorizedUtils::create_empty_columnswithtypename(p._row_descriptor); - int agg_size = _aggregate_evaluators.size(); + const auto agg_size = _aggregate_evaluators.size(); vectorized::MutableColumns columns(agg_size); std::vector data_types(agg_size); @@ -1170,8 +1171,8 @@ Status 
StreamingAggOperatorX::open(RuntimeState* state) { DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size()); RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child->row_desc())); - int j = _probe_expr_ctxs.size(); - for (int i = 0; i < j; ++i) { + size_t j = _probe_expr_ctxs.size(); + for (size_t i = 0; i < j; ++i) { auto nullable_output = _output_tuple_desc->slots()[i]->is_nullable(); auto nullable_input = _probe_expr_ctxs[i]->root()->is_nullable(); if (nullable_output != nullable_input) { @@ -1179,7 +1180,7 @@ Status StreamingAggOperatorX::open(RuntimeState* state) { _make_nullable_keys.emplace_back(i); } } - for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { + for (size_t i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j]; RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( @@ -1290,4 +1291,5 @@ bool StreamingAggOperatorX::need_more_input_data(RuntimeState* state) const { return local_state._pre_aggregated_block->empty() && !local_state._child_eos; } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.h b/be/src/pipeline/exec/streaming_aggregation_operator.h index b695880ac2857b..bd35cd940f2974 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/streaming_aggregation_operator.h @@ -27,6 +27,7 @@ #include "vec/core/block.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -237,4 +238,5 @@ class StreamingAggOperatorX final : public StatefulOperatorXget_value( columns[p._child_slots.size() + p._fn_num - 1], - state->batch_size() - columns[p._child_slots.size()]->size()); + //// It has already been checked that + // columns[p._child_slots.size()]->size() < state->batch_size(), + // so 
columns[p._child_slots.size()]->size() will not exceed the range of int. + state->batch_size() - (int)columns[p._child_slots.size()]->size()); _current_row_insert_times += repeat_times; for (int i = 0; i < p._fn_num - 1; i++) { _fns[i]->get_same_many_values(columns[i + p._child_slots.size()], repeat_times); @@ -276,7 +280,7 @@ Status TableFunctionOperatorX::init(const TPlanNode& tnode, RuntimeState* state) fn->set_expr_context(ctx); _fns.push_back(fn); } - _fn_num = _fns.size(); + _fn_num = cast_set(_fns.size()); // Prepare output slot ids RETURN_IF_ERROR(_prepare_output_slot_ids(tnode)); @@ -304,7 +308,7 @@ Status TableFunctionOperatorX::open(doris::RuntimeState* state) { } } - for (size_t i = 0; i < _child_slots.size(); i++) { + for (int i = 0; i < _child_slots.size(); i++) { if (_slot_need_copy(i)) { _output_slot_indexs.push_back(i); } else { @@ -315,4 +319,5 @@ Status TableFunctionOperatorX::open(doris::RuntimeState* state) { return vectorized::VExpr::open(_vfn_ctxs, state); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/table_function_operator.h b/be/src/pipeline/exec/table_function_operator.h index 81160acb7f7611..9aa26e9ae22b10 100644 --- a/be/src/pipeline/exec/table_function_operator.h +++ b/be/src/pipeline/exec/table_function_operator.h @@ -24,6 +24,7 @@ #include "vec/exprs/table_function/table_function.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -154,4 +155,5 @@ class TableFunctionOperatorX final : public StatefulOperatorX _child_slot_sizes; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/union_sink_operator.cpp b/be/src/pipeline/exec/union_sink_operator.cpp index 8467eeb1d5467a..56491b5258bc55 100644 --- a/be/src/pipeline/exec/union_sink_operator.cpp +++ b/be/src/pipeline/exec/union_sink_operator.cpp @@ -19,6 +19,7 @@ #include +#include "common/cast_set.h" #include 
"common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "pipeline/exec/data_queue.h" @@ -27,6 +28,7 @@ #include "util/runtime_profile.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status UnionSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -54,7 +56,8 @@ Status UnionSinkLocalState::open(RuntimeState* state) { UnionSinkOperatorX::UnionSinkOperatorX(int child_id, int sink_id, ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : Base(sink_id, tnode.node_id, tnode.node_id), - _first_materialized_child_idx(tnode.union_node.first_materialized_child_idx), + _first_materialized_child_idx( + cast_set(tnode.union_node.first_materialized_child_idx)), _row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), _cur_child_id(child_id), _child_size(tnode.num_children) {} @@ -130,4 +133,5 @@ Status UnionSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/union_sink_operator.h b/be/src/pipeline/exec/union_sink_operator.h index aa94ed9a73038f..3a8880622cb108 100644 --- a/be/src/pipeline/exec/union_sink_operator.h +++ b/be/src/pipeline/exec/union_sink_operator.h @@ -26,6 +26,7 @@ #include "vec/core/block.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -152,4 +153,5 @@ class UnionSinkOperatorX final : public DataSinkOperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/exec/union_source_operator.cpp b/be/src/pipeline/exec/union_source_operator.cpp index ecaaf22922b657..d13658488e2c9b 100644 --- a/be/src/pipeline/exec/union_source_operator.cpp +++ b/be/src/pipeline/exec/union_source_operator.cpp @@ -30,6 +30,7 @@ #include "vec/core/block.h" 
namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -148,7 +149,7 @@ Status UnionSourceOperatorX::get_next_const(RuntimeState* state, vectorized::Blo vectorized::Block tmp_block; tmp_block.insert({vectorized::ColumnUInt8::create(1), std::make_shared(), ""}); - int const_expr_lists_size = _const_expr_lists[_const_expr_list_idx].size(); + int const_expr_lists_size = cast_set(_const_expr_lists[_const_expr_list_idx].size()); if (_const_expr_list_idx && const_expr_lists_size != _const_expr_lists[0].size()) { return Status::InternalError( "[UnionNode]const expr at {}'s count({}) not matched({} expected)", @@ -183,4 +184,5 @@ Status UnionSourceOperatorX::get_next_const(RuntimeState* state, vectorized::Blo } } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/union_source_operator.h b/be/src/pipeline/exec/union_source_operator.h index 200e7de8597b91..0ee66c3da7447b 100644 --- a/be/src/pipeline/exec/union_source_operator.h +++ b/be/src/pipeline/exec/union_source_operator.h @@ -24,6 +24,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -123,4 +124,5 @@ class UnionSourceOperatorX final : public OperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp index 22007a4b220348..b22ee9fd77e72f 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp @@ -144,7 +144,10 @@ Status LocalExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), 
(int64_t)in_block->rows()); - RETURN_IF_ERROR(local_state._exchanger->sink(state, in_block, eos, local_state)); + RETURN_IF_ERROR(local_state._exchanger->sink( + state, in_block, eos, + {local_state._compute_hash_value_timer, local_state._distribute_timer, nullptr}, + {&local_state._channel_id, local_state._partitioner.get(), &local_state})); // If all exchange sources ended due to limit reached, current task should also finish if (local_state._exchanger->_running_source_operators == 0) { diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h index 435f7a410a4ca6..c067f023c8d420 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h @@ -65,7 +65,6 @@ class LocalExchangeSinkLocalState final : public PipelineXSinkLocalState _partitioner = nullptr; - std::vector _partition_rows_histogram; // Used by random passthrough exchanger int _channel_id = 0; diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp index c4832b9958c00d..63e36cdfdb0c01 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp @@ -61,10 +61,10 @@ Status LocalExchangeSourceLocalState::close(RuntimeState* state) { } if (_exchanger) { - _exchanger->close(*this); + _exchanger->close({_channel_id, this}); } if (_shared_state) { - _shared_state->sub_running_source_operators(*this); + _shared_state->sub_running_source_operators(); } std::vector {}.swap(_local_merge_deps); @@ -116,7 +116,9 @@ Status LocalExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized:: bool* eos) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - RETURN_IF_ERROR(local_state._exchanger->get_block(state, block, eos, 
local_state)); + RETURN_IF_ERROR(local_state._exchanger->get_block( + state, block, eos, {nullptr, nullptr, local_state._copy_data_timer}, + {local_state._channel_id, &local_state})); local_state.reached_limit(block, eos); return Status::OK(); } diff --git a/be/src/pipeline/local_exchange/local_exchanger.cpp b/be/src/pipeline/local_exchange/local_exchanger.cpp index 647988f8b794cb..a963de8b684310 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.cpp +++ b/be/src/pipeline/local_exchange/local_exchanger.cpp @@ -29,8 +29,12 @@ namespace doris::pipeline { #include "common/compile_check_begin.h" template void Exchanger::_enqueue_data_and_set_ready(int channel_id, - LocalExchangeSinkLocalState& local_state, + LocalExchangeSinkLocalState* local_state, BlockType&& block) { + if (local_state == nullptr) { + _enqueue_data_and_set_ready(channel_id, std::move(block)); + return; + } size_t allocated_bytes = 0; // PartitionedBlock is used by shuffle exchanger. // PartitionedBlock will be push into multiple queues with different row ranges, so it will be @@ -44,47 +48,47 @@ void Exchanger::_enqueue_data_and_set_ready(int channel_id, allocated_bytes = block->data_block.allocated_bytes(); } std::unique_lock l(_m); - local_state._shared_state->add_mem_usage(channel_id, allocated_bytes, - !std::is_same_v && - !std::is_same_v); + local_state->_shared_state->add_mem_usage(channel_id, allocated_bytes, + !std::is_same_v && + !std::is_same_v); if (_data_queue[channel_id].enqueue(std::move(block))) { - local_state._shared_state->set_ready_to_read(channel_id); + local_state->_shared_state->set_ready_to_read(channel_id); } else { - local_state._shared_state->sub_mem_usage(channel_id, allocated_bytes); + local_state->_shared_state->sub_mem_usage(channel_id, allocated_bytes); // `enqueue(block)` return false iff this queue's source operator is already closed so we // just unref the block. 
if constexpr (std::is_same_v || std::is_same_v) { - block.first->unref(local_state._shared_state, allocated_bytes, channel_id); + block.first->unref(local_state->_shared_state, allocated_bytes, channel_id); } else { - block->unref(local_state._shared_state, allocated_bytes, channel_id); + block->unref(local_state->_shared_state, allocated_bytes, channel_id); DCHECK_EQ(block->ref_value(), 0); } } } template -bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, - BlockType& block, bool* eos, - vectorized::Block* data_block) { - return _dequeue_data(local_state, block, eos, data_block, local_state._channel_id); -} - -template -bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, +bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState* local_state, BlockType& block, bool* eos, vectorized::Block* data_block, int channel_id) { + if (local_state == nullptr) { + if (!_dequeue_data(block, eos, data_block, channel_id)) { + throw Exception(ErrorCode::INTERNAL_ERROR, "Exchanger has no data: {}", + data_queue_debug_string(channel_id)); + } + return true; + } bool all_finished = _running_sink_operators == 0; if (_data_queue[channel_id].try_dequeue(block)) { if constexpr (std::is_same_v || std::is_same_v) { - local_state._shared_state->sub_mem_usage(channel_id, - block.first->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage(channel_id, + block.first->data_block.allocated_bytes()); } else { - local_state._shared_state->sub_mem_usage(channel_id, - block->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage(channel_id, + block->data_block.allocated_bytes()); data_block->swap(block->data_block); - block->unref(local_state._shared_state, data_block->allocated_bytes(), channel_id); + block->unref(local_state->_shared_state, data_block->allocated_bytes(), channel_id); DCHECK_EQ(block->ref_value(), 0); } return true; @@ -95,54 +99,88 @@ bool 
Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_st if (_data_queue[channel_id].try_dequeue(block)) { if constexpr (std::is_same_v || std::is_same_v) { - local_state._shared_state->sub_mem_usage(channel_id, - block.first->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage( + channel_id, block.first->data_block.allocated_bytes()); } else { - local_state._shared_state->sub_mem_usage(channel_id, - block->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage(channel_id, + block->data_block.allocated_bytes()); data_block->swap(block->data_block); - block->unref(local_state._shared_state, data_block->allocated_bytes(), channel_id); + block->unref(local_state->_shared_state, data_block->allocated_bytes(), channel_id); DCHECK_EQ(block->ref_value(), 0); } return true; } - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); + COUNTER_UPDATE(local_state->_get_block_failed_counter, 1); + local_state->_dependency->block(); + } + return false; +} + +template +void Exchanger::_enqueue_data_and_set_ready(int channel_id, BlockType&& block) { + if constexpr (!std::is_same_v && + !std::is_same_v) { + block->ref(1); + } + if (!_data_queue[channel_id].enqueue(std::move(block))) { + if constexpr (std::is_same_v || + std::is_same_v) { + block.first->unref(); + } else { + block->unref(); + DCHECK_EQ(block->ref_value(), 0); + } + } +} + +template +bool Exchanger::_dequeue_data(BlockType& block, bool* eos, vectorized::Block* data_block, + int channel_id) { + if (_data_queue[channel_id].try_dequeue(block)) { + if constexpr (!std::is_same_v && + !std::is_same_v) { + data_block->swap(block->data_block); + block->unref(); + DCHECK_EQ(block->ref_value(), 0); + } + return true; } return false; } Status ShuffleExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { 
return Status::OK(); } { - SCOPED_TIMER(local_state._compute_hash_value_timer); - RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, in_block)); + SCOPED_TIMER(profile.compute_hash_value_timer); + RETURN_IF_ERROR(sink_info.partitioner->do_partitioning(state, in_block)); } { - SCOPED_TIMER(local_state._distribute_timer); - RETURN_IF_ERROR(_split_rows(state, - local_state._partitioner->get_channel_ids().get(), - in_block, local_state)); + SCOPED_TIMER(profile.distribute_timer); + RETURN_IF_ERROR(_split_rows(state, sink_info.partitioner->get_channel_ids().get(), + in_block, *sink_info.channel_id, sink_info.local_state)); } return Status::OK(); } -void ShuffleExchanger::close(LocalExchangeSourceLocalState& local_state) { +void ShuffleExchanger::close(SourceInfo&& source_info) { PartitionedBlock partitioned_block; bool eos; vectorized::Block block; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, partitioned_block, &eos, &block)) { - partitioned_block.first->unref(local_state._shared_state, local_state._channel_id); + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, partitioned_block, &eos, &block, + source_info.channel_id)) { + partitioned_block.first->unref( + source_info.local_state ? 
source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } } Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { + Profile&& profile, SourceInfo&& source_info) { PartitionedBlock partitioned_block; vectorized::MutableBlock mutable_block; @@ -153,14 +191,18 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block auto block_wrapper = partitioned_block.first; RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->data_block, offset_start, offset_start + partitioned_block.second.length)); - block_wrapper->unref(local_state._shared_state, local_state._channel_id); + block_wrapper->unref( + source_info.local_state ? source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } while (mutable_block.rows() < state->batch_size() && !*eos && - _dequeue_data(local_state, partitioned_block, eos, block)); + _dequeue_data(source_info.local_state, partitioned_block, eos, block, + source_info.channel_id)); return Status::OK(); }; - if (_dequeue_data(local_state, partitioned_block, eos, block)) { - SCOPED_TIMER(local_state._copy_data_timer); + if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, + source_info.channel_id)) { + SCOPED_TIMER(profile.copy_data_timer); mutable_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->data_block); RETURN_IF_ERROR(get_data()); @@ -169,22 +211,25 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block } Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, - LocalExchangeSinkLocalState& local_state) { + vectorized::Block* block, int channel_id, + LocalExchangeSinkLocalState* local_state) { + if (local_state == nullptr) { + return _split_rows(state, channel_ids, block, channel_id); + } const auto rows = cast_set(block->rows()); 
auto row_idx = std::make_shared>(rows); + auto& partition_rows_histogram = _partition_rows_histogram[channel_id]; { - local_state._partition_rows_histogram.assign(_num_partitions + 1, 0); + partition_rows_histogram.assign(_num_partitions + 1, 0); for (int32_t i = 0; i < rows; ++i) { - local_state._partition_rows_histogram[channel_ids[i]]++; + partition_rows_histogram[channel_ids[i]]++; } for (int32_t i = 1; i <= _num_partitions; ++i) { - local_state._partition_rows_histogram[i] += - local_state._partition_rows_histogram[i - 1]; + partition_rows_histogram[i] += partition_rows_histogram[i - 1]; } for (int32_t i = rows - 1; i >= 0; --i) { - (*row_idx)[local_state._partition_rows_histogram[channel_ids[i]] - 1] = i; - local_state._partition_rows_histogram[channel_ids[i]]--; + (*row_idx)[partition_rows_histogram[channel_ids[i]] - 1] = i; + partition_rows_histogram[channel_ids[i]]--; } } @@ -200,10 +245,10 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest if (new_block_wrapper->data_block.empty()) { return Status::OK(); } - local_state._shared_state->add_total_mem_usage(new_block_wrapper->data_block.allocated_bytes(), - local_state._channel_id); + local_state->_shared_state->add_total_mem_usage(new_block_wrapper->data_block.allocated_bytes(), + channel_id); auto bucket_seq_to_instance_idx = - local_state._parent->cast()._bucket_seq_to_instance_idx; + local_state->_parent->cast()._bucket_seq_to_instance_idx; if (get_type() == ExchangeType::HASH_SHUFFLE) { /** * If type is `HASH_SHUFFLE`, data are hash-shuffled and distributed to all instances of @@ -211,32 +256,32 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest * For example, row 1 get a hash value 1 which means we should distribute to instance 1 on * BE 1 and row 2 get a hash value 2 which means we should distribute to instance 1 on BE 3. 
*/ - const auto& map = local_state._parent->cast() + const auto& map = local_state->_parent->cast() ._shuffle_idx_to_instance_idx; new_block_wrapper->ref(cast_set(map.size())); for (const auto& it : map) { DCHECK(it.second >= 0 && it.second < _num_partitions) << it.first << " : " << it.second << " " << _num_partitions; - uint32_t start = local_state._partition_rows_histogram[it.first]; - uint32_t size = local_state._partition_rows_histogram[it.first + 1] - start; + uint32_t start = partition_rows_histogram[it.first]; + uint32_t size = partition_rows_histogram[it.first + 1] - start; if (size > 0) { _enqueue_data_and_set_ready(it.second, local_state, {new_block_wrapper, {row_idx, start, size}}); } else { - new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); + new_block_wrapper->unref(local_state->_shared_state, channel_id); } } } else { DCHECK(!bucket_seq_to_instance_idx.empty()); new_block_wrapper->ref(_num_partitions); for (int i = 0; i < _num_partitions; i++) { - uint32_t start = local_state._partition_rows_histogram[i]; - uint32_t size = local_state._partition_rows_histogram[i + 1] - start; + uint32_t start = partition_rows_histogram[i]; + uint32_t size = partition_rows_histogram[i + 1] - start; if (size > 0) { _enqueue_data_and_set_ready(bucket_seq_to_instance_idx[i], local_state, {new_block_wrapper, {row_idx, start, size}}); } else { - new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); + new_block_wrapper->unref(local_state->_shared_state, channel_id); } } } @@ -244,8 +289,53 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest return Status::OK(); } +Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, + vectorized::Block* block, int channel_id) { + const auto rows = cast_set(block->rows()); + auto row_idx = std::make_shared>(rows); + auto& partition_rows_histogram = _partition_rows_histogram[channel_id]; + { + 
partition_rows_histogram.assign(_num_partitions + 1, 0); + for (int32_t i = 0; i < rows; ++i) { + partition_rows_histogram[channel_ids[i]]++; + } + for (int32_t i = 1; i <= _num_partitions; ++i) { + partition_rows_histogram[i] += partition_rows_histogram[i - 1]; + } + for (int32_t i = rows - 1; i >= 0; --i) { + (*row_idx)[partition_rows_histogram[channel_ids[i]] - 1] = i; + partition_rows_histogram[channel_ids[i]]--; + } + } + + vectorized::Block data_block; + std::shared_ptr new_block_wrapper; + if (_free_blocks.try_dequeue(data_block)) { + new_block_wrapper = BlockWrapper::create_shared(std::move(data_block)); + } else { + new_block_wrapper = BlockWrapper::create_shared(block->clone_empty()); + } + + new_block_wrapper->data_block.swap(*block); + if (new_block_wrapper->data_block.empty()) { + return Status::OK(); + } + new_block_wrapper->ref(cast_set(_num_partitions)); + for (int i = 0; i < _num_partitions; i++) { + uint32_t start = partition_rows_histogram[i]; + uint32_t size = partition_rows_histogram[i + 1] - start; + if (size > 0) { + _enqueue_data_and_set_ready(i, {new_block_wrapper, {row_idx, start, size}}); + } else { + new_block_wrapper->unref(); + } + } + + return Status::OK(); +} + Status PassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { return Status::OK(); } @@ -256,41 +346,43 @@ Status PassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_blo } new_block.swap(*in_block); wrapper = BlockWrapper::create_shared(std::move(new_block)); - auto channel_id = (local_state._channel_id++) % _num_partitions; - _enqueue_data_and_set_ready(channel_id, local_state, std::move(wrapper)); + auto channel_id = ((*sink_info.channel_id)++) % _num_partitions; + _enqueue_data_and_set_ready(channel_id, sink_info.local_state, std::move(wrapper)); return Status::OK(); } -void 
PassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { +void PassthroughExchanger::close(SourceInfo&& source_info) { vectorized::Block next_block; BlockWrapperSPtr wrapper; bool eos; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, wrapper, &eos, &next_block, + source_info.channel_id)) { // do nothing } } -void PassToOneExchanger::close(LocalExchangeSourceLocalState& local_state) { +void PassToOneExchanger::close(SourceInfo&& source_info) { vectorized::Block next_block; BlockWrapperSPtr wrapper; bool eos; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, wrapper, &eos, &next_block, + source_info.channel_id)) { // do nothing } } Status PassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { + Profile&& profile, SourceInfo&& source_info) { BlockWrapperSPtr next_block; - _dequeue_data(local_state, next_block, eos, block); + _dequeue_data(source_info.local_state, next_block, eos, block, source_info.channel_id); return Status::OK(); } Status PassToOneExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { return Status::OK(); } @@ -301,70 +393,72 @@ Status PassToOneExchanger::sink(RuntimeState* state, vectorized::Block* in_block new_block.swap(*in_block); BlockWrapperSPtr wrapper = BlockWrapper::create_shared(std::move(new_block)); - _enqueue_data_and_set_ready(0, local_state, std::move(wrapper)); + _enqueue_data_and_set_ready(0, sink_info.local_state, std::move(wrapper)); return Status::OK(); } Status 
PassToOneExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { - if (local_state._channel_id != 0) { + Profile&& profile, SourceInfo&& source_info) { + if (source_info.channel_id != 0) { *eos = true; return Status::OK(); } BlockWrapperSPtr next_block; - _dequeue_data(local_state, next_block, eos, block); + _dequeue_data(source_info.local_state, next_block, eos, block, source_info.channel_id); return Status::OK(); } Status LocalMergeSortExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (!in_block->empty()) { vectorized::Block new_block; if (!_free_blocks.try_dequeue(new_block)) { new_block = {in_block->clone_empty()}; } - DCHECK_LE(local_state._channel_id, _data_queue.size()); + DCHECK_LE(*sink_info.channel_id, _data_queue.size()); new_block.swap(*in_block); - _enqueue_data_and_set_ready(local_state._channel_id, local_state, + _enqueue_data_and_set_ready(*sink_info.channel_id, sink_info.local_state, BlockWrapper::create_shared(std::move(new_block))); } - if (eos) { - local_state._shared_state->source_deps[local_state._channel_id]->set_always_ready(); + if (eos && sink_info.local_state) { + sink_info.local_state->_shared_state->source_deps[*sink_info.channel_id] + ->set_always_ready(); } return Status::OK(); } -void ExchangerBase::finalize(LocalExchangeSourceLocalState& local_state) { +void ExchangerBase::finalize() { DCHECK(_running_source_operators == 0); vectorized::Block block; while (_free_blocks.try_dequeue(block)) { // do nothing } } -void LocalMergeSortExchanger::finalize(LocalExchangeSourceLocalState& local_state) { + +void LocalMergeSortExchanger::finalize() { BlockWrapperSPtr next_block; vectorized::Block block; bool eos; int id = 0; for (auto& data_queue : _data_queue) { data_queue.set_eos(); - while (_dequeue_data(local_state, next_block, &eos, &block, id)) { + 
while (_dequeue_data(next_block, &eos, &block, id)) { block = vectorized::Block(); } id++; } - ExchangerBase::finalize(local_state); + ExchangerBase::finalize(); } Status LocalMergeSortExchanger::build_merger(RuntimeState* state, - LocalExchangeSourceLocalState& local_state) { - RETURN_IF_ERROR(_sort_source->build_merger(state, _merger, local_state.profile())); + LocalExchangeSourceLocalState* local_state) { + RETURN_IF_ERROR(_sort_source->build_merger(state, _merger, local_state->profile())); std::vector child_block_suppliers; for (int channel_id = 0; channel_id < _num_partitions; channel_id++) { - vectorized::BlockSupplier block_supplier = [&, id = channel_id](vectorized::Block* block, - bool* eos) { + vectorized::BlockSupplier block_supplier = [&, local_state, id = channel_id]( + vectorized::Block* block, bool* eos) { BlockWrapperSPtr next_block; _dequeue_data(local_state, next_block, eos, block, id); return Status::OK(); @@ -388,20 +482,21 @@ now sort(8) --> local merge(1) ---> datasink(1) [2] ----> */ Status LocalMergeSortExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { - if (local_state._channel_id != 0) { + Profile&& profile, SourceInfo&& source_info) { + if (source_info.channel_id != 0) { *eos = true; return Status::OK(); } if (!_merger) { - RETURN_IF_ERROR(build_merger(state, local_state)); + DCHECK(source_info.local_state); + RETURN_IF_ERROR(build_merger(state, source_info.local_state)); } RETURN_IF_ERROR(_merger->get_next(block, eos)); return Status::OK(); } Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { return Status::OK(); } @@ -411,32 +506,40 @@ Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block } new_block.swap(*in_block); auto wrapper = 
BlockWrapper::create_shared(std::move(new_block)); - local_state._shared_state->add_total_mem_usage(wrapper->data_block.allocated_bytes(), - local_state._channel_id); + if (sink_info.local_state) { + sink_info.local_state->_shared_state->add_total_mem_usage( + wrapper->data_block.allocated_bytes(), *sink_info.channel_id); + } + wrapper->ref(_num_partitions); for (int i = 0; i < _num_partitions; i++) { - _enqueue_data_and_set_ready(i, local_state, {wrapper, {0, wrapper->data_block.rows()}}); + _enqueue_data_and_set_ready(i, sink_info.local_state, + {wrapper, {0, wrapper->data_block.rows()}}); } return Status::OK(); } -void BroadcastExchanger::close(LocalExchangeSourceLocalState& local_state) { +void BroadcastExchanger::close(SourceInfo&& source_info) { BroadcastBlock partitioned_block; bool eos; vectorized::Block block; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, partitioned_block, &eos, &block)) { - partitioned_block.first->unref(local_state._shared_state, local_state._channel_id); + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, partitioned_block, &eos, &block, + source_info.channel_id)) { + partitioned_block.first->unref( + source_info.local_state ? 
source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } } Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { + Profile&& profile, SourceInfo&& source_info) { BroadcastBlock partitioned_block; - if (_dequeue_data(local_state, partitioned_block, eos, block)) { - SCOPED_TIMER(local_state._copy_data_timer); + if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, + source_info.channel_id)) { + SCOPED_TIMER(profile.copy_data_timer); vectorized::MutableBlock mutable_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->data_block); @@ -444,7 +547,9 @@ Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* blo RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->data_block, partitioned_block.second.offset_start, partitioned_block.second.length)); - block_wrapper->unref(local_state._shared_state, local_state._channel_id); + block_wrapper->unref( + source_info.local_state ? 
source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } return Status::OK(); @@ -452,21 +557,21 @@ Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* blo Status AdaptivePassthroughExchanger::_passthrough_sink(RuntimeState* state, vectorized::Block* in_block, - LocalExchangeSinkLocalState& local_state) { + SinkInfo&& sink_info) { vectorized::Block new_block; if (!_free_blocks.try_dequeue(new_block)) { new_block = {in_block->clone_empty()}; } new_block.swap(*in_block); - auto channel_id = (local_state._channel_id++) % _num_partitions; - _enqueue_data_and_set_ready(channel_id, local_state, + auto channel_id = ((*sink_info.channel_id)++) % _num_partitions; + _enqueue_data_and_set_ready(channel_id, sink_info.local_state, BlockWrapper::create_shared(std::move(new_block))); return Status::OK(); } Status AdaptivePassthroughExchanger::_shuffle_sink(RuntimeState* state, vectorized::Block* block, - LocalExchangeSinkLocalState& local_state) { + SinkInfo&& sink_info) { std::vector channel_ids; const auto num_rows = block->rows(); channel_ids.resize(num_rows, 0); @@ -481,40 +586,39 @@ Status AdaptivePassthroughExchanger::_shuffle_sink(RuntimeState* state, vectoriz std::iota(channel_ids.begin() + i, channel_ids.end(), 0); } } - return _split_rows(state, channel_ids.data(), block, local_state); + return _split_rows(state, channel_ids.data(), block, std::move(sink_info)); } Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, - LocalExchangeSinkLocalState& local_state) { + vectorized::Block* block, SinkInfo&& sink_info) { const auto rows = cast_set(block->rows()); auto row_idx = std::make_shared>(rows); + auto& partition_rows_histogram = _partition_rows_histogram[*sink_info.channel_id]; { - local_state._partition_rows_histogram.assign(_num_partitions + 1, 0); + partition_rows_histogram.assign(_num_partitions + 1, 0); for (int32_t i = 0; i < rows; 
++i) { - local_state._partition_rows_histogram[channel_ids[i]]++; + partition_rows_histogram[channel_ids[i]]++; } for (int32_t i = 1; i <= _num_partitions; ++i) { - local_state._partition_rows_histogram[i] += - local_state._partition_rows_histogram[i - 1]; + partition_rows_histogram[i] += partition_rows_histogram[i - 1]; } for (int32_t i = rows - 1; i >= 0; --i) { - (*row_idx)[local_state._partition_rows_histogram[channel_ids[i]] - 1] = i; - local_state._partition_rows_histogram[channel_ids[i]]--; + (*row_idx)[partition_rows_histogram[channel_ids[i]] - 1] = i; + partition_rows_histogram[channel_ids[i]]--; } } for (int32_t i = 0; i < _num_partitions; i++) { - const size_t start = local_state._partition_rows_histogram[i]; - const size_t size = local_state._partition_rows_histogram[i + 1] - start; + const size_t start = partition_rows_histogram[i]; + const size_t size = partition_rows_histogram[i + 1] - start; if (size > 0) { std::unique_ptr mutable_block = vectorized::MutableBlock::create_unique(block->clone_empty()); RETURN_IF_ERROR(mutable_block->add_rows(block, start, size)); auto new_block = mutable_block->to_block(); - _enqueue_data_and_set_ready(i, local_state, + _enqueue_data_and_set_ready(i, sink_info.local_state, BlockWrapper::create_shared(std::move(new_block))); } } @@ -522,34 +626,35 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, } Status AdaptivePassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_block, - bool eos, LocalExchangeSinkLocalState& local_state) { + bool eos, Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { return Status::OK(); } if (_is_pass_through) { - return _passthrough_sink(state, in_block, local_state); + return _passthrough_sink(state, in_block, std::move(sink_info)); } else { if (_total_block++ > _num_partitions) { _is_pass_through = true; } - return _shuffle_sink(state, in_block, local_state); + return _shuffle_sink(state, in_block, std::move(sink_info)); } } Status 
AdaptivePassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, - bool* eos, - LocalExchangeSourceLocalState& local_state) { + bool* eos, Profile&& profile, + SourceInfo&& source_info) { BlockWrapperSPtr next_block; - _dequeue_data(local_state, next_block, eos, block); + _dequeue_data(source_info.local_state, next_block, eos, block, source_info.channel_id); return Status::OK(); } -void AdaptivePassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { +void AdaptivePassthroughExchanger::close(SourceInfo&& source_info) { vectorized::Block next_block; bool eos; BlockWrapperSPtr wrapper; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, wrapper, &eos, &next_block, + source_info.channel_id)) { // do nothing } } diff --git a/be/src/pipeline/local_exchange/local_exchanger.h b/be/src/pipeline/local_exchange/local_exchanger.h index 4d699baa52fb8b..2ab1c8627228a4 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.h +++ b/be/src/pipeline/local_exchange/local_exchanger.h @@ -20,14 +20,33 @@ #include "pipeline/dependency.h" #include "pipeline/exec/operator.h" -namespace doris::pipeline { +namespace doris { #include "common/compile_check_begin.h" - +namespace vectorized { +class PartitionerBase; +} +namespace pipeline { class LocalExchangeSourceLocalState; class LocalExchangeSinkLocalState; struct BlockWrapper; class SortSourceOperatorX; +struct Profile { + RuntimeProfile::Counter* compute_hash_value_timer = nullptr; + RuntimeProfile::Counter* distribute_timer = nullptr; + RuntimeProfile::Counter* copy_data_timer = nullptr; +}; + +struct SinkInfo { + int* channel_id; + vectorized::PartitionerBase* partitioner; + LocalExchangeSinkLocalState* local_state; +}; + +struct SourceInfo { + int channel_id; + LocalExchangeSourceLocalState* local_state; +}; /** * One exchanger is 
hold by one `LocalExchangeSharedState`. And one `LocalExchangeSharedState` is * shared by all local exchange sink operators and source operators with the same id. @@ -60,15 +79,15 @@ class ExchangerBase { _free_block_limit(free_block_limit) {} virtual ~ExchangerBase() = default; virtual Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) = 0; + Profile&& profile, SourceInfo&& source_info) = 0; virtual Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) = 0; + Profile&& profile, SinkInfo&& sink_info) = 0; virtual ExchangeType get_type() const = 0; // Called if a local exchanger source operator are closed. Free the unused data block in data_queue. - virtual void close(LocalExchangeSourceLocalState& local_state) = 0; + virtual void close(SourceInfo&& source_info) = 0; // Called if all local exchanger source operators are closed. We free the memory in // `_free_blocks` here. 
- virtual void finalize(LocalExchangeSourceLocalState& local_state); + virtual void finalize(); virtual std::string data_queue_debug_string(int i) = 0; @@ -105,12 +124,13 @@ template struct BlockQueue { std::atomic eos = false; moodycamel::ConcurrentQueue data_queue; + moodycamel::ProducerToken ptok {data_queue}; BlockQueue() : eos(false), data_queue(moodycamel::ConcurrentQueue()) {} BlockQueue(BlockQueue&& other) : eos(other.eos.load()), data_queue(std::move(other.data_queue)) {} inline bool enqueue(BlockType const& item) { if (!eos) { - if (!data_queue.enqueue(item)) [[unlikely]] { + if (!data_queue.enqueue(ptok, item)) [[unlikely]] { throw Exception(ErrorCode::INTERNAL_ERROR, "Exception occurs in data queue [size = {}] of local exchange.", data_queue.size_approx()); @@ -122,7 +142,7 @@ struct BlockQueue { inline bool enqueue(BlockType&& item) { if (!eos) { - if (!data_queue.enqueue(std::move(item))) [[unlikely]] { + if (!data_queue.enqueue(ptok, std::move(item))) [[unlikely]] { throw Exception(ErrorCode::INTERNAL_ERROR, "Exception occurs in data queue [size = {}] of local exchange.", data_queue.size_approx()); @@ -155,12 +175,13 @@ class Exchanger : public ExchangerBase { protected: // Enqueue data block and set downstream source operator to read. 
- void _enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState& local_state, + void _enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState* local_state, BlockType&& block); - bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos, - vectorized::Block* data_block); - bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos, + bool _dequeue_data(LocalExchangeSourceLocalState* local_state, BlockType& block, bool* eos, vectorized::Block* data_block, int channel_id); + + void _enqueue_data_and_set_ready(int channel_id, BlockType&& block); + bool _dequeue_data(BlockType& block, bool* eos, vectorized::Block* data_block, int channel_id); std::vector> _data_queue; private: @@ -186,7 +207,7 @@ struct BlockWrapper { ~BlockWrapper() { DCHECK_EQ(ref_count.load(), 0); } void ref(int delta) { ref_count += delta; } void unref(LocalExchangeSharedState* shared_state, size_t allocated_bytes, int channel_id) { - if (ref_count.fetch_sub(1) == 1) { + if (ref_count.fetch_sub(1) == 1 && shared_state != nullptr) { DCHECK_GT(allocated_bytes, 0); shared_state->sub_total_mem_usage(allocated_bytes, channel_id); if (shared_state->exchanger->_free_block_limit == 0 || @@ -201,7 +222,7 @@ struct BlockWrapper { } } - void unref(LocalExchangeSharedState* shared_state, int channel_id) { + void unref(LocalExchangeSharedState* shared_state = nullptr, int channel_id = 0) { unref(shared_state, data_block.allocated_bytes(), channel_id); } int ref_value() const { return ref_count.load(); } @@ -219,19 +240,24 @@ class ShuffleExchanger : public Exchanger { DCHECK_GT(num_partitions, 0); DCHECK_GT(num_sources, 0); _data_queue.resize(num_sources); + _partition_rows_histogram.resize(running_sink_operators); } ~ShuffleExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, 
vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; - void close(LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; + void close(SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::HASH_SHUFFLE; } protected: Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, LocalExchangeSinkLocalState& local_state); + vectorized::Block* block, int channel_id, + LocalExchangeSinkLocalState* local_state); + Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, + vectorized::Block* block, int channel_id); + std::vector> _partition_rows_histogram; }; class BucketShuffleExchanger final : public ShuffleExchanger { @@ -255,13 +281,13 @@ class PassthroughExchanger final : public Exchanger { _data_queue.resize(num_partitions); } ~PassthroughExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::PASSTHROUGH; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override; }; class PassToOneExchanger final : public Exchanger { @@ -273,13 +299,13 @@ class 
PassToOneExchanger final : public Exchanger { _data_queue.resize(num_partitions); } ~PassToOneExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::PASS_TO_ONE; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override; }; class LocalMergeSortExchanger final : public Exchanger { @@ -292,17 +318,17 @@ class LocalMergeSortExchanger final : public Exchanger { _data_queue.resize(num_partitions); } ~LocalMergeSortExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::LOCAL_MERGE_SORT; } - Status build_merger(RuntimeState* statem, LocalExchangeSourceLocalState& local_state); + Status build_merger(RuntimeState* statem, LocalExchangeSourceLocalState* local_state); - void close(LocalExchangeSourceLocalState& local_state) override {} - void finalize(LocalExchangeSourceLocalState& local_state) override; + void 
close(SourceInfo&& source_info) override {} + void finalize() override; private: std::unique_ptr _merger; @@ -318,13 +344,13 @@ class BroadcastExchanger final : public Exchanger { _data_queue.resize(num_partitions); } ~BroadcastExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::BROADCAST; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override; }; //The code in AdaptivePassthroughExchanger is essentially @@ -337,26 +363,28 @@ class AdaptivePassthroughExchanger : public Exchanger { : Exchanger(running_sink_operators, num_partitions, free_block_limit) { _data_queue.resize(num_partitions); + _partition_rows_histogram.resize(running_sink_operators); } - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::ADAPTIVE_PASSTHROUGH; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) 
override; private: Status _passthrough_sink(RuntimeState* state, vectorized::Block* in_block, - LocalExchangeSinkLocalState& local_state); - Status _shuffle_sink(RuntimeState* state, vectorized::Block* in_block, - LocalExchangeSinkLocalState& local_state); + SinkInfo&& sink_info); + Status _shuffle_sink(RuntimeState* state, vectorized::Block* in_block, SinkInfo&& sink_info); Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, LocalExchangeSinkLocalState& local_state); + vectorized::Block* block, SinkInfo&& sink_info); std::atomic_bool _is_pass_through = false; std::atomic_int32_t _total_block = 0; + std::vector> _partition_rows_histogram; }; #include "common/compile_check_end.h" -} // namespace doris::pipeline +} // namespace pipeline +} // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/pipeline.cpp b/be/src/pipeline/pipeline.cpp index e4678b7dcf3a83..6c39d361e59c77 100644 --- a/be/src/pipeline/pipeline.cpp +++ b/be/src/pipeline/pipeline.cpp @@ -112,7 +112,7 @@ void Pipeline::make_all_runnable() { if (_sink->count_down_destination()) { for (auto* task : _tasks) { if (task) { - task->set_wake_up_by_downstream(); + task->set_wake_up_early(); } } for (auto* task : _tasks) { diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index 8ab0f1d151568d..5ae89db55a45ac 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -35,6 +35,7 @@ #include "cloud/config.h" #include "common/cast_set.h" #include "common/config.h" +#include "common/exception.h" #include "common/logging.h" #include "common/status.h" #include "io/fs/stream_load_pipe.h" @@ -498,8 +499,8 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag if (pipeline_id_to_task.contains(_pipelines[pip_idx]->id())) { auto* task = pipeline_id_to_task[_pipelines[pip_idx]->id()]; 
DCHECK(pipeline_id_to_profile[pip_idx]); - RETURN_IF_ERROR(task->prepare(local_params, request.fragment.output_sink, - _query_ctx.get())); + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(task->prepare( + local_params, request.fragment.output_sink, _query_ctx.get())); } } { diff --git a/be/src/pipeline/pipeline_task.cpp b/be/src/pipeline/pipeline_task.cpp index 6814881ac7a300..5ed725010ec364 100644 --- a/be/src/pipeline/pipeline_task.cpp +++ b/be/src/pipeline/pipeline_task.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -223,9 +224,6 @@ bool PipelineTask::_wait_to_start() { _blocked_dep = _execution_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { static_cast(_blocked_dep)->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } @@ -233,9 +231,6 @@ bool PipelineTask::_wait_to_start() { _blocked_dep = op_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } } @@ -257,9 +252,6 @@ bool PipelineTask::_is_blocked() { _blocked_dep = dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } } @@ -279,9 +271,6 @@ bool PipelineTask::_is_blocked() { _blocked_dep = op_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } } @@ -289,15 +278,15 @@ bool PipelineTask::_is_blocked() { } Status PipelineTask::execute(bool* eos) { - SCOPED_TIMER(_task_profile->total_time_counter()); - SCOPED_TIMER(_exec_timer); - SCOPED_ATTACH_TASK(_state); - _eos = _sink->is_finished(_state) || _eos || _wake_up_by_downstream; - *eos = _eos; if (_eos) { - // If task is waken up by finish dependency, `_eos` is set to true by last execution, and we should return here. 
+ *eos = true; return Status::OK(); } + + SCOPED_TIMER(_task_profile->total_time_counter()); + SCOPED_TIMER(_exec_timer); + SCOPED_ATTACH_TASK(_state); + int64_t time_spent = 0; DBUG_EXECUTE_IF("fault_inject::PipelineXTask::execute", { Status status = Status::Error("fault_inject pipeline_task execute failed"); @@ -320,27 +309,31 @@ Status PipelineTask::execute(bool* eos) { if (_wait_to_start()) { return Status::OK(); } - if (_wake_up_by_downstream) { - _eos = true; - *eos = true; - return Status::OK(); - } + // The status must be runnable if (!_opened && !_fragment_context->is_canceled()) { + if (_wake_up_early) { + *eos = true; + _eos = true; + return Status::OK(); + } RETURN_IF_ERROR(_open()); } + auto set_wake_up_and_dep_ready = [&]() { + if (wake_up_early()) { + return; + } + set_wake_up_early(); + clear_blocking_state(); + }; + _task_profile->add_info_string("TaskState", "Runnable"); _task_profile->add_info_string("BlockedByDependency", ""); while (!_fragment_context->is_canceled()) { if (_is_blocked()) { return Status::OK(); } - if (_wake_up_by_downstream) { - _eos = true; - *eos = true; - return Status::OK(); - } /// When a task is cancelled, /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready). 
@@ -361,47 +354,47 @@ Status PipelineTask::execute(bool* eos) { RETURN_IF_ERROR(_sink->revoke_memory(_state)); continue; } - *eos = _eos; DBUG_EXECUTE_IF("fault_inject::PipelineXTask::executing", { Status status = Status::Error("fault_inject pipeline_task executing failed"); return status; }); - // `_dry_run` means sink operator need no more data // `_sink->is_finished(_state)` means sink operator should be finished - if (_dry_run || _sink->is_finished(_state)) { - *eos = true; - _eos = true; - } else { + if (_sink->is_finished(_state)) { + set_wake_up_and_dep_ready(); + } + + // `_dry_run` means sink operator need no more data + *eos = wake_up_early() || _dry_run; + if (!*eos) { SCOPED_TIMER(_get_block_timer); _get_block_counter->update(1); RETURN_IF_ERROR(_root->get_block_after_projects(_state, block, eos)); } + if (*eos) { + RETURN_IF_ERROR(close(Status::OK(), false)); + } + if (_block->rows() != 0 || *eos) { SCOPED_TIMER(_sink_timer); - Status status = Status::OK(); - // Define a lambda function to catch sink exception, because sink will check - // return error status with EOF, it is special, could not return directly. - auto sink_function = [&]() -> Status { - Status internal_st; - internal_st = _sink->sink(_state, block, *eos); - return internal_st; - }; - status = sink_function(); - if (!status.is()) { - RETURN_IF_ERROR(status); + Status status = _sink->sink(_state, block, *eos); + + if (status.is()) { + set_wake_up_and_dep_ready(); + } else if (!status) { + return status; } - *eos = status.is() ? 
true : *eos; + if (*eos) { // just return, the scheduler will do finish work - _eos = true; _task_profile->add_info_string("TaskState", "Finished"); + _eos = true; return Status::OK(); } } } - static_cast(get_task_queue()->push_back(this)); + RETURN_IF_ERROR(get_task_queue()->push_back(this)); return Status::OK(); } @@ -470,17 +463,14 @@ void PipelineTask::finalize() { _le_state_map.clear(); } -Status PipelineTask::close(Status exec_status) { +Status PipelineTask::close(Status exec_status, bool close_sink) { int64_t close_ns = 0; - Defer defer {[&]() { - if (_task_queue) { - _task_queue->update_statistics(this, close_ns); - } - }}; Status s; { SCOPED_RAW_TIMER(&close_ns); - s = _sink->close(_state, exec_status); + if (close_sink) { + s = _sink->close(_state, exec_status); + } for (auto& op : _operators) { auto tem = op->close(_state); if (!tem.ok() && s.ok()) { @@ -489,10 +479,18 @@ Status PipelineTask::close(Status exec_status) { } } if (_opened) { - _fresh_profile_counter(); - COUNTER_SET(_close_timer, close_ns); + COUNTER_UPDATE(_close_timer, close_ns); COUNTER_UPDATE(_task_profile->total_time_counter(), close_ns); } + + if (close_sink && _opened) { + _task_profile->add_info_string("WakeUpEarly", wake_up_early() ? "true" : "false"); + _fresh_profile_counter(); + } + + if (_task_queue) { + _task_queue->update_statistics(this, close_ns); + } return s; } @@ -508,10 +506,10 @@ std::string PipelineTask::debug_string() { auto elapsed = _fragment_context->elapsed_time() / 1000000000.0; fmt::format_to(debug_string_buffer, "PipelineTask[this = {}, id = {}, open = {}, eos = {}, finish = {}, dry run = " - "{}, elapse time = {}s, _wake_up_by_downstream = {}], block dependency = {}, is " + "{}, elapse time = {}s, _wake_up_early = {}], block dependency = {}, is " "running = {}\noperators: ", (void*)this, _index, _opened, _eos, _finalized, _dry_run, elapsed, - _wake_up_by_downstream.load(), + _wake_up_early.load(), cur_blocked_dep && !_finalized ? 
cur_blocked_dep->debug_string() : "NULL", is_running()); for (size_t i = 0; i < _operators.size(); i++) { diff --git a/be/src/pipeline/pipeline_task.h b/be/src/pipeline/pipeline_task.h index 4bb062122c0c08..1a31e5954f479c 100644 --- a/be/src/pipeline/pipeline_task.h +++ b/be/src/pipeline/pipeline_task.h @@ -61,7 +61,7 @@ class PipelineTask { // if the pipeline create a bunch of pipeline task // must be call after all pipeline task is finish to release resource - Status close(Status exec_status); + Status close(Status exec_status, bool close_sink = true); PipelineFragmentContext* fragment_context() { return _fragment_context; } @@ -135,7 +135,7 @@ class PipelineTask { int task_id() const { return _index; }; bool is_finalized() const { return _finalized; } - void set_wake_up_by_downstream() { _wake_up_by_downstream = true; } + void set_wake_up_early() { _wake_up_early = true; } void clear_blocking_state() { _state->get_query_ctx()->get_execution_dependency()->set_always_ready(); @@ -237,7 +237,7 @@ class PipelineTask { PipelineId pipeline_id() const { return _pipeline->id(); } - bool wake_up_by_downstream() const { return _wake_up_by_downstream; } + bool wake_up_early() const { return _wake_up_early; } private: friend class RuntimeFilterDependency; @@ -319,7 +319,7 @@ class PipelineTask { std::atomic _running = false; std::atomic _eos = false; - std::atomic _wake_up_by_downstream = false; + std::atomic _wake_up_early = false; }; } // namespace doris::pipeline diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index a371cdb947ff56..2d7554e702969f 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -421,9 +421,9 @@ void ExecEnv::init_file_cache_factory(std::vector& cache_paths std::unordered_set cache_path_set; Status rest = doris::parse_conf_cache_paths(doris::config::file_cache_path, cache_paths); if (!rest) { - LOG(FATAL) << "parse config file cache path failed, path=" << 
doris::config::file_cache_path - << ", reason=" << rest.msg(); - exit(-1); + throw Exception( + Status::FatalError("parse config file cache path failed, path={}, reason={}", + doris::config::file_cache_path, rest.msg())); } doris::Status cache_status; @@ -437,8 +437,8 @@ void ExecEnv::init_file_cache_factory(std::vector& cache_paths cache_path.path, cache_path.init_settings()); if (!cache_status.ok()) { if (!doris::config::ignore_broken_disk) { - LOG(FATAL) << "failed to init file cache, err: " << cache_status; - exit(-1); + throw Exception( + Status::FatalError("failed to init file cache, err: {}", cache_status)); } LOG(WARNING) << "failed to init file cache, err: " << cache_status; } diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index ce18071fda0d07..19e8f76366c084 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -390,12 +390,20 @@ void FragmentMgr::coordinator_callback(const ReportStatusRequest& req) { params.load_counters.emplace(s_unselected_rows, std::to_string(num_rows_load_unselected)); if (!req.runtime_state->get_error_log_file_path().empty()) { - params.__set_tracking_url( - to_load_error_http_path(req.runtime_state->get_error_log_file_path())); + std::string error_log_url = + to_load_error_http_path(req.runtime_state->get_error_log_file_path()); + LOG(INFO) << "error log file path: " << error_log_url + << ", query id: " << print_id(req.query_id) + << ", fragment instance id: " << print_id(req.fragment_instance_id); + params.__set_tracking_url(error_log_url); } else if (!req.runtime_states.empty()) { for (auto* rs : req.runtime_states) { if (!rs->get_error_log_file_path().empty()) { - params.__set_tracking_url(to_load_error_http_path(rs->get_error_log_file_path())); + std::string error_log_url = to_load_error_http_path(rs->get_error_log_file_path()); + LOG(INFO) << "error log file path: " << error_log_url + << ", query id: " << print_id(req.query_id) + << ", fragment instance id: " << 
print_id(rs->fragment_instance_id()); + params.__set_tracking_url(error_log_url); } if (rs->wal_id() > 0) { params.__set_txn_id(rs->wal_id()); diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h index 65f4927759c304..5f530db1ac8117 100644 --- a/be/src/runtime/jsonb_value.h +++ b/be/src/runtime/jsonb_value.h @@ -61,58 +61,47 @@ struct JsonBinaryValue { } bool operator==(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } // != bool ne(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } // <= bool le(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } // >= bool ge(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } // < bool lt(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } // > bool gt(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } bool operator!=(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw 
Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } bool operator<=(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } bool operator>=(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } bool operator<(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } bool operator>(const JsonBinaryValue& other) const { - LOG(FATAL) << "comparing between JsonBinaryValue is not supported"; - __builtin_unreachable(); + throw Exception(Status::FatalError("comparing between JsonBinaryValue is not supported")); } Status from_json_string(const char* s, size_t len); diff --git a/be/src/runtime/memory/cache_manager.h b/be/src/runtime/memory/cache_manager.h index a2a089b929dbdf..1e89e957ba1ce6 100644 --- a/be/src/runtime/memory/cache_manager.h +++ b/be/src/runtime/memory/cache_manager.h @@ -40,7 +40,8 @@ class CacheManager { #ifdef BE_TEST _caches.erase(it); #else - LOG(FATAL) << "Repeat register cache " << CachePolicy::type_string(cache->type()); + throw Exception(Status::FatalError("Repeat register cache {}", + CachePolicy::type_string(cache->type()))); #endif // BE_TEST } _caches.insert({cache->type(), cache}); diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index 8f077a4eb45bb1..72e61fed2e0013 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -99,10 +99,10 @@ class CachePolicy { case CacheType::TABLET_COLUMN_OBJECT_POOL: return 
"TabletColumnObjectPool"; default: - LOG(FATAL) << "not match type of cache policy :" << static_cast(type); + throw Exception(Status::FatalError("not match type of cache policy :{}", + static_cast(type))); } - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); + throw Exception(Status::FatalError("__builtin_unreachable")); } inline static std::unordered_map StringToType = { diff --git a/be/src/runtime/memory/global_memory_arbitrator.h b/be/src/runtime/memory/global_memory_arbitrator.h index 075113088fbc5b..a7a85725ab10c9 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.h +++ b/be/src/runtime/memory/global_memory_arbitrator.h @@ -76,7 +76,7 @@ class GlobalMemoryArbitrator { static inline int64_t sys_mem_available() { return MemInfo::_s_sys_mem_available.load(std::memory_order_relaxed) - refresh_interval_memory_growth.load(std::memory_order_relaxed) - - process_reserved_memory(); + process_reserved_memory() + static_cast(MemInfo::allocator_cache_mem()); } static inline std::string sys_mem_available_str() { @@ -91,12 +91,14 @@ class GlobalMemoryArbitrator { static inline std::string sys_mem_available_details_str() { auto msg = fmt::format( "sys available memory {}(= {}[proc/available] - {}[reserved] - " - "{}B[waiting_refresh])", + "{}B[waiting_refresh] + {}[tc/jemalloc_cache])", PrettyPrinter::print(sys_mem_available(), TUnit::BYTES), PrettyPrinter::print(MemInfo::_s_sys_mem_available.load(std::memory_order_relaxed), TUnit::BYTES), PrettyPrinter::print(process_reserved_memory(), TUnit::BYTES), - refresh_interval_memory_growth); + refresh_interval_memory_growth, + PrettyPrinter::print(static_cast(MemInfo::allocator_cache_mem()), + TUnit::BYTES)); #ifdef ADDRESS_SANITIZER msg = "[ASAN]" + msg; #endif diff --git a/be/src/runtime/memory/lru_cache_policy.h b/be/src/runtime/memory/lru_cache_policy.h index 3fdb43facd7715..7e73f2dd76b566 100644 --- a/be/src/runtime/memory/lru_cache_policy.h +++ b/be/src/runtime/memory/lru_cache_policy.h @@ -36,13 
+36,13 @@ class LRUCachePolicy : public CachePolicy { LRUCachePolicy(CacheType type, size_t capacity, LRUCacheType lru_cache_type, uint32_t stale_sweep_time_s, uint32_t num_shards = DEFAULT_LRU_CACHE_NUM_SHARDS, uint32_t element_count_capacity = DEFAULT_LRU_CACHE_ELEMENT_COUNT_CAPACITY, - bool enable_prune = true) + bool enable_prune = true, bool is_lru_k = DEFAULT_LRU_CACHE_IS_LRU_K) : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, - element_count_capacity)); + element_count_capacity, is_lru_k)); } else { CHECK(ExecEnv::GetInstance()->get_dummy_lru_cache()); _cache = ExecEnv::GetInstance()->get_dummy_lru_cache(); @@ -54,14 +54,15 @@ class LRUCachePolicy : public CachePolicy { uint32_t stale_sweep_time_s, uint32_t num_shards, uint32_t element_count_capacity, CacheValueTimeExtractor cache_value_time_extractor, - bool cache_value_check_timestamp, bool enable_prune = true) + bool cache_value_check_timestamp, bool enable_prune = true, + bool is_lru_k = DEFAULT_LRU_CACHE_IS_LRU_K) : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, cache_value_time_extractor, cache_value_check_timestamp, - element_count_capacity)); + element_count_capacity, is_lru_k)); } else { CHECK(ExecEnv::GetInstance()->get_dummy_lru_cache()); _cache = ExecEnv::GetInstance()->get_dummy_lru_cache(); @@ -90,7 +91,8 @@ class LRUCachePolicy : public CachePolicy { case LRUCacheType::NUMBER: return "number"; default: - LOG(FATAL) << "not match type of lru cache:" << static_cast(type); + throw Exception( + Status::FatalError("not match type of lru cache:{}", static_cast(type))); } } diff --git 
a/be/src/runtime/memory/thread_mem_tracker_mgr.h b/be/src/runtime/memory/thread_mem_tracker_mgr.h index db3b32a6298820..9dbf4399492d02 100644 --- a/be/src/runtime/memory/thread_mem_tracker_mgr.h +++ b/be/src/runtime/memory/thread_mem_tracker_mgr.h @@ -246,13 +246,13 @@ inline void ThreadMemTrackerMgr::consume(int64_t size, int skip_large_memory_che } if (doris::config::crash_in_alloc_large_memory_bytes > 0 && size > doris::config::crash_in_alloc_large_memory_bytes) { - LOG(FATAL) << fmt::format( + throw Exception(Status::FatalError( "alloc large memory: {}, {}, crash generate core dumpsto help analyze, " "stacktrace:\n{}", size, is_attach_query() ? "in query or load: " + print_id(_query_id) : "not in query or load", - get_stack_trace()); + get_stack_trace())); } } } diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index c16db7c67d3420..b4a38173d72222 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -219,35 +219,14 @@ Status RuntimeFilterMgr::get_merge_addr(TNetworkAddress* addr) { Status RuntimeFilterMergeControllerEntity::_init_with_desc( const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options, - const std::vector* target_info, - const int producer_size) { - std::unique_lock guard(_filter_map_mutex); - std::shared_ptr cnt_val = std::make_shared(); - // runtime_filter_desc and target will be released, - // so we need to copy to cnt_val - cnt_val->producer_size = producer_size; - cnt_val->runtime_filter_desc = *runtime_filter_desc; - cnt_val->pool.reset(new ObjectPool()); - cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); - - auto filter_id = runtime_filter_desc->filter_id; - RETURN_IF_ERROR( - cnt_val->filter->init_with_desc(&cnt_val->runtime_filter_desc, query_options, -1)); - cnt_val->filter->set_ignored(); - _filter_map.emplace(filter_id, cnt_val); - return Status::OK(); -} - -Status 
RuntimeFilterMergeControllerEntity::_init_with_desc( - const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options, - const std::vector* targetv2_info, + const std::vector&& targetv2_info, const int producer_size) { std::shared_ptr cnt_val = std::make_shared(); // runtime_filter_desc and target will be released, // so we need to copy to cnt_val cnt_val->producer_size = producer_size; cnt_val->runtime_filter_desc = *runtime_filter_desc; - cnt_val->targetv2_info = *targetv2_info; + cnt_val->targetv2_info = targetv2_info; cnt_val->pool.reset(new ObjectPool()); cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); auto filter_id = runtime_filter_desc->filter_id; @@ -268,36 +247,21 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, if (runtime_filter_params.__isset.rid_to_runtime_filter) { for (const auto& filterid_to_desc : runtime_filter_params.rid_to_runtime_filter) { int filter_id = filterid_to_desc.first; - const auto& target_iter = runtime_filter_params.rid_to_target_param.find(filter_id); - if (target_iter == runtime_filter_params.rid_to_target_param.end() && - !runtime_filter_params.__isset.rid_to_target_paramv2) { - // This runtime filter has to target info - return Status::InternalError("runtime filter params meet error"); - } else if (target_iter == runtime_filter_params.rid_to_target_param.end()) { - const auto& targetv2_iter = - runtime_filter_params.rid_to_target_paramv2.find(filter_id); - if (targetv2_iter == runtime_filter_params.rid_to_target_paramv2.end()) { - // This runtime filter has to target info - return Status::InternalError("runtime filter params meet error"); - } - const auto& build_iter = - runtime_filter_params.runtime_filter_builder_num.find(filter_id); - if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) { - // This runtime filter has to builder info - return Status::InternalError("runtime filter params meet error"); - } - - 
RETURN_IF_ERROR(_init_with_desc(&filterid_to_desc.second, &query_options, - &targetv2_iter->second, build_iter->second)); - } else { - const auto& build_iter = - runtime_filter_params.runtime_filter_builder_num.find(filter_id); - if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) { - return Status::InternalError("runtime filter params meet error"); - } - RETURN_IF_ERROR(_init_with_desc(&filterid_to_desc.second, &query_options, - &target_iter->second, build_iter->second)); + const auto& targetv2_iter = runtime_filter_params.rid_to_target_paramv2.find(filter_id); + const auto& build_iter = + runtime_filter_params.runtime_filter_builder_num.find(filter_id); + if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) { + // This runtime filter has no builder info + return Status::InternalError( + "Runtime filter has a wrong parameter. Maybe FE version is mismatched."); } + + RETURN_IF_ERROR(_init_with_desc( + &filterid_to_desc.second, &query_options, + targetv2_iter == runtime_filter_params.rid_to_target_paramv2.end() + ? 
std::vector {} + : std::move(targetv2_iter->second), + build_iter->second)); } } return Status::OK(); diff --git a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index 9f4cf5f4e22a07..c54be905f28f08 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -192,12 +192,7 @@ class RuntimeFilterMergeControllerEntity { private: Status _init_with_desc(const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options, - const std::vector* target_info, - const int producer_size); - - Status _init_with_desc(const TRuntimeFilterDesc* runtime_filter_desc, - const TQueryOptions* query_options, - const std::vector* target_info, + const std::vector&& target_info, const int producer_size); UniqueId _query_id; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index ecaf99061a070b..df7c4141691d0b 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -341,7 +341,9 @@ Status RuntimeState::create_error_log_file() { LOG(WARNING) << error_msg.str(); return Status::InternalError(error_msg.str()); } - VLOG_FILE << "create error log file: " << _error_log_file_path; + LOG(INFO) << "create error log file: " << _error_log_file_path + << ", query id: " << print_id(_query_id) + << ", fragment instance id: " << print_id(_fragment_instance_id); return Status::OK(); } diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index 784904c78a3fb1..c5b27c823054a4 100644 --- a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -74,7 +74,7 @@ Status upload_with_checksum(io::RemoteFileSystem& fs, std::string_view local_pat RETURN_IF_ERROR(fs.upload(local_path, full_remote_path)); break; default: - LOG(FATAL) << "unknown fs type: " << static_cast(fs.type()); + throw Exception(Status::FatalError("unknown fs type: {}", static_cast(fs.type()))); } return Status::OK(); } @@ -765,50 +765,68 @@ Status 
SnapshotLoader::move(const std::string& snapshot_path, TabletSharedPtr ta return Status::InternalError(err_msg); } - if (overwrite) { - std::vector snapshot_files; - RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, &snapshot_files)); - - // 1. simply delete the old dir and replace it with the snapshot dir - try { - // This remove seems soft enough, because we already get - // tablet id and schema hash from this path, which - // means this path is a valid path. - std::filesystem::remove_all(tablet_path); - VLOG_CRITICAL << "remove dir: " << tablet_path; - std::filesystem::create_directory(tablet_path); - VLOG_CRITICAL << "re-create dir: " << tablet_path; - } catch (const std::filesystem::filesystem_error& e) { - std::stringstream ss; - ss << "failed to move tablet path: " << tablet_path << ". err: " << e.what(); - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } + if (!overwrite) { + throw Exception(Status::FatalError("only support overwrite now")); + } - // link files one by one - // files in snapshot dir will be moved in snapshot clean process - std::vector linked_files; - for (auto& file : snapshot_files) { - auto full_src_path = fmt::format("{}/{}", snapshot_path, file); - auto full_dest_path = fmt::format("{}/{}", tablet_path, file); - if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) { - LOG(WARNING) << "failed to link file from " << full_src_path << " to " - << full_dest_path << ", err: " << std::strerror(errno); - - // clean the already linked files - for (auto& linked_file : linked_files) { - remove(linked_file.c_str()); - } + // Medium migration/clone/checkpoint/compaction may change or check the + // files and tablet meta, so we need to take these locks. 
+ std::unique_lock migration_lock(tablet->get_migration_lock(), std::try_to_lock); + std::unique_lock base_compact_lock(tablet->get_base_compaction_lock(), std::try_to_lock); + std::unique_lock cumu_compact_lock(tablet->get_cumulative_compaction_lock(), std::try_to_lock); + std::unique_lock cold_compact_lock(tablet->get_cold_compaction_lock(), std::try_to_lock); + std::unique_lock build_idx_lock(tablet->get_build_inverted_index_lock(), std::try_to_lock); + std::unique_lock meta_store_lock(tablet->get_meta_store_lock(), std::try_to_lock); + if (!migration_lock.owns_lock() || !base_compact_lock.owns_lock() || + !cumu_compact_lock.owns_lock() || !cold_compact_lock.owns_lock() || + !build_idx_lock.owns_lock() || !meta_store_lock.owns_lock()) { + // This error should be retryable + auto status = Status::ObtainLockFailed("failed to get tablet locks, tablet: {}", tablet_id); + LOG(WARNING) << status << ", snapshot path: " << snapshot_path + << ", tablet path: " << tablet_path; + return status; + } - return Status::InternalError("move tablet failed"); + std::vector snapshot_files; + RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, &snapshot_files)); + + // FIXME: the below logic will damage the tablet files if failed in the middle. + + // 1. simply delete the old dir and replace it with the snapshot dir + try { + // This remove seems soft enough, because we already get + // tablet id and schema hash from this path, which + // means this path is a valid path. + std::filesystem::remove_all(tablet_path); + VLOG_CRITICAL << "remove dir: " << tablet_path; + std::filesystem::create_directory(tablet_path); + VLOG_CRITICAL << "re-create dir: " << tablet_path; + } catch (const std::filesystem::filesystem_error& e) { + std::stringstream ss; + ss << "failed to move tablet path: " << tablet_path << ". 
err: " << e.what(); + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + + // link files one by one + // files in snapshot dir will be moved in snapshot clean process + std::vector linked_files; + for (auto& file : snapshot_files) { + auto full_src_path = fmt::format("{}/{}", snapshot_path, file); + auto full_dest_path = fmt::format("{}/{}", tablet_path, file); + if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) { + LOG(WARNING) << "failed to link file from " << full_src_path << " to " << full_dest_path + << ", err: " << std::strerror(errno); + + // clean the already linked files + for (auto& linked_file : linked_files) { + remove(linked_file.c_str()); } - linked_files.push_back(full_dest_path); - VLOG_CRITICAL << "link file from " << full_src_path << " to " << full_dest_path; - } - } else { - LOG(FATAL) << "only support overwrite now"; - __builtin_unreachable(); + return Status::InternalError("move tablet failed"); + } + linked_files.push_back(full_dest_path); + VLOG_CRITICAL << "link file from " << full_src_path << " to " << full_dest_path; } // snapshot loader not need to change tablet uid diff --git a/be/src/runtime/stream_load/stream_load_executor.cpp b/be/src/runtime/stream_load/stream_load_executor.cpp index 482fadac44e051..054de96a881425 100644 --- a/be/src/runtime/stream_load/stream_load_executor.cpp +++ b/be/src/runtime/stream_load/stream_load_executor.cpp @@ -85,13 +85,18 @@ Status StreamLoadExecutor::execute_plan_fragment(std::shared_ptrnumber_unselected_rows = state->num_rows_load_unselected(); ctx->loaded_bytes = state->num_bytes_load_total(); int64_t num_selected_rows = ctx->number_total_rows - ctx->number_unselected_rows; + ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); if (!ctx->group_commit && num_selected_rows > 0 && (double)ctx->number_filtered_rows / num_selected_rows > ctx->max_filter_ratio) { // NOTE: Do not modify the error message here, for historical reasons, // some users may 
rely on this error message. - *status = Status::DataQualityError("too many filtered rows"); + if (ctx->need_commit_self) { + *status = + Status::DataQualityError("too many filtered rows, url: " + ctx->error_url); + } else { + *status = Status::DataQualityError("too many filtered rows"); + } } - ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); if (status->ok()) { DorisMetrics::instance()->stream_receive_bytes_total->increment(ctx->receive_bytes); @@ -385,8 +390,7 @@ bool StreamLoadExecutor::collect_load_stat(StreamLoadContext* ctx, TTxnCommitAtt } switch (ctx->load_type) { case TLoadType::MINI_LOAD: { - LOG(FATAL) << "mini load is not supported any more"; - break; + throw Exception(Status::FatalError("mini load is not supported any more")); } case TLoadType::ROUTINE_LOAD: { attach->loadType = TLoadType::ROUTINE_LOAD; diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index e0a44af69c1d66..9ba7949ec5afad 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -354,8 +354,7 @@ class ThreadLocalHandle { DCHECK(bthread_context != nullptr); bthread_context->thread_local_handle_count--; } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); + throw Exception(Status::FatalError("__builtin_unreachable")); } } }; @@ -379,8 +378,8 @@ static ThreadContext* thread_context(bool allow_return_null = false) { return nullptr; } // It means that use thread_context() but this thread not attached a query/load using SCOPED_ATTACH_TASK macro. - LOG(FATAL) << "__builtin_unreachable, " << doris::memory_orphan_check_msg; - __builtin_unreachable(); + throw Exception( + Status::FatalError("__builtin_unreachable, {}", doris::memory_orphan_check_msg)); } // belong to one query object member, not be shared by multiple queries. 
diff --git a/be/src/runtime/workload_group/workload_group_metrics.cpp b/be/src/runtime/workload_group/workload_group_metrics.cpp index 18ff7aa2f4f185..0f7322b7feb448 100644 --- a/be/src/runtime/workload_group/workload_group_metrics.cpp +++ b/be/src/runtime/workload_group/workload_group_metrics.cpp @@ -36,32 +36,31 @@ WorkloadGroupMetrics::WorkloadGroupMetrics(WorkloadGroup* wg) { _cpu_time_metric = std::make_unique( doris::MetricType::COUNTER, doris::MetricUnit::SECONDS, "workload_group_cpu_time_sec"); - _cpu_time_counter = - (IntAtomicCounter*)(_entity->register_metric(_cpu_time_metric.get())); + _cpu_time_counter = (IntCounter*)(_entity->register_metric(_cpu_time_metric.get())); _mem_used_bytes_metric = std::make_unique( doris::MetricType::COUNTER, doris::MetricUnit::BYTES, "workload_group_mem_used_bytes"); - _mem_used_bytes_counter = (IntAtomicCounter*)(_entity->register_metric( - _mem_used_bytes_metric.get())); + _mem_used_bytes_counter = + (IntCounter*)(_entity->register_metric(_mem_used_bytes_metric.get())); _local_scan_bytes_metric = std::make_unique( doris::MetricType::COUNTER, doris::MetricUnit::BYTES, "workload_group_local_scan_bytes"); - _local_scan_bytes_counter = (IntAtomicCounter*)(_entity->register_metric( - _local_scan_bytes_metric.get())); + _local_scan_bytes_counter = + (IntCounter*)(_entity->register_metric(_local_scan_bytes_metric.get())); _remote_scan_bytes_metric = std::make_unique( doris::MetricType::COUNTER, doris::MetricUnit::BYTES, "workload_group_remote_scan_bytes"); - _remote_scan_bytes_counter = (IntAtomicCounter*)(_entity->register_metric( - _remote_scan_bytes_metric.get())); + _remote_scan_bytes_counter = + (IntCounter*)(_entity->register_metric(_remote_scan_bytes_metric.get())); for (const auto& [key, io_throttle] : wg->_scan_io_throttle_map) { std::unique_ptr metric = std::make_unique( doris::MetricType::COUNTER, doris::MetricUnit::BYTES, "workload_group_local_scan_bytes_" + io_throttle->metric_name()); 
_local_scan_bytes_counter_map[key] = - (IntAtomicCounter*)(_entity->register_metric(metric.get())); + (IntCounter*)(_entity->register_metric(metric.get())); _local_scan_bytes_metric_map[key] = std::move(metric); } } diff --git a/be/src/runtime/workload_group/workload_group_metrics.h b/be/src/runtime/workload_group/workload_group_metrics.h index e68715df249dee..c761638d115439 100644 --- a/be/src/runtime/workload_group/workload_group_metrics.h +++ b/be/src/runtime/workload_group/workload_group_metrics.h @@ -28,7 +28,7 @@ class WorkloadGroup; template class AtomicCounter; -using IntAtomicCounter = AtomicCounter; +using IntCounter = AtomicCounter; class MetricEntity; struct MetricPrototype; @@ -65,11 +65,11 @@ class WorkloadGroupMetrics { // _local_disk_io_metric is every disk's IO std::map> _local_scan_bytes_metric_map; - IntAtomicCounter* _cpu_time_counter {nullptr}; // used for metric - IntAtomicCounter* _mem_used_bytes_counter {nullptr}; // used for metric - IntAtomicCounter* _local_scan_bytes_counter {nullptr}; // used for metric - IntAtomicCounter* _remote_scan_bytes_counter {nullptr}; // used for metric - std::map _local_scan_bytes_counter_map; // used for metric + IntCounter* _cpu_time_counter {nullptr}; // used for metric + IntCounter* _mem_used_bytes_counter {nullptr}; // used for metric + IntCounter* _local_scan_bytes_counter {nullptr}; // used for metric + IntCounter* _remote_scan_bytes_counter {nullptr}; // used for metric + std::map _local_scan_bytes_counter_map; // used for metric std::atomic _cpu_time_nanos {0}; std::atomic _last_cpu_time_nanos {0}; diff --git a/be/src/util/binary_cast.hpp b/be/src/util/binary_cast.hpp index 8a91ab3a579152..e7c62ad45ac091 100644 --- a/be/src/util/binary_cast.hpp +++ b/be/src/util/binary_cast.hpp @@ -137,8 +137,7 @@ To binary_cast(From from) { conv.decimal = from; return conv.i128; } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); + throw Exception(Status::FatalError("__builtin_unreachable")); 
} } diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 504b0b27428190..5ec5a8bf8e1aa4 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -237,9 +237,7 @@ class BitUtil { } else if constexpr (std::is_same_v) { return value; } else { - __builtin_unreachable(); - LOG(FATAL) << "__builtin_unreachable"; - return value; + throw Exception(Status::FatalError("__builtin_unreachable")); } } diff --git a/be/src/util/bitmap_value.h b/be/src/util/bitmap_value.h index 2d15ac99611274..528dbe40788229 100644 --- a/be/src/util/bitmap_value.h +++ b/be/src/util/bitmap_value.h @@ -2519,8 +2519,7 @@ class BitmapValueIterator { } break; case BitmapValue::BitmapDataType::SET: { - LOG(FATAL) << "BitmapValue with set do not support move"; - break; + throw Exception(Status::FatalError("BitmapValue with set do not support move")); } default: break; diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index d1788b0948a6f2..7a0aacd4252dec 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -233,7 +233,8 @@ class HadoopLz4BlockCompression : public Lz4BlockCompression { HadoopLz4BlockCompression() { Status st = Decompressor::create_decompressor(CompressType::LZ4BLOCK, &_decompressor); if (!st.ok()) { - LOG(FATAL) << "HadoopLz4BlockCompression construction failed. status = " << st << "\n"; + throw Exception(Status::FatalError( + "HadoopLz4BlockCompression construction failed. 
status = {}", st)); } } diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index 8f64fe699c6062..fc35be3dc35931 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -218,6 +218,10 @@ std::optional CGroupUtil::get_cgroupsv2_path(const std::string& sub Status CGroupUtil::read_int_line_from_cgroup_file(const std::filesystem::path& file_path, int64_t* val) { std::ifstream file_stream(file_path, std::ios::in); + if (!file_stream.is_open()) { + return Status::CgroupError("Error open {}", file_path.string()); + } + string line; getline(file_stream, line); if (file_stream.fail() || file_stream.bad()) { @@ -264,4 +268,167 @@ void CGroupUtil::read_int_metric_from_cgroup_file( } } +Status CGroupUtil::read_string_line_from_cgroup_file(const std::filesystem::path& file_path, + std::string* line_ptr) { + std::ifstream file_stream(file_path, std::ios::in); + if (!file_stream.is_open()) { + return Status::CgroupError("Error open {}", file_path.string()); + } + string line; + getline(file_stream, line); + if (file_stream.fail() || file_stream.bad()) { + return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); + } + *line_ptr = line; + return Status::OK(); +} + +Status CGroupUtil::parse_cpuset_line(std::string cpuset_line, int* cpu_count_ptr) { + if (cpuset_line.empty()) { + return Status::CgroupError("cpuset line is empty"); + } + std::vector ranges; + boost::split(ranges, cpuset_line, boost::is_any_of(",")); + int cpu_count = 0; + + for (const std::string& range : ranges) { + std::vector cpu_values; + boost::split(cpu_values, range, boost::is_any_of("-")); + + if (cpu_values.size() == 2) { + int start = std::stoi(cpu_values[0]); + int end = std::stoi(cpu_values[1]); + cpu_count += (end - start) + 1; + } else { + cpu_count++; + } + } + *cpu_count_ptr = cpu_count; + return Status::OK(); +} + +int CGroupUtil::get_cgroup_limited_cpu_number(int physical_cores) { + if (physical_cores <= 0) { + return 
physical_cores; + } + int ret = physical_cores; +#if defined(OS_LINUX) + // For cgroup v2 + // Child cgroup's cpu.max may be bigger than parent group's cpu.max, + // so it should look up from current cgroup to top group. + // For cpuset, child cgroup's cpuset.cpus could not be bigger than parent's cpuset.cpus. + if (CGroupUtil::cgroupsv2_enable()) { + std::string cgroupv2_process_path = CGroupUtil::cgroupv2_of_process(); + if (cgroupv2_process_path.empty()) { + return ret; + } + std::filesystem::path current_cgroup_path = (default_cgroups_mount / cgroupv2_process_path); + ret = get_cgroup_v2_cpu_quota_number(current_cgroup_path, default_cgroups_mount, ret); + + current_cgroup_path = (default_cgroups_mount / cgroupv2_process_path); + ret = get_cgroup_v2_cpuset_number(current_cgroup_path, default_cgroups_mount, ret); + } else if (CGroupUtil::cgroupsv1_enable()) { + // cpu quota, should find first not empty config from current path to top. + // because if a process attach to current cgroup, its cpu quota may not be set. + std::string cpu_quota_path = ""; + Status cpu_quota_ret = CGroupUtil::find_abs_cgroupv1_path("cpu", &cpu_quota_path); + if (cpu_quota_ret.ok() && !cpu_quota_path.empty()) { + std::filesystem::path current_cgroup_path = cpu_quota_path; + ret = get_cgroup_v1_cpu_quota_number(current_cgroup_path, default_cgroups_mount, ret); + } + + //cpuset + // just lookup current process cgroup path is enough + // because if a process attach to current cgroup, its cpuset.cpus must be set. 
+ std::string cpuset_path = ""; + Status cpuset_ret = CGroupUtil::find_abs_cgroupv1_path("cpuset", &cpuset_path); + if (cpuset_ret.ok() && !cpuset_path.empty()) { + std::filesystem::path current_path = cpuset_path; + ret = get_cgroup_v1_cpuset_number(current_path, ret); + } + } +#endif + return ret; +} + +int CGroupUtil::get_cgroup_v2_cpu_quota_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num) { + int ret = cpu_num; + while (current_path != default_cg_mout_path.parent_path()) { + std::ifstream cpu_max_file(current_path / "cpu.max"); + if (cpu_max_file.is_open()) { + std::string cpu_limit_str; + double cpu_period; + cpu_max_file >> cpu_limit_str >> cpu_period; + if (cpu_limit_str != "max" && cpu_period != 0) { + double cpu_limit = std::stod(cpu_limit_str); + ret = std::min(static_cast(std::ceil(cpu_limit / cpu_period)), ret); + } + } + current_path = current_path.parent_path(); + } + return ret; +} + +int CGroupUtil::get_cgroup_v2_cpuset_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num) { + int ret = cpu_num; + while (current_path != default_cg_mout_path.parent_path()) { + std::ifstream cpuset_cpus_file(current_path / "cpuset.cpus.effective"); + current_path = current_path.parent_path(); + if (cpuset_cpus_file.is_open()) { + std::string cpuset_line; + cpuset_cpus_file >> cpuset_line; + if (cpuset_line.empty()) { + continue; + } + int cpus_count = 0; + static_cast(CGroupUtil::parse_cpuset_line(cpuset_line, &cpus_count)); + ret = std::min(cpus_count, ret); + break; + } + } + return ret; +} + +int CGroupUtil::get_cgroup_v1_cpu_quota_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num) { + int ret = cpu_num; + while (current_path != default_cg_mout_path.parent_path()) { + std::ifstream cpu_quota_file(current_path / "cpu.cfs_quota_us"); + std::ifstream cpu_period_file(current_path / 
"cpu.cfs_period_us"); + if (cpu_quota_file.is_open() && cpu_period_file.is_open()) { + double cpu_quota_value; + double cpu_period_value; + cpu_quota_file >> cpu_quota_value; + cpu_period_file >> cpu_period_value; + if (cpu_quota_value > 0 && cpu_period_value > 0) { + ret = std::min(ret, + static_cast(std::ceil(cpu_quota_value / cpu_period_value))); + break; + } + } + current_path = current_path.parent_path(); + } + return ret; +} + +int CGroupUtil::get_cgroup_v1_cpuset_number(std::filesystem::path& current_path, int cpu_num) { + int ret = cpu_num; + std::string cpuset_line = ""; + Status cpuset_ret = CGroupUtil::read_string_line_from_cgroup_file( + (current_path / "cpuset.cpus"), &cpuset_line); + if (cpuset_ret.ok() && !cpuset_line.empty()) { + int cpuset_count = 0; + static_cast(CGroupUtil::parse_cpuset_line(cpuset_line, &cpuset_count)); + if (cpuset_count > 0) { + ret = std::min(ret, cpuset_count); + } + } + return ret; +} + } // namespace doris diff --git a/be/src/util/cgroup_util.h b/be/src/util/cgroup_util.h index bc1417453f41f6..54fc9494599f15 100644 --- a/be/src/util/cgroup_util.h +++ b/be/src/util/cgroup_util.h @@ -104,5 +104,27 @@ class CGroupUtil { static void read_int_metric_from_cgroup_file( const std::filesystem::path& file_path, std::unordered_map& metrics_map); + + static Status read_string_line_from_cgroup_file(const std::filesystem::path& file_path, + std::string* line_ptr); + + // cpuset_line: 0-4,6,8-10 + static Status parse_cpuset_line(std::string cpuset_line, int* cpu_count_ptr); + + static int get_cgroup_limited_cpu_number(int physical_cores); + + static int get_cgroup_v2_cpu_quota_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num); + + static int get_cgroup_v2_cpuset_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num); + + static int get_cgroup_v1_cpu_quota_number(std::filesystem::path& current_path, + const 
std::filesystem::path& default_cg_mout_path, + int cpu_num); + + static int get_cgroup_v1_cpuset_number(std::filesystem::path& current_path, int cpu_num); }; } // namespace doris diff --git a/be/src/util/core_local.cpp b/be/src/util/core_local.cpp deleted file mode 100644 index 1c4b1dd04715b4..00000000000000 --- a/be/src/util/core_local.cpp +++ /dev/null @@ -1,129 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "util/core_local.h" - -#include -#include -#include -#include - -#include "common/compiler_util.h" // IWYU pragma: keep -#include "common/logging.h" -#include "util/spinlock.h" -#include "util/sse_util.hpp" - -namespace doris { - -constexpr int BLOCK_SIZE = 4096; -struct alignas(CACHE_LINE_SIZE) CoreDataBlock { - void* at(size_t offset) { return data + offset; } - char data[BLOCK_SIZE]; - - static void* operator new(size_t nbytes) { - void* p = nullptr; - if (posix_memalign(&p, alignof(CoreDataBlock), nbytes) == 0) { - return p; - } - throw std::bad_alloc(); - } - - static void operator delete(void* p) { free(p); } -}; - -template -class CoreDataAllocatorImpl : public CoreDataAllocator { -public: - virtual ~CoreDataAllocatorImpl(); - void* get_or_create(size_t id) override { - size_t block_id = id / ELEMENTS_PER_BLOCK; - { - std::lock_guard l(_lock); - if (block_id >= _blocks.size()) { - _blocks.resize(block_id + 1); - } - } - CoreDataBlock* block = _blocks[block_id]; - if (block == nullptr) { - std::lock_guard l(_lock); - block = _blocks[block_id]; - if (block == nullptr) { - block = new CoreDataBlock(); - _blocks[block_id] = block; - } - } - size_t offset = (id % ELEMENTS_PER_BLOCK) * ELEMENT_BYTES; - return block->at(offset); - } - -private: - static constexpr int ELEMENTS_PER_BLOCK = BLOCK_SIZE / ELEMENT_BYTES; - SpinLock _lock; // lock to protect the modification of _blocks - std::vector _blocks; -}; - -template -CoreDataAllocatorImpl::~CoreDataAllocatorImpl() { - for (auto block : _blocks) { - delete block; - } -} - -CoreDataAllocatorFactory* CoreDataAllocatorFactory::instance() { - static CoreDataAllocatorFactory _s_instance; - return &_s_instance; -} - -CoreDataAllocator* CoreDataAllocatorFactory::get_allocator(size_t cpu_idx, size_t data_bytes) { - std::lock_guard l(_lock); - auto pair = std::make_pair(cpu_idx, data_bytes); - auto it = _allocators.find(pair); - if (it != std::end(_allocators)) { - return it->second; - } - CoreDataAllocator* 
allocator = nullptr; - switch (data_bytes) { - case 1: - allocator = new CoreDataAllocatorImpl<1>(); - break; - case 2: - allocator = new CoreDataAllocatorImpl<2>(); - break; - case 3: - case 4: - allocator = new CoreDataAllocatorImpl<4>(); - break; - case 5: - case 6: - case 7: - case 8: - allocator = new CoreDataAllocatorImpl<8>(); - break; - default: - DCHECK(false) << "don't support core local value for this size, size=" << data_bytes; - } - _allocators.emplace(pair, allocator); - return allocator; -} - -CoreDataAllocatorFactory::~CoreDataAllocatorFactory() { - for (auto& it : _allocators) { - delete it.second; - } -} - -} // namespace doris diff --git a/be/src/util/core_local.h b/be/src/util/core_local.h deleted file mode 100644 index 1610ae5a0bb046..00000000000000 --- a/be/src/util/core_local.h +++ /dev/null @@ -1,162 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "common/compiler_util.h" // IWYU pragma: keep - -namespace doris { - -class CoreDataAllocator { -public: - virtual ~CoreDataAllocator() {} - virtual void* get_or_create(size_t id) = 0; -}; - -class CoreDataAllocatorFactory { -public: - CoreDataAllocatorFactory() {} - ~CoreDataAllocatorFactory(); - CoreDataAllocator* get_allocator(size_t cpu_id, size_t data_bytes); - static CoreDataAllocatorFactory* instance(); - -private: - DISALLOW_COPY_AND_ASSIGN(CoreDataAllocatorFactory); - -private: - std::mutex _lock; - std::map, CoreDataAllocator*> _allocators; -}; - -template -class CoreLocalValueController { -public: - CoreLocalValueController() { - int num_cpus = static_cast(std::thread::hardware_concurrency()); - _size = 8; - while (_size < num_cpus) { - _size <<= 1; - } - _allocators.resize(_size, nullptr); - for (int i = 0; i < _size; ++i) { - _allocators[i] = CoreDataAllocatorFactory::instance()->get_allocator(i, sizeof(T)); - } - } - - ~CoreLocalValueController() {} - - int get_id() { - std::lock_guard l(_lock); - int id = 0; - if (_free_ids.empty()) { - id = _next_id++; - } else { - id = _free_ids.back(); - _free_ids.pop_back(); - } - return id; - } - void reclaim_id(int id) { - std::lock_guard l(_lock); - _free_ids.push_back(id); - } - size_t size() const { return _size; } - CoreDataAllocator* allocator(int i) const { return _allocators[i]; } - - static CoreLocalValueController* instance() { - static CoreLocalValueController _s_instance; - return &_s_instance; - } - -private: - DISALLOW_COPY_AND_ASSIGN(CoreLocalValueController); - -private: - std::mutex _lock; - int _next_id = 0; - std::deque _free_ids; - std::vector _allocators; - size_t _size; -}; - -template -class CoreLocalValue { -public: - CoreLocalValue(const T init_value = T()) { - CoreLocalValueController* controller = CoreLocalValueController::instance(); - _id = 
controller->get_id(); - _size = controller->size(); - _values.resize(_size, nullptr); - for (int i = 0; i < _size; ++i) { - void* ptr = controller->allocator(i)->get_or_create(_id); - _values[i] = new (ptr) T(init_value); - } - } - - ~CoreLocalValue() { - for (int i = 0; i < _size; ++i) { - _values[i]->~T(); - } - CoreLocalValueController::instance()->reclaim_id(_id); - } - - size_t size() const { return _size; } - T* access() const { -#ifdef __APPLE__ - size_t cpu_id = 0; -#else - size_t cpu_id = sched_getcpu(); -#endif - if (cpu_id >= _size) { - cpu_id &= _size - 1; - } - return access_at_core(cpu_id); - } - T* access_at_core(size_t core_idx) const { return _values[core_idx]; } - - inline void reset() { - for (int i = 0; i < _size; ++i) { - _values[i]->~T(); - } - _values.clear(); - _values.resize(_size, nullptr); - CoreLocalValueController* controller = CoreLocalValueController::instance(); - for (int i = 0; i < _size; ++i) { - void* ptr = controller->allocator(i)->get_or_create(_id); - _values[i] = new (ptr) T(); - } - } - -private: - int _id = -1; - size_t _size = 0; - std::vector _values; -}; - -} // namespace doris diff --git a/be/src/util/cpu_info.cpp b/be/src/util/cpu_info.cpp index 116dacb8da7ed4..b49985cdc06830 100644 --- a/be/src/util/cpu_info.cpp +++ b/be/src/util/cpu_info.cpp @@ -59,6 +59,7 @@ #include "gflags/gflags.h" #include "gutil/stringprintf.h" #include "gutil/strings/substitute.h" +#include "util/cgroup_util.h" #include "util/pretty_printer.h" using boost::algorithm::contains; @@ -109,58 +110,6 @@ static struct { {"popcnt", CpuInfo::POPCNT}, {"avx", CpuInfo::AVX}, {"avx2", CpuInfo::AVX2}, }; -int cgroup_bandwidth_quota(int physical_cores) { - namespace fs = std::filesystem; - fs::path cpu_max = "/sys/fs/cgroup/cpu.max"; - fs::path cfs_quota = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; - fs::path cfs_period = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; - - int64_t quota, period; - char byte_buffer[1000]; - int64_t read_bytes; - - if 
(fs::exists(cpu_max)) { - // cgroup v2 - // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html - std::ifstream file(cpu_max); - file.read(byte_buffer, 999); - read_bytes = file.gcount(); - byte_buffer[read_bytes] = '\0'; - if (sscanf(byte_buffer, "%" SCNd64 " %" SCNd64 "", "a, &period) != 2) { - return physical_cores; - } - } else if (fs::exists(cfs_quota) && fs::exists(cfs_period)) { - // cgroup v1 - // https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html#management - - // Read the quota, this indicates how many microseconds the CPU can be utilized by this cgroup per period - std::ifstream quota_file(cfs_quota); - quota_file.read(byte_buffer, 999); - read_bytes = quota_file.gcount(); - byte_buffer[read_bytes] = '\0'; - if (sscanf(byte_buffer, "%" SCNd64 "", "a) != 1) { - return physical_cores; - } - - // Read the time period, a cgroup can utilize the CPU up to quota microseconds every period - std::ifstream period_file(cfs_period); - period_file.read(byte_buffer, 999); - read_bytes = period_file.gcount(); - byte_buffer[read_bytes] = '\0'; - if (sscanf(byte_buffer, "%" SCNd64 "", &period) != 1) { - return physical_cores; - } - } else { - // No cgroup quota - return physical_cores; - } - if (quota > 0 && period > 0) { - return int64_t(ceil(double(quota) / double(period))); - } else { - return physical_cores; - } -} - // Helper function to parse for hardware flags. // values contains a list of space-separated flags. check to see if the flags we // care about are present. 
@@ -212,7 +161,7 @@ void CpuInfo::init() { } } - int num_cores = cgroup_bandwidth_quota(physical_num_cores); + int num_cores = CGroupUtil::get_cgroup_limited_cpu_number(physical_num_cores); if (max_mhz != 0) { cycles_per_ms_ = int64_t(max_mhz) * 1000; } else { diff --git a/be/src/util/datetype_cast.hpp b/be/src/util/datetype_cast.hpp index 495631ea7e376c..5c187ded7b729c 100644 --- a/be/src/util/datetype_cast.hpp +++ b/be/src/util/datetype_cast.hpp @@ -29,8 +29,10 @@ /* * We use these function family to clarify our types of datelike type. for example: * DataTypeDate -------------------> ColumnDate -----------------------> Int64 - * | TypeToColumn ValueTypeOfColumn - * | TypeToValueType + * | | TypeToColumn ValueTypeOfColumn | + * | β†˜--------------------------------------------------------------β†— + * | ::FieldType + * ↓ TypeToValueType * VecDateTimeValue */ namespace doris::date_cast { @@ -102,6 +104,7 @@ constexpr bool IsV1() { std::is_same_v); } +// only for datelike types. template constexpr bool IsV2() { return !IsV1(); diff --git a/be/src/util/debug_util.cpp b/be/src/util/debug_util.cpp index 1cf03d2c22d0e1..0856b10c051709 100644 --- a/be/src/util/debug_util.cpp +++ b/be/src/util/debug_util.cpp @@ -17,6 +17,7 @@ #include "util/debug_util.h" +#include #include #include #include @@ -104,6 +105,16 @@ std::string hexdump(const char* buf, int len) { return ss.str(); } +bvar::Status be_version_metrics("doris_be_version", [] { + std::stringstream ss; + ss << version::doris_build_version_major() << 0 << version::doris_build_version_minor() << 0 + << version::doris_build_version_patch(); + if (version::doris_build_version_hotfix() > 0) { + ss << 0 << version::doris_build_version_hotfix(); + } + return std::strtoul(ss.str().c_str(), nullptr, 10); +}()); + std::string PrintThriftNetworkAddress(const TNetworkAddress& add) { std::stringstream ss; add.printTo(ss); diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp index 
e9d4f31e5ca137..e77ee1c36b6b89 100644 --- a/be/src/util/doris_metrics.cpp +++ b/be/src/util/doris_metrics.cpp @@ -311,17 +311,17 @@ DorisMetrics::DorisMetrics() : _metric_registry(_s_registry_name) { INT_GAUGE_METRIC_REGISTER(_server_metric_entity, broker_file_open_reading); INT_GAUGE_METRIC_REGISTER(_server_metric_entity, local_file_open_writing); INT_GAUGE_METRIC_REGISTER(_server_metric_entity, s3_file_open_writing); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, num_io_bytes_read_total); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, num_io_bytes_read_from_cache); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, num_io_bytes_read_from_remote); - - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, query_ctx_cnt); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_ctx_cnt); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_cnt); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_task_cnt); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_task_queued); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_task_running); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_task_submit_failed); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, num_io_bytes_read_total); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, num_io_bytes_read_from_cache); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, num_io_bytes_read_from_remote); + + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, query_ctx_cnt); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_ctx_cnt); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_cnt); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_task_cnt); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_task_queued); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, scanner_task_running); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
scanner_task_submit_failed); } void DorisMetrics::initialize(bool init_system_metrics, const std::set& disk_devices, diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h index 31b907eec9ed6c..d089758c21c93f 100644 --- a/be/src/util/doris_metrics.h +++ b/be/src/util/doris_metrics.h @@ -236,17 +236,17 @@ class DorisMetrics { UIntGauge* group_local_scan_thread_pool_queue_size = nullptr; UIntGauge* group_local_scan_thread_pool_thread_num = nullptr; - IntAtomicCounter* num_io_bytes_read_total = nullptr; - IntAtomicCounter* num_io_bytes_read_from_cache = nullptr; - IntAtomicCounter* num_io_bytes_read_from_remote = nullptr; - - IntAtomicCounter* query_ctx_cnt = nullptr; - IntAtomicCounter* scanner_ctx_cnt = nullptr; - IntAtomicCounter* scanner_cnt = nullptr; - IntAtomicCounter* scanner_task_cnt = nullptr; - IntAtomicCounter* scanner_task_queued = nullptr; - IntAtomicCounter* scanner_task_submit_failed = nullptr; - IntAtomicCounter* scanner_task_running = nullptr; + IntCounter* num_io_bytes_read_total = nullptr; + IntCounter* num_io_bytes_read_from_cache = nullptr; + IntCounter* num_io_bytes_read_from_remote = nullptr; + + IntCounter* query_ctx_cnt = nullptr; + IntCounter* scanner_ctx_cnt = nullptr; + IntCounter* scanner_cnt = nullptr; + IntCounter* scanner_task_cnt = nullptr; + IntCounter* scanner_task_queued = nullptr; + IntCounter* scanner_task_submit_failed = nullptr; + IntCounter* scanner_task_running = nullptr; static DorisMetrics* instance() { static DorisMetrics instance; diff --git a/be/src/util/easy_json.cc b/be/src/util/easy_json.cc index 46c3a1867f7b42..fcb8021e3836b2 100644 --- a/be/src/util/easy_json.cc +++ b/be/src/util/easy_json.cc @@ -27,6 +27,8 @@ #include #include #include + +#include "common/exception.h" // IWYU pragma: no_include using rapidjson::SizeType; @@ -200,8 +202,7 @@ EasyJson EasyJson::PushBack(EasyJson::ComplexTypeInitializer val) { } else if (val == kArray) { push_val.SetArray(); } else { - LOG(FATAL) << "Unknown 
initializer type"; - __builtin_unreachable(); + throw Exception(Status::FatalError("Unknown initializer type")); } value_->PushBack(push_val, alloc_->allocator()); return EasyJson(&(*value_)[value_->Size() - 1], alloc_); diff --git a/be/src/util/jsonb_utils.h b/be/src/util/jsonb_utils.h index 7dba0dca3af1eb..8ec842ef227dd5 100644 --- a/be/src/util/jsonb_utils.h +++ b/be/src/util/jsonb_utils.h @@ -23,6 +23,7 @@ #include +#include "common/exception.h" #include "jsonb_document.h" #include "jsonb_stream.h" #include "jsonb_writer.h" @@ -42,7 +43,8 @@ class JsonbToJson { const std::string to_json_string(const char* data, size_t size) { JsonbDocument* pdoc = doris::JsonbDocument::createDocument(data, size); if (!pdoc) { - LOG(FATAL) << "invalid json binary value: " << std::string_view(data, size); + throw Exception(Status::FatalError("invalid json binary value: {}", + std::string_view(data, size))); } return to_json_string(pdoc->getValue()); } diff --git a/be/src/util/metrics.h b/be/src/util/metrics.h index ac7e69a4ef8ab4..cb49884fefb60b 100644 --- a/be/src/util/metrics.h +++ b/be/src/util/metrics.h @@ -19,21 +19,17 @@ #include #include -#include -#include #include #include #include #include #include -#include #include #include #include #include -#include "util/core_local.h" #include "util/histogram.h" namespace doris { @@ -67,8 +63,8 @@ using Labels = std::unordered_map; class Metric { public: - Metric() {} - virtual ~Metric() {} + Metric() = default; + virtual ~Metric() = default; virtual std::string to_string() const = 0; virtual std::string to_prometheus(const std::string& display_name, const Labels& entity_labels, const Labels& metric_labels) const; @@ -83,7 +79,7 @@ template class AtomicMetric : public Metric { public: AtomicMetric() : _value(T()) {} - virtual ~AtomicMetric() {} + virtual ~AtomicMetric() = default; std::string to_string() const override { return std::to_string(value()); } @@ -101,81 +97,10 @@ class AtomicMetric : public Metric { std::atomic _value; 
}; -template -class LockSimpleMetric : public Metric { -public: - LockSimpleMetric() : _value(T()) {} - virtual ~LockSimpleMetric() {} - - std::string to_string() const override { return std::to_string(value()); } - - T value() const { - std::lock_guard l(_lock); - return _value; - } - - void increment(const T& delta) { - std::lock_guard l(this->_lock); - _value += delta; - } - - void set_value(const T& value) { - std::lock_guard l(this->_lock); - _value = value; - } - - rj::Value to_json_value(rj::Document::AllocatorType& allocator) const override { - return rj::Value(value()); - } - -protected: - // We use std::mutex instead of std::atomic is because atomic don't support - // double's fetch_add - // TODO(zc): If this is atomic is bottleneck, we change to thread local. - // performance: on Intel(R) Xeon(R) CPU E5-2450 int64_t - // original type: 2ns/op - // single thread std::mutex: 26ns/op - // multiple thread(8) std::mutex: 2500ns/op - mutable std::mutex _lock; - T _value; -}; - -template -class CoreLocalCounter : public Metric { -public: - CoreLocalCounter() {} - virtual ~CoreLocalCounter() {} - - std::string to_string() const override { - std::stringstream ss; - ss << value(); - return ss.str(); - } - - T value() const { - T sum = 0; - for (int i = 0; i < _value.size(); ++i) { - sum += *_value.access_at_core(i); - } - return sum; - } - - void increment(const T& delta) { __sync_fetch_and_add(_value.access(), delta); } - - void reset() { _value.reset(); } - - rj::Value to_json_value(rj::Document::AllocatorType& allocator) const override { - return rj::Value(value()); - } - -protected: - CoreLocalValue _value; -}; - class HistogramMetric : public Metric { public: - HistogramMetric() {} - virtual ~HistogramMetric() {} + HistogramMetric() = default; + virtual ~HistogramMetric() = default; HistogramMetric(const HistogramMetric&) = delete; HistogramMetric& operator=(const HistogramMetric&) = delete; @@ -208,41 +133,25 @@ class HistogramMetric : public Metric { 
template class AtomicCounter : public AtomicMetric { public: - AtomicCounter() {} - virtual ~AtomicCounter() {} + AtomicCounter() = default; + virtual ~AtomicCounter() = default; }; template class AtomicGauge : public AtomicMetric { public: AtomicGauge() : AtomicMetric() {} - virtual ~AtomicGauge() {} -}; - -template -class LockCounter : public LockSimpleMetric { -public: - LockCounter() : LockSimpleMetric() {} - virtual ~LockCounter() {} -}; - -// This can only used for trival type -template -class LockGauge : public LockSimpleMetric { -public: - LockGauge() : LockSimpleMetric() {} - virtual ~LockGauge() {} + virtual ~AtomicGauge() = default; }; -using IntCounter = CoreLocalCounter; -using IntAtomicCounter = AtomicCounter; -using UIntCounter = CoreLocalCounter; -using DoubleCounter = LockCounter; +using IntCounter = AtomicCounter; +using UIntCounter = AtomicCounter; +using DoubleCounter = AtomicCounter; using IntGauge = AtomicGauge; using UIntGauge = AtomicGauge; -using DoubleGauge = LockGauge; - +using DoubleGauge = AtomicGauge; using Labels = std::unordered_map; + struct MetricPrototype { public: MetricPrototype(MetricType type_, MetricUnit unit_, std::string name_, @@ -302,15 +211,12 @@ struct MetricPrototype { #define INT_GAUGE_METRIC_REGISTER(entity, metric) \ metric = (IntGauge*)(entity->register_metric(&METRIC_##metric)) -#define INT_DOUBLE_METRIC_REGISTER(entity, metric) \ +#define DOUBLE_GAUGE_METRIC_REGISTER(entity, metric) \ metric = (DoubleGauge*)(entity->register_metric(&METRIC_##metric)) #define INT_UGAUGE_METRIC_REGISTER(entity, metric) \ metric = (UIntGauge*)(entity->register_metric(&METRIC_##metric)) -#define INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, metric) \ - metric = (IntAtomicCounter*)(entity->register_metric(&METRIC_##metric)) - #define HISTOGRAM_METRIC_REGISTER(entity, metric) \ metric = (HistogramMetric*)(entity->register_metric(&METRIC_##metric)) @@ -338,8 +244,8 @@ enum class MetricEntityType { kServer, kTablet }; class MetricEntity { 
public: - MetricEntity(MetricEntityType type, const std::string& name, const Labels& labels) - : _type(type), _name(name), _labels(labels) {} + MetricEntity(MetricEntityType type, std::string name, Labels labels) + : _type(type), _name(std::move(name)), _labels(std::move(labels)) {} ~MetricEntity() { for (auto& metric : _metrics) { delete metric.second; @@ -401,7 +307,7 @@ using EntityMetricsByType = class MetricRegistry { public: - MetricRegistry(const std::string& name) : _name(name) {} + MetricRegistry(std::string name) : _name(std::move(name)) {} ~MetricRegistry(); std::shared_ptr register_entity( diff --git a/be/src/util/mysql_row_buffer.cpp b/be/src/util/mysql_row_buffer.cpp index 3e20a2d9de72fe..4fd7de13753a95 100644 --- a/be/src/util/mysql_row_buffer.cpp +++ b/be/src/util/mysql_row_buffer.cpp @@ -107,7 +107,11 @@ MysqlRowBuffer::~MysqlRowBuffer() { template void MysqlRowBuffer::open_dynamic_mode() { if (!_dynamic_mode) { - *_pos++ = NEXT_EIGHT_BYTE; + // if _pos now exactly at the end of _buf memory, + // we should reserve 1 byte for _dynamic_mode flag byte to avoid *pos = 254 + // cause _dynamic_mode flag byte be overwritten + reserve(1); + *_pos++ = NEXT_EIGHT_BYTE; // *_pos = 254 ; _pos++ // write length when dynamic mode close _len_pos = (_pos - _buf); _pos = _pos + 8; diff --git a/be/src/util/rle_encoding.h b/be/src/util/rle_encoding.h index 206349b472815d..5369ace9eed6ce 100644 --- a/be/src/util/rle_encoding.h +++ b/be/src/util/rle_encoding.h @@ -283,7 +283,7 @@ void RleDecoder::RewindOne() { switch (rewind_state_) { case CANT_REWIND: - LOG(FATAL) << "Can't rewind more than once after each read!"; + throw Exception(Status::FatalError("Can't rewind more than once after each read!")); break; case REWIND_RUN: ++repeat_count_; diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index 99313132382e5c..bfa75b728d5620 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -309,8 +309,11 
@@ class VStringFunctions { // is to say, counting bytes which do not match 10xx_xxxx pattern. // All 0xxx_xxxx, 110x_xxxx, 1110_xxxx and 1111_0xxx are greater than 1011_1111 when use int8_t arithmetic, // so just count bytes greater than 1011_1111 in a byte string as the result of utf8_length. - static inline size_t get_char_len(const char* src, size_t len) { - size_t char_len = 0; + // get_char_len is used to return the UTF-8 length of a string. + // The return value will never exceed len. + template + static inline T get_char_len(const char* src, T len) { + T char_len = 0; const char* p = src; const char* end = p + len; #if defined(__SSE2__) || defined(__aarch64__) diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp index 973f461d8defe7..ecbb4d580360c4 100644 --- a/be/src/util/system_metrics.cpp +++ b/be/src/util/system_metrics.cpp @@ -33,18 +33,23 @@ #include "gutil/strings/split.h" // for string split #include "gutil/strtoint.h" // for atoi64 +#include "util/cgroup_util.h" #include "util/mem_info.h" #include "util/perf_counters.h" namespace doris { +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(avail_cpu_num, MetricUnit::NOUNIT); + DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(host_cpu_num, MetricUnit::NOUNIT); struct CpuNumberMetrics { CpuNumberMetrics(MetricEntity* ent) : entity(ent) { - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, host_cpu_num); + INT_COUNTER_METRIC_REGISTER(entity, host_cpu_num); + INT_COUNTER_METRIC_REGISTER(entity, avail_cpu_num); } - IntAtomicCounter* host_cpu_num {nullptr}; + IntCounter* host_cpu_num {nullptr}; + IntCounter* avail_cpu_num {nullptr}; MetricEntity* entity = nullptr; }; @@ -65,16 +70,16 @@ DEFINE_CPU_COUNTER_METRIC(guest_nice); // /proc/stat: http://www.linuxhowtos.org/System/procstat.htm struct CpuMetrics { CpuMetrics(MetricEntity* ent) : entity(ent) { - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_user); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_nice); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, 
cpu_system); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_idle); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_iowait); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_irq); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_soft_irq); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_steal); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_guest); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, cpu_guest_nice); + INT_COUNTER_METRIC_REGISTER(entity, cpu_user); + INT_COUNTER_METRIC_REGISTER(entity, cpu_nice); + INT_COUNTER_METRIC_REGISTER(entity, cpu_system); + INT_COUNTER_METRIC_REGISTER(entity, cpu_idle); + INT_COUNTER_METRIC_REGISTER(entity, cpu_iowait); + INT_COUNTER_METRIC_REGISTER(entity, cpu_irq); + INT_COUNTER_METRIC_REGISTER(entity, cpu_soft_irq); + INT_COUNTER_METRIC_REGISTER(entity, cpu_steal); + INT_COUNTER_METRIC_REGISTER(entity, cpu_guest); + INT_COUNTER_METRIC_REGISTER(entity, cpu_guest_nice); metrics[0] = cpu_user; metrics[1] = cpu_nice; @@ -91,18 +96,18 @@ struct CpuMetrics { static constexpr int cpu_num_metrics = 10; MetricEntity* entity = nullptr; - IntAtomicCounter* cpu_user; - IntAtomicCounter* cpu_nice; - IntAtomicCounter* cpu_system; - IntAtomicCounter* cpu_idle; - IntAtomicCounter* cpu_iowait; - IntAtomicCounter* cpu_irq; - IntAtomicCounter* cpu_soft_irq; - IntAtomicCounter* cpu_steal; - IntAtomicCounter* cpu_guest; - IntAtomicCounter* cpu_guest_nice; - - IntAtomicCounter* metrics[cpu_num_metrics]; + IntCounter* cpu_user; + IntCounter* cpu_nice; + IntCounter* cpu_system; + IntCounter* cpu_idle; + IntCounter* cpu_iowait; + IntCounter* cpu_irq; + IntCounter* cpu_soft_irq; + IntCounter* cpu_steal; + IntCounter* cpu_guest; + IntCounter* cpu_guest_nice; + + IntCounter* metrics[cpu_num_metrics]; }; #define DEFINE_MEMORY_GAUGE_METRIC(metric, unit) \ @@ -211,25 +216,25 @@ DEFINE_DISK_COUNTER_METRIC(io_time_weigthed, MetricUnit::MILLISECONDS); struct DiskMetrics { DiskMetrics(MetricEntity* ent) : entity(ent) { - 
INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_reads_completed); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_bytes_read); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_read_time_ms); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_writes_completed); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_bytes_written); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_write_time_ms); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_io_time_ms); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, disk_io_time_weigthed); + INT_COUNTER_METRIC_REGISTER(entity, disk_reads_completed); + INT_COUNTER_METRIC_REGISTER(entity, disk_bytes_read); + INT_COUNTER_METRIC_REGISTER(entity, disk_read_time_ms); + INT_COUNTER_METRIC_REGISTER(entity, disk_writes_completed); + INT_COUNTER_METRIC_REGISTER(entity, disk_bytes_written); + INT_COUNTER_METRIC_REGISTER(entity, disk_write_time_ms); + INT_COUNTER_METRIC_REGISTER(entity, disk_io_time_ms); + INT_COUNTER_METRIC_REGISTER(entity, disk_io_time_weigthed); } MetricEntity* entity = nullptr; - IntAtomicCounter* disk_reads_completed; - IntAtomicCounter* disk_bytes_read; - IntAtomicCounter* disk_read_time_ms; - IntAtomicCounter* disk_writes_completed; - IntAtomicCounter* disk_bytes_written; - IntAtomicCounter* disk_write_time_ms; - IntAtomicCounter* disk_io_time_ms; - IntAtomicCounter* disk_io_time_weigthed; + IntCounter* disk_reads_completed; + IntCounter* disk_bytes_read; + IntCounter* disk_read_time_ms; + IntCounter* disk_writes_completed; + IntCounter* disk_bytes_written; + IntCounter* disk_write_time_ms; + IntCounter* disk_io_time_ms; + IntCounter* disk_io_time_weigthed; }; #define DEFINE_NETWORK_COUNTER_METRIC(metric, unit) \ @@ -241,17 +246,17 @@ DEFINE_NETWORK_COUNTER_METRIC(send_packets, MetricUnit::PACKETS); struct NetworkMetrics { NetworkMetrics(MetricEntity* ent) : entity(ent) { - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, network_receive_bytes); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, network_receive_packets); - 
INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, network_send_bytes); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, network_send_packets); + INT_COUNTER_METRIC_REGISTER(entity, network_receive_bytes); + INT_COUNTER_METRIC_REGISTER(entity, network_receive_packets); + INT_COUNTER_METRIC_REGISTER(entity, network_send_bytes); + INT_COUNTER_METRIC_REGISTER(entity, network_send_packets); } MetricEntity* entity = nullptr; - IntAtomicCounter* network_receive_bytes; - IntAtomicCounter* network_receive_packets; - IntAtomicCounter* network_send_bytes; - IntAtomicCounter* network_send_packets; + IntCounter* network_receive_bytes; + IntCounter* network_receive_packets; + IntCounter* network_send_bytes; + IntCounter* network_send_packets; }; #define DEFINE_SNMP_COUNTER_METRIC(metric, unit, desc) \ @@ -265,17 +270,17 @@ DEFINE_SNMP_COUNTER_METRIC(tcp_out_segs, MetricUnit::NOUNIT, "All send TCP packe // metrics read from /proc/net/snmp struct SnmpMetrics { SnmpMetrics(MetricEntity* ent) : entity(ent) { - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_errs); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, snmp_tcp_retrans_segs); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_segs); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, snmp_tcp_out_segs); + INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_errs); + INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_retrans_segs); + INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_segs); + INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_out_segs); } MetricEntity* entity = nullptr; - IntAtomicCounter* snmp_tcp_in_errs; - IntAtomicCounter* snmp_tcp_retrans_segs; - IntAtomicCounter* snmp_tcp_in_segs; - IntAtomicCounter* snmp_tcp_out_segs; + IntCounter* snmp_tcp_in_errs; + IntCounter* snmp_tcp_retrans_segs; + IntCounter* snmp_tcp_in_segs; + IntCounter* snmp_tcp_out_segs; }; #define DEFINE_FD_COUNTER_METRIC(metric, unit) \ @@ -303,9 +308,9 @@ DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(15_minutes); struct LoadAverageMetrics { LoadAverageMetrics(MetricEntity* 
ent) : entity(ent) { - INT_DOUBLE_METRIC_REGISTER(entity, load_average_1_minutes); - INT_DOUBLE_METRIC_REGISTER(entity, load_average_5_minutes); - INT_DOUBLE_METRIC_REGISTER(entity, load_average_15_minutes); + DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_1_minutes); + DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_5_minutes); + DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_15_minutes); } MetricEntity* entity = nullptr; @@ -324,18 +329,18 @@ DEFINE_PROC_STAT_COUNTER_METRIC(procs_blocked); struct ProcMetrics { ProcMetrics(MetricEntity* ent) : entity(ent) { - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_interrupt); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_ctxt_switch); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_procs_running); - INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_procs_blocked); + INT_COUNTER_METRIC_REGISTER(entity, proc_interrupt); + INT_COUNTER_METRIC_REGISTER(entity, proc_ctxt_switch); + INT_COUNTER_METRIC_REGISTER(entity, proc_procs_running); + INT_COUNTER_METRIC_REGISTER(entity, proc_procs_blocked); } MetricEntity* entity = nullptr; - IntAtomicCounter* proc_interrupt; - IntAtomicCounter* proc_ctxt_switch; - IntAtomicCounter* proc_procs_running; - IntAtomicCounter* proc_procs_blocked; + IntCounter* proc_interrupt; + IntCounter* proc_ctxt_switch; + IntCounter* proc_procs_running; + IntCounter* proc_procs_blocked; }; DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_disk_io_util_percent, MetricUnit::PERCENT); @@ -1004,6 +1009,14 @@ void SystemMetrics::_update_proc_metrics() { fclose(fp); } +void SystemMetrics::update_be_avail_cpu_num() { + int64_t physical_cpu_num = _cpu_num_metrics->host_cpu_num->value(); + if (physical_cpu_num > 0) { + physical_cpu_num = CGroupUtil::get_cgroup_limited_cpu_number(physical_cpu_num); + _cpu_num_metrics->avail_cpu_num->set_value(physical_cpu_num); + } +} + void SystemMetrics::get_metrics_from_proc_vmstat() { #ifdef BE_TEST FILE* fp = fopen(k_ut_vmstat_path, "r"); diff --git 
a/be/src/util/system_metrics.h b/be/src/util/system_metrics.h index 29ce8c9c02b359..2c5446b81f4f71 100644 --- a/be/src/util/system_metrics.h +++ b/be/src/util/system_metrics.h @@ -66,6 +66,8 @@ class SystemMetrics { void update_max_network_receive_bytes_rate(int64_t max_receive_bytes_rate); void update_allocator_metrics(); + void update_be_avail_cpu_num(); + private: void _install_cpu_metrics(); // On Intel(R) Xeon(R) CPU E5-2450 0 @ 2.10GHz; diff --git a/be/src/util/threadpool.cpp b/be/src/util/threadpool.cpp index f5ea38515def36..e9af13f556e143 100644 --- a/be/src/util/threadpool.cpp +++ b/be/src/util/threadpool.cpp @@ -27,6 +27,7 @@ #include #include +#include "common/exception.h" #include "common/logging.h" #include "gutil/map-util.h" #include "gutil/port.h" @@ -194,7 +195,7 @@ void ThreadPoolToken::transition(State new_state) { CHECK(false); // QUIESCED is a terminal state break; default: - LOG(FATAL) << "Unknown token state: " << _state; + throw Exception(Status::FatalError("Unknown token state: {}", _state)); } #endif @@ -616,10 +617,10 @@ Status ThreadPool::create_thread() { void ThreadPool::check_not_pool_thread_unlocked() { Thread* current = Thread::current_thread(); if (ContainsKey(_threads, current)) { - LOG(FATAL) << strings::Substitute( - "Thread belonging to thread pool '$0' with " - "name '$1' called pool function that would result in deadlock", - _name, current->name()); + throw Exception( + Status::FatalError("Thread belonging to thread pool {} with " + "name {} called pool function that would result in deadlock", + _name, current->name())); } } diff --git a/be/src/util/timezone_utils.cpp b/be/src/util/timezone_utils.cpp index 6bb71ac46471c9..a26ad3703b79b9 100644 --- a/be/src/util/timezone_utils.cpp +++ b/be/src/util/timezone_utils.cpp @@ -35,6 +35,7 @@ #include #include +#include "common/exception.h" #include "common/logging.h" #include "common/status.h" @@ -83,8 +84,7 @@ void TimezoneUtils::load_timezones_to_cache() { const auto root_path = 
fs::path {base_str}; if (!exists(root_path)) { - LOG(FATAL) << "Cannot find system tzfile. Doris exiting!"; - __builtin_unreachable(); + throw Exception(Status::FatalError("Cannot find system tzfile. Doris exiting!")); } std::set ignore_paths = {"posix", "right"}; // duplications. ignore them. diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp index 5355b9014202bb..f90c27e5e915ac 100644 --- a/be/src/util/utf8_check.cpp +++ b/be/src/util/utf8_check.cpp @@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) { return validate_utf8_naive(src, len); } #endif + +bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len) { + if (params.__isset.file_attributes && !params.file_attributes.enable_text_validate_utf8) { + return true; + } + return validate_utf8(src, len); +} } // namespace doris diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h index 4214e186b71508..7e9b7a2a9de6af 100644 --- a/be/src/util/utf8_check.h +++ b/be/src/util/utf8_check.h @@ -17,6 +17,8 @@ #pragma once +#include + #include namespace doris { @@ -25,4 +27,6 @@ namespace doris { bool validate_utf8(const char* src, size_t len); // check utf8 use naive c++ bool validate_utf8_naive(const char* data, size_t len); + +bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len); } // namespace doris diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index e0ec2bef62fc2a..d761d40c4c932c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -20,6 +20,8 @@ #pragma once +#include + #include "common/exception.h" #include "common/status.h" #include "util/defer_op.h" @@ -81,7 +83,7 @@ using ConstAggregateDataPtr = const char*; */ class IAggregateFunction { public: - IAggregateFunction(const DataTypes& argument_types_) : argument_types(argument_types_) {} + IAggregateFunction(DataTypes 
argument_types_) : argument_types(std::move(argument_types_)) {} /// Get main function name. virtual String get_name() const = 0; @@ -225,7 +227,7 @@ class IAggregateFunction { virtual void set_version(const int version_) { version = version_; } - virtual AggregateFunctionPtr transmit_to_stable() { return nullptr; } + virtual IAggregateFunction* transmit_to_stable() { return nullptr; } /// Verify function signature virtual Status verify_result_type(const bool without_key, const DataTypes& argument_types, diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.h b/be/src/vec/aggregate_functions/aggregate_function_distinct.h index 46450394627474..a5515145d9d2ad 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.h +++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.h @@ -341,12 +341,22 @@ class AggregateFunctionDistinct DataTypePtr get_return_type() const override { return nested_func->get_return_type(); } - AggregateFunctionPtr transmit_to_stable() override { - return AggregateFunctionPtr(new AggregateFunctionDistinct( - nested_func, IAggregateFunction::argument_types)); + IAggregateFunction* transmit_to_stable() override { + return new AggregateFunctionDistinct(nested_func, + IAggregateFunction::argument_types); } }; +template +struct FunctionStableTransfer { + using FunctionStable = T; +}; + +template