From a8779453f161f1338e91a57c30737ba3d331d279 Mon Sep 17 00:00:00 2001 From: hui lai <1353307710@qq.com> Date: Thu, 13 Jun 2024 22:01:09 +0800 Subject: [PATCH] [fix](multi-table-load) fix be core when multi table load pipe finish fail (#36269) ## Proposed changes ``` *** Current BE git commitID: 5a8ea3079d *** *** SIGSEGV address not mapped to object (@0x18) received by PID 3726857 (TID 3727585 OR 0x7f0129e83700) from PID 24; stack trace: *** 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /mnt/disk2/xujianxu/doris/be/src/common/signal_handler.h:421 1# PosixSignals::chained_handler(int, siginfo*, void*) [clone .part.0] in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so 2# JVM_handle_linux_signal in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so 3# 0x00007F01D9E87090 in /lib/x86_64-linux-gnu/libc.so.6 4# std::_Function_handler (), std::__future_base::_State_baseV2::_Setter >::_M_invoke(std::_Any_data const&) at /mnt/disk2/xujianxu/ldb_toolchain/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/bits/std_function.h:290 5# std::__future_base::_State_baseV2::_M_do_set(std::function ()>*, bool*) at /mnt/disk2/xujianxu/ldb_toolchain/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/future:593 6# __pthread_once_slow at /build/glibc-SzIz7B/glibc-2.31/nptl/pthread_once.c:118 7# std::__future_base::_State_baseV2::_M_set_result(std::function ()>, bool) at /mnt/disk2/xujianxu/ldb_toolchain/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/future:428 8# doris::io::MultiTablePipe::_handle_consumer_finished() at /mnt/disk2/xujianxu/doris/be/src/io/fs/multi_table_pipe.cpp:334 9# doris::io::MultiTablePipe::exec_plans(doris::ExecEnv*, std::vector >)::{lambda(doris::RuntimeState*, doris::Status*)#1}::operator()(doris::RuntimeState*, doris::Status*) const at /mnt/disk2/xujianxu/doris/be/src/io/fs/multi_table_pipe.cpp:253 10# doris::pipeline::PipelineFragmentContext::~PipelineFragmentContext() 
at /mnt/disk2/xujianxu/doris/be/src/pipeline/pipeline_fragment_context.cpp:131 11# std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release_last_use_cold() at /mnt/disk2/xujianxu/ldb_toolchain/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/bits/shared_ptr_base.h:199 12# doris::pipeline::_close_task(doris::pipeline::PipelineTask*, doris::Status) at /mnt/disk2/xujianxu/doris/be/src/pipeline/task_scheduler.cpp:95 13# doris::pipeline::TaskScheduler::_do_work(unsigned long) at /mnt/disk2/xujianxu/doris/be/src/pipeline/task_scheduler.cpp:168 14# doris::ThreadPool::dispatch_thread() in /mnt/hdd01/STRESS_ENV/be/lib/doris_be 15# doris::Thread::supervise_thread(void*) at /mnt/disk2/xujianxu/doris/be/src/util/thread.cpp:499 16# start_thread at /build/glibc-SzIz7B/glibc-2.31/nptl/pthread_create.c:478 17# __clone at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97 ``` BE will core dump when the multi-table load pipe fails to finish. exec_task returns early if finish fails, causing ctx to be destructed. Waiting for all tables to finish solves this problem. --- be/src/runtime/routine_load/routine_load_task_executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/runtime/routine_load/routine_load_task_executor.cpp b/be/src/runtime/routine_load/routine_load_task_executor.cpp index 292f45ec030de4..958ad37f8d2d09 100644 --- a/be/src/runtime/routine_load/routine_load_task_executor.cpp +++ b/be/src/runtime/routine_load/routine_load_task_executor.cpp @@ -408,7 +408,7 @@ void RoutineLoadTaskExecutor::exec_task(std::shared_ptr ctx, } // need memory order multi_table_pipe->handle_consume_finished(); - HANDLE_ERROR(kafka_pipe->finish(), "finish multi table task failed"); + HANDLE_MULTI_TABLE_ERROR(kafka_pipe->finish(), "finish multi table task failed"); } else { // start to consume, this may block a while HANDLE_ERROR(consumer_grp->start_all(ctx, kafka_pipe), "consuming failed");