diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 15b12fc843eaff5..2757578827c2bb7 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -16,3 +16,4 @@ # be/src/io/* @platoneko @gavinchou @dataroaring fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @dataroaring @CalvinKirs @morningman +**/pom.xml @CalvinKirs @morningman diff --git a/.github/workflows/comment-to-trigger-teamcity.yml b/.github/workflows/comment-to-trigger-teamcity.yml index bd3e29dbb0c8e80..f56077424b6d926 100644 --- a/.github/workflows/comment-to-trigger-teamcity.yml +++ b/.github/workflows/comment-to-trigger-teamcity.yml @@ -21,15 +21,21 @@ on: issue_comment: types: [created, edited] +permissions: + contents: read + pull-requests: write + statuses: write + jobs: check-comment-if-need-to-trigger-teamcity: # This job only runs for pull request comments, and comment body contains 'run' - if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, 'run') }} + if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, 'run') || contains(github.event.comment.body, 'skip buildall')) }} runs-on: ubuntu-latest env: COMMENT_BODY: ${{ github.event.comment.body }} + COMMENT_USER_ID: ${{ github.event.comment.user.id }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: @@ -50,8 +56,20 @@ jobs: "${COMMENT_BODY}" == *'run arm'* || "${COMMENT_BODY}" == *'run performance'* ]]; then echo "comment_trigger=true" | tee -a "$GITHUB_OUTPUT" + echo "comment_skip=false" | tee -a "$GITHUB_OUTPUT" + elif [[ "${COMMENT_BODY}" == *'skip buildall'* ]]; then + if [[ "${COMMENT_USER_ID}" == '27881198' || + "${COMMENT_USER_ID}" == '37901441' ]]; then + echo "comment_trigger=false" | tee -a "$GITHUB_OUTPUT" + echo "comment_skip=true" | tee -a "$GITHUB_OUTPUT" + echo "COMMENT_USER_ID ${COMMENT_USER_ID} is allowed to skip buildall." + else + echo "COMMENT_USER_ID ${COMMENT_USER_ID} is not allowed to skip buildall." + exit + fi else echo "comment_trigger=false" | tee -a "$GITHUB_OUTPUT" + echo "comment_skip=false" | tee -a "$GITHUB_OUTPUT" echo "find no keyword in comment body, skip this action." 
exit fi @@ -71,7 +89,7 @@ jobs: echo "COMMENT_REPEAT_TIMES=${COMMENT_REPEAT_TIMES}" | tee -a "$GITHUB_OUTPUT" - name: "Checkout master" - if: ${{ fromJSON(steps.parse.outputs.comment_trigger) }} + if: ${{ fromJSON(steps.parse.outputs.comment_trigger) || fromJSON(steps.parse.outputs.comment_skip) }} uses: actions/checkout@v4 - name: "Check if pr need run build" @@ -150,6 +168,7 @@ jobs: set -x if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.0'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.1'" ]]; then trigger_or_skip_build \ "${{ steps.changes.outputs.changed_fe_ut }}" \ @@ -158,7 +177,7 @@ jobs: "feut" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch is not in (master, branch-2.0, branch-2.1), skip run feut" + echo "PR target branch is not in (master, branch-2.0, branch-2.1, branch-3.0), skip run feut" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -175,6 +194,7 @@ jobs: set -x if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.0'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.1'" ]]; then trigger_or_skip_build \ "${{ steps.changes.outputs.changed_be_ut }}" \ @@ -183,7 +203,7 @@ jobs: "beut" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch is not in (master, branch-2.0, branch-2.1), skip run beut" + echo "PR target branch is not in (master, branch-2.0, branch-2.1, branch-3.0), skip run beut" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -197,7 +217,8 @@ jobs: run: | source ./regression-test/pipeline/common/teamcity-utils.sh set -x - if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" ]]; then + if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" ]]; then trigger_or_skip_build \ "${{ steps.changes.outputs.changed_cloud_ut }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -205,7 +226,7 @@ jobs: "cloudut" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch is not master, skip run cloudut" + echo "PR target branch is not in (master, branch-3.0), skip run cloudut" fi - name: "Trigger or Skip compile" @@ -279,8 +300,9 @@ jobs: echo "COMMENT_TRIGGER_TYPE is buildall, trigger compile is enough, compile will trigger cloud_p0" && exit fi set -x - if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" ]]; then - echo "PR target branch in (master), need run cloud_p0" + if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" ]]; then + echo "PR target branch is in (master, branch-3.0), need run cloud_p0" trigger_or_skip_build \ "${{ steps.changes.outputs.changed_cloud_p0 }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -288,7 +310,7 @@ jobs: "cloud_p0" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch not in (master), skip run cloud_p0" + echo "PR target branch is not in (master, branch-3.0), skip run cloud_p0" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -305,8 +327,9 @@ jobs: echo "COMMENT_TRIGGER_TYPE is buildall, trigger compile is enough, compile will trigger cloud_p1" && 
exit fi set -x - if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" ]]; then - echo "PR target branch in (master), need run cloud_p1" + if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" ]]; then + echo "PR target branch is in (master, branch-3.0), need run cloud_p1" trigger_or_skip_build \ "${{ steps.changes.outputs.changed_cloud_p1 }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -314,7 +337,7 @@ jobs: "cloud_p1" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch not in (master), skip run cloud_p1" + echo "PR target branch is not in (master, branch-3.0), skip run cloud_p1" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -341,8 +364,9 @@ jobs: source ./regression-test/pipeline/common/teamcity-utils.sh set -x if [[ "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'master'" || + "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-3.0'" || "${{ steps.parse.outputs.TARGET_BRANCH }}" == "'branch-2.0'" ]]; then - echo "PR target branch in (master, branch-2.0), need run performance" + echo "PR target branch is in (master, branch-2.0, branch-3.0), need run performance" trigger_or_skip_build \ "${{ steps.changes.outputs.changed_performance }}" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -350,7 +374,7 @@ jobs: "performance" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" else - echo "PR target branch not in (master, branch-2.0), skip run performance" + echo "PR target branch is not in (master, branch-2.0, branch-3.0), skip run performance" trigger_or_skip_build \ "false" \ "${{ steps.parse.outputs.PULL_REQUEST_NUM }}" \ @@ -358,3 +382,19 @@ jobs: "performance" \ "${{ steps.parse.outputs.COMMENT_REPEAT_TIMES }}" fi + + - name: "Skip buildall" + if: ${{ fromJSON(steps.parse.outputs.comment_skip) }} + run: | + source ./regression-test/pipeline/common/teamcity-utils.sh + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" feut + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" beut + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" compile + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" p0 + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" p1 + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" external + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" performance + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" arm + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" cloud_p0 + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" cloud_p1 + skip_build "${{ steps.parse.outputs.COMMIT_ID_FROM_TRIGGER }}" cloudut diff --git a/.github/workflows/labeler/scope-label-conf.yml b/.github/workflows/labeler/scope-label-conf.yml index 47f6e1c4ab485be..4bd2862c67445af 100644 --- a/.github/workflows/labeler/scope-label-conf.yml +++ b/.github/workflows/labeler/scope-label-conf.yml @@ -25,4 +25,6 @@ meta-change: - gensrc/proto/* doing: - - * + - base-branch: 'master' + - changed-files: + - any-glob-to-any-file: '**' diff --git a/.licenserc.yaml b/.licenserc.yaml index e458f812bd45e4e..28821fbb82c6041 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -92,4 +92,5 @@ header: - "pytest/qe" - "pytest/sys/data" - "pytest/deploy/*.conf" + - "tools/jeprof" comment: on-failure diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 00d084bbc254e5e..f554ba6053a5e6d 100644 --- a/be/CMakeLists.txt +++ 
b/be/CMakeLists.txt @@ -157,6 +157,7 @@ set(BOOST_VERSION "1.81.0") if (NOT APPLE) find_package(Boost ${BOOST_VERSION} REQUIRED COMPONENTS system date_time) + find_package(Boost ${BOOST_VERSION} REQUIRED COMPONENTS system container) else() find_package(Boost ${BOOST_VERSION} COMPONENTS system date_time) find_package(Boost ${BOOST_VERSION} COMPONENTS system container) @@ -298,12 +299,11 @@ if (COMPILER_CLANG) -Wno-implicit-float-conversion -Wno-implicit-int-conversion -Wno-sign-conversion + -Wno-missing-field-initializers + -Wno-unused-const-variable -Wno-shorten-64-to-32) if (USE_LIBCPP) add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-stdlib=libc++>) - if (NOT OS_MACOSX) - add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-lstdc++>) - endif() add_definitions(-DUSE_LIBCPP) endif() endif () @@ -517,6 +517,7 @@ find_package(absl) # add it here first. set(COMMON_THIRDPARTY Boost::date_time + Boost::container ${COMMON_THIRDPARTY} ) @@ -559,7 +560,6 @@ endif() if (OS_MACOSX) set(COMMON_THIRDPARTY ${COMMON_THIRDPARTY} - Boost::container bfd iberty intl @@ -603,9 +603,11 @@ if (NOT OS_MACOSX) ${DORIS_DEPENDENCIES} -static-libstdc++ -static-libgcc - -lstdc++fs -lresolv ) + if (NOT (USE_LIBCPP AND COMPILER_CLANG)) + set(DORIS_LINK_LIBS ${DORIS_LINK_LIBS} -lstdc++fs) + endif() else() set(DORIS_LINK_LIBS ${DORIS_LINK_LIBS} diff --git a/be/src/agent/be_exec_version_manager.h b/be/src/agent/be_exec_version_manager.h index ec6ddf497ec084d..a7b4e2dee20e577 100644 --- a/be/src/agent/be_exec_version_manager.h +++ b/be/src/agent/be_exec_version_manager.h @@ -80,8 +80,9 @@ class BeExecVersionManager { * b. clear old version of version 3->4 * c. change FunctionIsIPAddressInRange from AlwaysNotNullable to DependOnArguments * d. change some agg function nullable property: PR #37215 + * e. change variant serde to fix PR #38413 */ -constexpr inline int BeExecVersionManager::max_be_exec_version = 5; +constexpr inline int BeExecVersionManager::max_be_exec_version = 7; constexpr inline int BeExecVersionManager::min_be_exec_version = 0; /// functional @@ -89,5 +90,6 @@ constexpr inline int BITMAP_SERDE = 3; constexpr inline int USE_NEW_SERDE = 4; // release on DORIS version 2.1 constexpr inline int OLD_WAL_SERDE = 3; // use to solve compatibility issues, see pr #32299 constexpr inline int AGG_FUNCTION_NULLABLE = 5; // change some agg nullable property: PR #37215 +constexpr inline int VARIANT_SERDE = 6; // change variant serde to fix PR #38413 } // namespace doris diff --git a/be/src/agent/cgroup_cpu_ctl.cpp b/be/src/agent/cgroup_cpu_ctl.cpp index e1bdd1c7207ec82..c98a28746f238df 100644 --- a/be/src/agent/cgroup_cpu_ctl.cpp +++ b/be/src/agent/cgroup_cpu_ctl.cpp @@ -19,31 +19,110 @@ #include #include +#include #include +#include "util/cgroup_util.h" #include "util/defer_op.h" namespace doris { -Status CgroupCpuCtl::init() { - _doris_cgroup_cpu_path = config::doris_cgroup_cpu_path; - if (_doris_cgroup_cpu_path.empty()) { - LOG(INFO) << "doris cgroup cpu path is not specify, path=" << _doris_cgroup_cpu_path; - return Status::InvalidArgument("doris cgroup cpu path {} is not specify.", - _doris_cgroup_cpu_path); +bool CgroupCpuCtl::is_a_valid_cgroup_path(std::string cg_path) { + if (!cg_path.empty()) { + if (cg_path.back() != '/') { + cg_path = cg_path + "/"; + } + if (_is_enable_cgroup_v2_in_env) { + std::string query_path_cg_type = cg_path + "cgroup.type"; + std::string query_path_ctl = cg_path + "cgroup.subtree_control"; + std::string query_path_procs = cg_path + "cgroup.procs"; + if (access(query_path_cg_type.c_str(), F_OK) != 0 || + access(query_path_ctl.c_str(), 
F_OK) != 0 || + access(query_path_procs.c_str(), F_OK) != 0) { + LOG(WARNING) << "[cgroup_init_path]invalid cgroup v2 path, access necessary file " + "failed"; + } else { + return true; + } + } else if (_is_enable_cgroup_v1_in_env) { + std::string query_path_tasks = cg_path + "tasks"; + std::string query_path_cpu_shares = cg_path + "cpu.shares"; + std::string query_path_quota = cg_path + "cpu.cfs_quota_us"; + if (access(query_path_tasks.c_str(), F_OK) != 0 || + access(query_path_cpu_shares.c_str(), F_OK) != 0 || + access(query_path_quota.c_str(), F_OK) != 0) { + LOG(WARNING) << "[cgroup_init_path]invalid cgroup v1 path, access necessary file " + "failed"; + } else { + return true; + } + } } + return false; +} - if (access(_doris_cgroup_cpu_path.c_str(), F_OK) != 0) { - LOG(INFO) << "doris cgroup cpu path not exists, path=" << _doris_cgroup_cpu_path; - return Status::InvalidArgument("doris cgroup cpu path {} not exists.", - _doris_cgroup_cpu_path); +void CgroupCpuCtl::init_doris_cgroup_path() { + std::string conf_path = config::doris_cgroup_cpu_path; + if (conf_path.empty()) { + LOG(INFO) << "[cgroup_init_path]doris cgroup home path is not specified"; + return; } - if (_doris_cgroup_cpu_path.back() != '/') { - _doris_cgroup_cpu_path = _doris_cgroup_cpu_path + "/"; + if (access(conf_path.c_str(), F_OK) != 0) { + LOG(INFO) << "[cgroup_init_path]doris cgroup home path does not exist, path=" << conf_path; + return; } - return Status::OK(); + + if (conf_path.back() != '/') { + conf_path = conf_path + "/"; + } + + // check whether current user specified path is a valid cgroup path + std::string cg_msg = "not set cgroup in env"; + if (CGroupUtil::cgroupsv2_enable()) { + _is_enable_cgroup_v2_in_env = true; + cg_msg = "cgroup v2 is enabled in env"; + } else if (CGroupUtil::cgroupsv1_enable()) { + _is_enable_cgroup_v1_in_env = true; + cg_msg = "cgroup v1 is enabled in env"; + } + bool is_cgroup_path_valid = CgroupCpuCtl::is_a_valid_cgroup_path(conf_path); + + std::string tmp_query_path = conf_path + "query"; + if (is_cgroup_path_valid) { + if (access(tmp_query_path.c_str(), F_OK) != 0) { + int ret = mkdir(tmp_query_path.c_str(), S_IRWXU); + if (ret != 0) { + LOG(ERROR) << "[cgroup_init_path]cgroup mkdir query failed, path=" + << tmp_query_path; + } + } + _is_cgroup_query_path_valid = CgroupCpuCtl::is_a_valid_cgroup_path(tmp_query_path); + } + + _doris_cgroup_cpu_path = conf_path; + _doris_cgroup_cpu_query_path = tmp_query_path; + std::string query_path_msg = _is_cgroup_query_path_valid ? "cgroup query path is valid" + : "cgroup query path is not valid"; + _cpu_core_num = CpuInfo::num_cores(); + + LOG(INFO) << "[cgroup_init_path]init cgroup home path finish, home path=" + << _doris_cgroup_cpu_path << ", query path=" << _doris_cgroup_cpu_query_path << ", " + << cg_msg << ", " << query_path_msg << ", core_num=" << _cpu_core_num; +} + +uint64_t CgroupCpuCtl::cpu_soft_limit_default_value() { + return _is_enable_cgroup_v2_in_env ? 
100 : 1024; +} + +std::unique_ptr<CgroupCpuCtl> CgroupCpuCtl::create_cgroup_cpu_ctl(uint64_t wg_id) { + if (_is_enable_cgroup_v2_in_env) { + return std::make_unique<CgroupV2CpuCtl>(wg_id); + } else if (_is_enable_cgroup_v1_in_env) { + return std::make_unique<CgroupV1CpuCtl>(wg_id); + } + return nullptr; } void CgroupCpuCtl::get_cgroup_cpu_info(uint64_t* cpu_shares, int* cpu_hard_limit) { @@ -78,7 +157,7 @@ void CgroupCpuCtl::update_cpu_soft_limit(int cpu_shares) { } } -Status CgroupCpuCtl::write_cg_sys_file(std::string file_path, int value, std::string msg, +Status CgroupCpuCtl::write_cg_sys_file(std::string file_path, std::string value, std::string msg, bool is_append) { int fd = open(file_path.c_str(), is_append ? O_RDWR | O_APPEND : O_RDWR); if (fd == -1) { @@ -102,82 +181,7 @@ return Status::OK(); } -Status CgroupV1CpuCtl::init() { - RETURN_IF_ERROR(CgroupCpuCtl::init()); - - // query path - _cgroup_v1_cpu_query_path = _doris_cgroup_cpu_path + "query"; - if (access(_cgroup_v1_cpu_query_path.c_str(), F_OK) != 0) { - int ret = mkdir(_cgroup_v1_cpu_query_path.c_str(), S_IRWXU); - if (ret != 0) { - LOG(ERROR) << "cgroup v1 mkdir query failed, path=" << _cgroup_v1_cpu_query_path; - return Status::InternalError("cgroup v1 mkdir query failed, path={}", - _cgroup_v1_cpu_query_path); - } - } - - // check whether current user specified path is a valid cgroup path - std::string query_path_tasks = _cgroup_v1_cpu_query_path + "/tasks"; - std::string query_path_cpu_shares = _cgroup_v1_cpu_query_path + "/cpu.shares"; - std::string query_path_quota = _cgroup_v1_cpu_query_path + "/cpu.cfs_quota_us"; - if (access(query_path_tasks.c_str(), F_OK) != 0) { - return Status::InternalError("invalid cgroup path, not find task file"); - } - if (access(query_path_cpu_shares.c_str(), F_OK) != 0) { - return Status::InternalError("invalid cgroup path, not find cpu share file"); - } - if (access(query_path_quota.c_str(), F_OK) != 0) { - return Status::InternalError("invalid cgroup path, not find cpu quota file"); - } - - if (_wg_id == -1) { - // means current cgroup cpu ctl is just used to clear dir, - // it does not contains workload group. 
- // todo(wb) rethinking whether need to refactor cgroup_cpu_ctl - _init_succ = true; - LOG(INFO) << "init cgroup cpu query path succ, path=" << _cgroup_v1_cpu_query_path; - return Status::OK(); - } - - // workload group path - _cgroup_v1_cpu_tg_path = _cgroup_v1_cpu_query_path + "/" + std::to_string(_wg_id); - if (access(_cgroup_v1_cpu_tg_path.c_str(), F_OK) != 0) { - int ret = mkdir(_cgroup_v1_cpu_tg_path.c_str(), S_IRWXU); - if (ret != 0) { - LOG(ERROR) << "cgroup v1 mkdir workload group failed, path=" << _cgroup_v1_cpu_tg_path; - return Status::InternalError("cgroup v1 mkdir workload group failed, path=", - _cgroup_v1_cpu_tg_path); - } - } - - // quota file - _cgroup_v1_cpu_tg_quota_file = _cgroup_v1_cpu_tg_path + "/cpu.cfs_quota_us"; - // cpu.shares file - _cgroup_v1_cpu_tg_shares_file = _cgroup_v1_cpu_tg_path + "/cpu.shares"; - // task file - _cgroup_v1_cpu_tg_task_file = _cgroup_v1_cpu_tg_path + "/tasks"; - LOG(INFO) << "cgroup v1 cpu path init success" - << ", query tg path=" << _cgroup_v1_cpu_tg_path - << ", query tg quota file path=" << _cgroup_v1_cpu_tg_quota_file - << ", query tg tasks file path=" << _cgroup_v1_cpu_tg_task_file - << ", core num=" << _cpu_core_num; - _init_succ = true; - return Status::OK(); -} - -Status CgroupV1CpuCtl::modify_cg_cpu_soft_limit_no_lock(int cpu_shares) { - std::string msg = "modify cpu shares to " + std::to_string(cpu_shares); - return CgroupCpuCtl::write_cg_sys_file(_cgroup_v1_cpu_tg_shares_file, cpu_shares, msg, false); -} - -Status CgroupV1CpuCtl::modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) { - int val = cpu_hard_limit > 0 ? (_cpu_cfs_period_us * _cpu_core_num * cpu_hard_limit / 100) - : CGROUP_CPU_HARD_LIMIT_DEFAULT_VALUE; - std::string msg = "modify cpu quota value to " + std::to_string(val); - return CgroupCpuCtl::write_cg_sys_file(_cgroup_v1_cpu_tg_quota_file, val, msg, false); -} - -Status CgroupV1CpuCtl::add_thread_to_cgroup() { +Status CgroupCpuCtl::add_thread_to_cgroup(std::string task_path) { if (!_init_succ) { return Status::OK(); } @@ -189,18 +193,17 @@ std::string msg = "add thread " + std::to_string(tid) + " to group" + " " + std::to_string(_wg_id); std::lock_guard w_lock(_lock_mutex); - return CgroupCpuCtl::write_cg_sys_file(_cgroup_v1_cpu_tg_task_file, tid, msg, true); + return CgroupCpuCtl::write_cg_sys_file(task_path, std::to_string(tid), msg, true); #endif } -Status CgroupV1CpuCtl::delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids) { - if (!_init_succ) { - return Status::InternalError( - "cgroup cpu ctl init failed, delete can not be executed"); +Status CgroupCpuCtl::delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids) { + if (!_is_cgroup_query_path_valid) { + return Status::InternalError("not find a valid cgroup query path"); } // 1 get unused wg id std::set<uint64_t> unused_wg_ids; - for (const auto& entry : std::filesystem::directory_iterator(_cgroup_v1_cpu_query_path)) { + for (const auto& entry : std::filesystem::directory_iterator(_doris_cgroup_cpu_query_path)) { const std::string dir_name = entry.path().string(); struct stat st; // == 0 means exists @@ -222,9 +225,9 @@ // 2 delete unused cgroup path int failed_count = 0; - std::string query_path = _cgroup_v1_cpu_query_path.back() != '/' - ? _cgroup_v1_cpu_query_path + "/" - : _cgroup_v1_cpu_query_path; + std::string query_path = _doris_cgroup_cpu_query_path.back() != '/' ? 
_doris_cgroup_cpu_query_path + "/" + : _doris_cgroup_cpu_query_path; for (const std::string& unused_wg_id : unused_wg_ids) { std::string wg_path = query_path + unused_wg_id; int ret = rmdir(wg_path.c_str()); @@ -240,4 +243,157 @@ Status CgroupV1CpuCtl::delete_unused_cgroup_path(std::set& used_wg_ids return Status::OK(); } +Status CgroupV1CpuCtl::init() { + if (!_is_cgroup_query_path_valid) { + return Status::InternalError("cgroup query path is not valid"); + } + + if (_wg_id <= 0) { + return Status::InternalError("find an invalid wg_id {}", _wg_id); + } + + // workload group path + _cgroup_v1_cpu_tg_path = _doris_cgroup_cpu_query_path + "/" + std::to_string(_wg_id); + if (access(_cgroup_v1_cpu_tg_path.c_str(), F_OK) != 0) { + int ret = mkdir(_cgroup_v1_cpu_tg_path.c_str(), S_IRWXU); + if (ret != 0) { + LOG(ERROR) << "cgroup v1 mkdir workload group failed, path=" << _cgroup_v1_cpu_tg_path; + return Status::InternalError("cgroup v1 mkdir workload group failed, path=", + _cgroup_v1_cpu_tg_path); + } + } + + _cgroup_v1_cpu_tg_quota_file = _cgroup_v1_cpu_tg_path + "/cpu.cfs_quota_us"; + if (access(_cgroup_v1_cpu_tg_quota_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v1 cpu.cfs_quota_us file"); + } + _cgroup_v1_cpu_tg_shares_file = _cgroup_v1_cpu_tg_path + "/cpu.shares"; + if (access(_cgroup_v1_cpu_tg_shares_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v1 cpu.shares file"); + } + _cgroup_v1_cpu_tg_task_file = _cgroup_v1_cpu_tg_path + "/tasks"; + if (access(_cgroup_v1_cpu_tg_task_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v1 cpu.shares file"); + } + LOG(INFO) << "cgroup v1 cpu path init success" + << ", query tg path=" << _cgroup_v1_cpu_tg_path + << ", query wg quota file path=" << _cgroup_v1_cpu_tg_quota_file + << ", query wg share file path=" << _cgroup_v1_cpu_tg_shares_file + << ", query wg tasks file path=" << _cgroup_v1_cpu_tg_task_file + << ", core num=" << _cpu_core_num; + _init_succ = true; + return Status::OK(); +} + +Status CgroupV1CpuCtl::modify_cg_cpu_soft_limit_no_lock(int cpu_shares) { + std::string cpu_share_str = std::to_string(cpu_shares); + std::string msg = "modify cpu shares to " + cpu_share_str; + return CgroupCpuCtl::write_cg_sys_file(_cgroup_v1_cpu_tg_shares_file, cpu_share_str, msg, + false); +} + +Status CgroupV1CpuCtl::modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) { + int val = cpu_hard_limit > 0 ? 
(_cpu_cfs_period_us * _cpu_core_num * cpu_hard_limit / 100) + : CGROUP_CPU_HARD_LIMIT_DEFAULT_VALUE; + std::string str_val = std::to_string(val); + std::string msg = "modify cpu quota value to " + str_val; + return CgroupCpuCtl::write_cg_sys_file(_cgroup_v1_cpu_tg_quota_file, str_val, msg, false); +} + +Status CgroupV1CpuCtl::add_thread_to_cgroup() { + return CgroupCpuCtl::add_thread_to_cgroup(_cgroup_v1_cpu_tg_task_file); +} + +Status CgroupV2CpuCtl::init() { + if (!_is_cgroup_query_path_valid) { + return Status::InternalError("cgroup query path is not valid"); + } + + if (_wg_id <= 0) { + return Status::InternalError("find an invalid wg_id {}", _wg_id); + } + + // enable cpu controller for home path's child + _doris_cgroup_cpu_path_subtree_ctl_file = _doris_cgroup_cpu_path + "cgroup.subtree_control"; + if (access(_doris_cgroup_cpu_path_subtree_ctl_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v2 doris home's subtree control file"); + } + RETURN_IF_ERROR(enable_cpu_controller(_doris_cgroup_cpu_path_subtree_ctl_file)); + + // enable cpu controller for query path's child + _cgroup_v2_query_path_subtree_ctl_file = + _doris_cgroup_cpu_query_path + "/cgroup.subtree_control"; + if (access(_cgroup_v2_query_path_subtree_ctl_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v2 query path's subtree control file"); + } + RETURN_IF_ERROR(enable_cpu_controller(_cgroup_v2_query_path_subtree_ctl_file)); + + // wg path + _cgroup_v2_query_wg_path = _doris_cgroup_cpu_query_path + "/" + std::to_string(_wg_id); + if (access(_cgroup_v2_query_wg_path.c_str(), F_OK) != 0) { + int ret = mkdir(_cgroup_v2_query_wg_path.c_str(), S_IRWXU); + if (ret != 0) { + return Status::InternalError("cgroup v2 mkdir wg failed, path={}", + _cgroup_v2_query_wg_path); + } + } + + _cgroup_v2_query_wg_cpu_max_file = _cgroup_v2_query_wg_path + "/cpu.max"; + if (access(_cgroup_v2_query_wg_cpu_max_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v2 wg cpu.max file"); + } + + _cgroup_v2_query_wg_cpu_weight_file = _cgroup_v2_query_wg_path + "/cpu.weight"; + if (access(_cgroup_v2_query_wg_cpu_weight_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v2 wg cpu.weight file"); + } + + _cgroup_v2_query_wg_thread_file = _cgroup_v2_query_wg_path + "/cgroup.threads"; + if (access(_cgroup_v2_query_wg_thread_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v2 wg cgroup.threads file"); + } + + _cgroup_v2_query_wg_type_file = _cgroup_v2_query_wg_path + "/cgroup.type"; + if (access(_cgroup_v2_query_wg_type_file.c_str(), F_OK) != 0) { + return Status::InternalError("not find cgroup v2 wg cgroup.type file"); + } + RETURN_IF_ERROR(CgroupCpuCtl::write_cg_sys_file(_cgroup_v2_query_wg_type_file, "threaded", + "set cgroup type", false)); + + LOG(INFO) << "cgroup v2 cpu path init success" + << ", query wg path=" << _cgroup_v2_query_wg_path + << ", cpu.max file = " << _cgroup_v2_query_wg_cpu_max_file + << ", cgroup.threads file = " << _cgroup_v2_query_wg_thread_file + << ", core num=" << _cpu_core_num; + _init_succ = true; + return Status::OK(); +} + +Status CgroupV2CpuCtl::modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) { + std::string value = ""; + if (cpu_hard_limit > 0) { + uint64_t int_val = _cpu_cfs_period_us * _cpu_core_num * cpu_hard_limit / 100; + value = std::to_string(int_val) + " 100000"; + } else { + value = CGROUP_V2_CPU_HARD_LIMIT_DEFAULT_VALUE; + } + std::string msg = "modify cpu.max to [" + value + "]"; + 
return CgroupCpuCtl::write_cg_sys_file(_cgroup_v2_query_wg_cpu_max_file, value, msg, false); +} + +Status CgroupV2CpuCtl::modify_cg_cpu_soft_limit_no_lock(int cpu_weight) { + std::string cpu_weight_str = std::to_string(cpu_weight); + std::string msg = "modify cpu.weight to " + cpu_weight_str; + return CgroupCpuCtl::write_cg_sys_file(_cgroup_v2_query_wg_cpu_weight_file, cpu_weight_str, msg, + false); +} + +Status CgroupV2CpuCtl::add_thread_to_cgroup() { + return CgroupCpuCtl::add_thread_to_cgroup(_cgroup_v2_query_wg_thread_file); +} + +Status CgroupV2CpuCtl::enable_cpu_controller(std::string file) { + return CgroupCpuCtl::write_cg_sys_file(file, "+cpu", "set cpu controller", false); +} + } // namespace doris diff --git a/be/src/agent/cgroup_cpu_ctl.h b/be/src/agent/cgroup_cpu_ctl.h index b5f8d2d5d80e67c..5cc31ab40ed6fc8 100644 --- a/be/src/agent/cgroup_cpu_ctl.h +++ b/be/src/agent/cgroup_cpu_ctl.h @@ -30,14 +30,14 @@ namespace doris { // cgroup cpu.cfs_quota_us default value, it means disable cpu hard limit const static int CGROUP_CPU_HARD_LIMIT_DEFAULT_VALUE = -1; +const static std::string CGROUP_V2_CPU_HARD_LIMIT_DEFAULT_VALUE = "max 100000"; class CgroupCpuCtl { public: virtual ~CgroupCpuCtl() = default; - CgroupCpuCtl() = default; CgroupCpuCtl(uint64_t wg_id) { _wg_id = wg_id; } - virtual Status init(); + virtual Status init() = 0; virtual Status add_thread_to_cgroup() = 0; @@ -48,18 +48,36 @@ class CgroupCpuCtl { // for log void get_cgroup_cpu_info(uint64_t* cpu_shares, int* cpu_hard_limit); - virtual Status delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids) = 0; + static void init_doris_cgroup_path(); + + static Status delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids); + + static std::unique_ptr<CgroupCpuCtl> create_cgroup_cpu_ctl(uint64_t wg_id); + + static bool is_a_valid_cgroup_path(std::string cg_path); + + static uint64_t cpu_soft_limit_default_value(); protected: - Status write_cg_sys_file(std::string file_path, int value, std::string msg, bool is_append); + Status write_cg_sys_file(std::string file_path, std::string value, std::string msg, + bool is_append); virtual Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) = 0; virtual Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) = 0; - std::string _doris_cgroup_cpu_path; - uint64_t _cpu_core_num = CpuInfo::num_cores(); - uint64_t _cpu_cfs_period_us = 100000; + Status add_thread_to_cgroup(std::string task_file); + +protected: + inline static uint64_t _cpu_core_num; + const static uint64_t _cpu_cfs_period_us = 100000; + inline static std::string _doris_cgroup_cpu_path = ""; + inline static std::string _doris_cgroup_cpu_query_path = ""; + inline static bool _is_enable_cgroup_v1_in_env = false; + inline static bool _is_enable_cgroup_v2_in_env = false; + inline static bool _is_cgroup_query_path_valid = false; + +protected: int _cpu_hard_limit = 0; std::shared_mutex _lock_mutex; bool _init_succ = false; @@ -96,20 +114,67 @@ class CgroupCpuCtl { class CgroupV1CpuCtl : public CgroupCpuCtl { public: CgroupV1CpuCtl(uint64_t tg_id) : CgroupCpuCtl(tg_id) {} - CgroupV1CpuCtl() = default; Status init() override; Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) override; Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) override; Status add_thread_to_cgroup() override; - Status delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids) override; - private: - std::string _cgroup_v1_cpu_query_path; std::string _cgroup_v1_cpu_tg_path; // workload group path std::string _cgroup_v1_cpu_tg_quota_file; std::string _cgroup_v1_cpu_tg_shares_file; 
std::string _cgroup_v1_cpu_tg_task_file; }; +/* + NOTE: cgroup v2 directory structure + 1 root path: + /sys/fs/cgroup + + 2 doris home path: + /sys/fs/cgroup/{doris_home}/ + + 3 doris home subtree_control file: + /sys/fs/cgroup/{doris_home}/cgroup.subtree_control + + 4 query path: + /sys/fs/cgroup/{doris_home}/query/ + + 5 query path subtree_control file: + /sys/fs/cgroup/{doris_home}/query/cgroup.subtree_control + + 6 workload group path: + /sys/fs/cgroup/{doris_home}/query/{workload_group_id} + + 7 workload group cpu.max file: + /sys/fs/cgroup/{doris_home}/query/{workload_group_id}/cpu.max + + 8 workload group cpu.weight file: + /sys/fs/cgroup/{doris_home}/query/{workload_group_id}/cpu.weight + + 9 workload group cgroup type file: + /sys/fs/cgroup/{doris_home}/query/{workload_group_id}/cgroup.type + +*/ +class CgroupV2CpuCtl : public CgroupCpuCtl { +public: + CgroupV2CpuCtl(uint64_t tg_id) : CgroupCpuCtl(tg_id) {} + Status init() override; + Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) override; + Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) override; + Status add_thread_to_cgroup() override; + +private: + Status enable_cpu_controller(std::string file); + +private: + std::string _doris_cgroup_cpu_path_subtree_ctl_file; + std::string _cgroup_v2_query_path_subtree_ctl_file; + std::string _cgroup_v2_query_wg_path; + std::string _cgroup_v2_query_wg_cpu_max_file; + std::string _cgroup_v2_query_wg_cpu_weight_file; + std::string _cgroup_v2_query_wg_thread_file; + std::string _cgroup_v2_query_wg_type_file; +}; + } // namespace doris \ No newline at end of file diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index efd15d0711b268f..27921888774f9b3 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -104,6 +104,10 @@ std::unordered_map> s_task_signatur std::atomic_ulong s_report_version(time(nullptr) * 10000); +void increase_report_version() { + s_report_version.fetch_add(1, std::memory_order_relaxed); +} + // FIXME(plat1ko): Paired register and remove task info bool register_task_info(const TTaskType::type task_type, int64_t signature) { if (task_type == TTaskType::type::PUSH_STORAGE_POLICY || @@ -214,7 +218,7 @@ void alter_tablet(StorageEngine& engine, const TAgentTaskRequest& agent_task_req } if (status.ok()) { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); } // Return result to fe @@ -290,7 +294,7 @@ void alter_cloud_tablet(CloudStorageEngine& engine, const TAgentTaskRequest& age } if (status.ok()) { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); } // Return result to fe @@ -1005,13 +1009,6 @@ void report_task_callback(const TMasterInfo& master_info) { } void report_disk_callback(StorageEngine& engine, const TMasterInfo& master_info) { - // Random sleep 1~5 seconds before doing report. - // In order to avoid the problem that the FE receives many report requests at the same time - // and can not be processed. 
- if (config::report_random_wait) { - random_sleep(5); - } - TReportRequest request; request.__set_backend(BackendOptions::get_local_backend()); request.__isset.disks = true; @@ -1077,8 +1074,16 @@ request.__set_backend(BackendOptions::get_local_backend()); request.__isset.tablets = true; - uint64_t report_version = s_report_version; - engine.tablet_manager()->build_all_report_tablets_info(&request.tablets); + uint64_t report_version; + for (int i = 0; i < 5; i++) { + request.tablets.clear(); + report_version = s_report_version; + engine.tablet_manager()->build_all_report_tablets_info(&request.tablets); + if (report_version == s_report_version) { + break; + } + } + if (report_version < s_report_version) { // TODO llj This can only reduce the possibility for report error, but can't avoid it. // If FE create a tablet in FE meta and send CREATE task to this BE, the tablet may not be included in this @@ -1534,7 +1539,7 @@ .tag("tablet_id", create_tablet_req.tablet_id) .error(status); } else { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); // get path hash of the created tablet TabletSharedPtr tablet; { @@ -1629,7 +1634,7 @@ .tag("signature", req.signature) .tag("tablet_id", push_req.tablet_id) .tag("push_type", push_req.push_type); - ++s_report_version; + increase_report_version(); finish_task_request.__set_finish_tablet_infos(tablet_infos); } else { LOG_WARNING("failed to execute push task") @@ -1675,7 +1680,7 @@ .tag("signature", req.signature) .tag("tablet_id", push_req.tablet_id) .tag("push_type", push_req.push_type); - ++s_report_version; + increase_report_version(); auto& tablet_info = finish_task_request.finish_tablet_infos.emplace_back(); // Just need tablet_id tablet_info.tablet_id = push_req.tablet_id; @@ -1972,6 +1977,10 @@ void clone_callback(StorageEngine& engine, const TMasterInfo& master_info, LOG_INFO("successfully clone tablet") .tag("signature", req.signature) .tag("tablet_id", clone_req.tablet_id); + if (engine_task.is_new_tablet()) { + increase_report_version(); + finish_task_request.__set_report_version(s_report_version); + } finish_task_request.__set_finish_tablet_infos(tablet_infos); } @@ -2044,6 +2053,7 @@ void calc_delete_bitmap_callback(CloudStorageEngine& engine, const TAgentTaskReq finish_task_request.__set_signature(req.signature); finish_task_request.__set_report_version(s_report_version); finish_task_request.__set_error_tablet_ids(error_tablet_ids); + finish_task_request.__set_resp_partitions(calc_delete_bitmap_req.partitions); finish_task(finish_task_request); remove_task_info(req.task_type, req.signature); @@ -2058,10 +2068,12 @@ void clean_trash_callback(StorageEngine& engine, const TAgentTaskRequest& req) { } void clean_udf_cache_callback(const TAgentTaskRequest& req) { - LOG(INFO) << "clean udf cache start: " << req.clean_udf_cache_req.function_signature; - static_cast<void>( - JniUtil::clean_udf_class_load_cache(req.clean_udf_cache_req.function_signature)); - LOG(INFO) << "clean udf cache finish: " << req.clean_udf_cache_req.function_signature; + if (doris::config::enable_java_support) { + LOG(INFO) << "clean udf cache start: " << req.clean_udf_cache_req.function_signature; + static_cast<void>( + 
JniUtil::clean_udf_class_load_cache(req.clean_udf_cache_req.function_signature)); + LOG(INFO) << "clean udf cache finish: " << req.clean_udf_cache_req.function_signature; + } } } // namespace doris diff --git a/be/src/cloud/cloud_base_compaction.cpp b/be/src/cloud/cloud_base_compaction.cpp index 81c1d47b7461e5b..09bb6c4da7937e3 100644 --- a/be/src/cloud/cloud_base_compaction.cpp +++ b/be/src/cloud/cloud_base_compaction.cpp @@ -237,7 +237,13 @@ Status CloudBaseCompaction::execute_compact() { using namespace std::chrono; auto start = steady_clock::now(); - RETURN_IF_ERROR(CloudCompactionMixin::execute_compact()); + auto res = CloudCompactionMixin::execute_compact(); + if (!res.ok()) { + LOG(WARNING) << "fail to do " << compaction_name() << ". res=" << res + << ", tablet=" << _tablet->tablet_id() + << ", output_version=" << _output_version; + return res; + } LOG_INFO("finish CloudBaseCompaction, tablet_id={}, cost={}ms", _tablet->tablet_id(), duration_cast<milliseconds>(steady_clock::now() - start).count()) .tag("job_id", _uuid) diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index cc84dce1b588405..05fc463f08e3780 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -172,7 +172,13 @@ Status CloudCumulativeCompaction::execute_compact() { using namespace std::chrono; auto start = steady_clock::now(); - RETURN_IF_ERROR(CloudCompactionMixin::execute_compact()); + auto res = CloudCompactionMixin::execute_compact(); + if (!res.ok()) { + LOG(WARNING) << "fail to do " << compaction_name() << ". res=" << res + << ", tablet=" << _tablet->tablet_id() + << ", output_version=" << _output_version; + return res; + } LOG_INFO("finish CloudCumulativeCompaction, tablet_id={}, cost={}ms", _tablet->tablet_id(), duration_cast<milliseconds>(steady_clock::now() - start).count()) .tag("job_id", _uuid) diff --git a/be/src/cloud/cloud_cumulative_compaction_policy.cpp b/be/src/cloud/cloud_cumulative_compaction_policy.cpp index fc56f971cad522d..b8c4ee20cb2077c 100644 --- a/be/src/cloud/cloud_cumulative_compaction_policy.cpp +++ b/be/src/cloud/cloud_cumulative_compaction_policy.cpp @@ -268,6 +268,10 @@ int32_t CloudTimeSeriesCumulativeCompactionPolicy::pick_input_rowsets( continue; } return transient_size; + } else if ( + *compaction_score >= + config::compaction_max_rowset_count) { // If the number of rowsets is too large: FDB_ERROR_CODE_TXN_TOO_LARGE + return transient_size; + } } diff --git a/be/src/cloud/cloud_delete_task.cpp b/be/src/cloud/cloud_delete_task.cpp index 210e89b838de7d7..9c36e418d93bd60 100644 --- a/be/src/cloud/cloud_delete_task.cpp +++ b/be/src/cloud/cloud_delete_task.cpp @@ -105,7 +105,7 @@ Status CloudDeleteTask::execute(CloudStorageEngine& engine, const TPushReq& requ request.timeout, nullptr); } - return Status::OK(); + return st; } } // namespace doris diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp index 22f6689ff237822..b6c9aa318f387c0 100644 --- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp +++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp @@ -120,7 +120,7 @@ void CloudTabletCalcDeleteBitmapTask::set_compaction_stats(int64_t ms_base_compa int64_t ms_cumulative_compaction_cnt, int64_t ms_cumulative_point) { _ms_base_compaction_cnt = ms_base_compaction_cnt; - _ms_cumulative_compaction_cnt = ms_base_compaction_cnt; + _ms_cumulative_compaction_cnt = ms_cumulative_compaction_cnt; _ms_cumulative_point = 
ms_cumulative_point; } @@ -186,9 +186,10 @@ Status CloudTabletCalcDeleteBitmapTask::handle() const { std::shared_ptr<PartialUpdateInfo> partial_update_info; std::shared_ptr<PublishStatus> publish_status; int64_t txn_expiration; + TxnPublishInfo previous_publish_info; Status status = _engine.txn_delete_bitmap_cache().get_tablet_txn_info( _transaction_id, _tablet_id, &rowset, &delete_bitmap, &rowset_ids, &txn_expiration, - &partial_update_info, &publish_status); + &partial_update_info, &publish_status, &previous_publish_info); if (status != Status::OK()) { LOG(WARNING) << "failed to get tablet txn info. tablet_id=" << _tablet_id << ", txn_id=" << _transaction_id << ", status=" << status; @@ -204,8 +205,19 @@ txn_info.rowset_ids = rowset_ids; txn_info.partial_update_info = partial_update_info; txn_info.publish_status = publish_status; + txn_info.publish_info = {.publish_version = _version, + .base_compaction_cnt = _ms_base_compaction_cnt, + .cumulative_compaction_cnt = _ms_cumulative_compaction_cnt, + .cumulative_point = _ms_cumulative_point}; auto update_delete_bitmap_time_us = 0; - if (txn_info.publish_status && (*(txn_info.publish_status) == PublishStatus::SUCCEED)) { + if (txn_info.publish_status && (*(txn_info.publish_status) == PublishStatus::SUCCEED) && + _version == previous_publish_info.publish_version && + _ms_base_compaction_cnt == previous_publish_info.base_compaction_cnt && + _ms_cumulative_compaction_cnt == previous_publish_info.cumulative_compaction_cnt && + _ms_cumulative_point == previous_publish_info.cumulative_point) { + // if the version or compaction stats don't match, this is a retry and compactions or + // other loads have finished successfully on the same tablet. So the previous publish + // is stale and we should re-calculate the delete bitmap LOG(INFO) << "tablet=" << _tablet_id << ",txn=" << _transaction_id << ",publish_status=SUCCEED,not need to recalculate and update delete_bitmap."; } else { diff --git a/be/src/cloud/cloud_full_compaction.cpp b/be/src/cloud/cloud_full_compaction.cpp index 34186d357ecc34c..2e11891045c2505 100644 --- a/be/src/cloud/cloud_full_compaction.cpp +++ b/be/src/cloud/cloud_full_compaction.cpp @@ -149,7 +149,13 @@ Status CloudFullCompaction::execute_compact() { using namespace std::chrono; auto start = steady_clock::now(); - RETURN_IF_ERROR(CloudCompactionMixin::execute_compact()); + auto res = CloudCompactionMixin::execute_compact(); + if (!res.ok()) { + LOG(WARNING) << "fail to do " << compaction_name() << ". 
res=" << res + << ", tablet=" << _tablet->tablet_id() + << ", output_version=" << _output_version; + return res; + } LOG_INFO("finish CloudFullCompaction, tablet_id={}, cost={}ms", _tablet->tablet_id(), duration_cast(steady_clock::now() - start).count()) .tag("job_id", _uuid) diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 63e3d4f30818fa8..66e089c22e9d534 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -94,6 +94,7 @@ void CloudInternalServiceImpl::get_file_cache_meta_by_tablet_id( meta->set_rowset_id(rowset_id); meta->set_segment_id(segment_id); meta->set_file_name(file_name); + meta->set_file_size(rowset->rowset_meta()->segment_file_size(segment_id)); meta->set_offset(std::get<0>(tuple)); meta->set_size(std::get<1>(tuple)); meta->set_cache_type(cache_type_to_pb(std::get<2>(tuple))); diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index e743ea9b12c8ce5..2599a8c7b7661c9 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -329,6 +329,9 @@ Status retry_rpc(std::string_view op_name, const Request& req, Response* res, error_msg = cntl.ErrorText(); } else if (res->status().code() == MetaServiceCode::OK) { return Status::OK(); + } else if (res->status().code() == MetaServiceCode::INVALID_ARGUMENT) { + return Status::Error("failed to {}: {}", op_name, + res->status().msg()); } else if (res->status().code() != MetaServiceCode::KV_TXN_CONFLICT) { return Status::Error("failed to {}: {}", op_name, res->status().msg()); @@ -559,14 +562,29 @@ bool CloudMetaMgr::sync_tablet_delete_bitmap_by_cache(CloudTablet* tablet, int64 } txn_processed.insert(txn_id); DeleteBitmapPtr tmp_delete_bitmap; - RowsetIdUnorderedSet tmp_rowset_ids; std::shared_ptr publish_status = std::make_shared(PublishStatus::INIT); CloudStorageEngine& engine = ExecEnv::GetInstance()->storage_engine().to_cloud(); Status status = engine.txn_delete_bitmap_cache().get_delete_bitmap( - txn_id, tablet->tablet_id(), &tmp_delete_bitmap, &tmp_rowset_ids, &publish_status); - if (status.ok() && *(publish_status.get()) == PublishStatus::SUCCEED) { - delete_bitmap->merge(*tmp_delete_bitmap); + txn_id, tablet->tablet_id(), &tmp_delete_bitmap, nullptr, &publish_status); + // CloudMetaMgr::sync_tablet_delete_bitmap_by_cache() is called after we sync rowsets from meta services. + // If the control flows reaches here, it's gauranteed that the rowsets is commited in meta services, so we can + // use the delete bitmap from cache directly if *publish_status == PublishStatus::SUCCEED without checking other + // stats(version or compaction stats) + if (status.ok() && *publish_status == PublishStatus::SUCCEED) { + // tmp_delete_bitmap contains sentinel marks, we should remove it before merge it to delete bitmap. 
+ // Also, the version of delete bitmap key in tmp_delete_bitmap is DeleteBitmap::TEMP_VERSION_COMMON, + // we should replace it with the rowset's real version + DCHECK(rs_meta.start_version() == rs_meta.end_version()); + int64_t rowset_version = rs_meta.start_version(); + for (const auto& [delete_bitmap_key, bitmap_value] : tmp_delete_bitmap->delete_bitmap) { + // skip sentinel mark, which is used for delete bitmap correctness check + if (std::get<1>(delete_bitmap_key) != DeleteBitmap::INVALID_SEGMENT_ID) { + delete_bitmap->merge({std::get<0>(delete_bitmap_key), + std::get<1>(delete_bitmap_key), rowset_version}, + bitmap_value); + } + } engine.txn_delete_bitmap_cache().remove_unused_tablet_txn_info(txn_id, tablet->tablet_id()); } else { diff --git a/be/src/cloud/cloud_rowset_writer.cpp b/be/src/cloud/cloud_rowset_writer.cpp index ad5c57fd21ee495..7753bf7b65b7bf0 100644 --- a/be/src/cloud/cloud_rowset_writer.cpp +++ b/be/src/cloud/cloud_rowset_writer.cpp @@ -115,6 +115,14 @@ Status CloudRowsetWriter::build(RowsetSharedPtr& rowset) { _rowset_meta->add_segments_file_size(seg_file_size.value()); } + if (auto idx_files_info = _idx_files_info.get_inverted_files_info(_segment_start_id); + !idx_files_info.has_value()) [[unlikely]] { + LOG(ERROR) << "expected inverted index files info, but none presents: " + << idx_files_info.error(); + } else { + _rowset_meta->add_inverted_index_files_info(idx_files_info.value()); + } + RETURN_NOT_OK_STATUS_WITH_WARN(RowsetFactory::create_rowset(rowset_schema, _context.tablet_path, _rowset_meta, &rowset), "rowset init failed when build new rowset"); diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index ed3e5f9433fcfdd..254a0d8d96646f3 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -283,9 +283,13 @@ Status CloudSchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParam } } - RETURN_IF_ERROR(sc_procedure->process(rs_reader, rowset_writer.get(), _new_tablet, - _base_tablet, _base_tablet_schema, - _new_tablet_schema)); + st = sc_procedure->process(rs_reader, rowset_writer.get(), _new_tablet, _base_tablet, + _base_tablet_schema, _new_tablet_schema); + if (!st.ok()) { + return Status::InternalError( + "failed to process schema change on rowset, version=[{}-{}], status={}", + rs_reader->version().first, rs_reader->version().second, st.to_string()); + } RowsetSharedPtr new_rowset; st = rowset_writer->build(new_rowset); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index de4bbac7b3ef6c5..d10c95d7d691862 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -38,6 +38,7 @@ #include "cloud/cloud_warm_up_manager.h" #include "cloud/config.h" #include "io/cache/block_file_cache_downloader.h" +#include "io/cache/block_file_cache_factory.h" #include "io/cache/file_cache_common.h" #include "io/fs/file_system.h" #include "io/fs/hdfs_file_system.h" @@ -48,6 +49,7 @@ #include "olap/memtable_flush_executor.h" #include "olap/storage_policy.h" #include "runtime/memory/cache_manager.h" +#include "util/parse_util.h" namespace doris { @@ -180,14 +182,21 @@ Status CloudStorageEngine::open() { // TODO(plat1ko): DeleteBitmapTxnManager _memtable_flush_executor = std::make_unique(); - // TODO(plat1ko): Use file cache disks number? 
- _memtable_flush_executor->init(1); + // Use file cache disks number + _memtable_flush_executor->init(io::FileCacheFactory::instance()->get_cache_instance_size()); _calc_delete_bitmap_executor = std::make_unique<CalcDeleteBitmapExecutor>(); _calc_delete_bitmap_executor->init(); - _txn_delete_bitmap_cache = - std::make_unique<CloudTxnDeleteBitmapCache>(config::delete_bitmap_agg_cache_capacity); + // The default cache is set to 100MB; use the memory limit for dynamic adjustment + bool is_percent = false; + int64_t delete_bitmap_agg_cache_cache_limit = + ParseUtil::parse_mem_spec(config::delete_bitmap_dynamic_agg_cache_limit, + MemInfo::mem_limit(), MemInfo::physical_mem(), &is_percent); + _txn_delete_bitmap_cache = std::make_unique<CloudTxnDeleteBitmapCache>( + delete_bitmap_agg_cache_cache_limit > config::delete_bitmap_agg_cache_capacity + ? delete_bitmap_agg_cache_cache_limit + : config::delete_bitmap_agg_cache_capacity); RETURN_IF_ERROR(_txn_delete_bitmap_cache->init()); _file_cache_block_downloader = std::make_unique<io::FileCacheBlockDownloader>(*this); @@ -219,6 +228,14 @@ void CloudStorageEngine::stop() { t->join(); } } + + if (_base_compaction_thread_pool) { + _base_compaction_thread_pool->shutdown(); + } + if (_cumu_compaction_thread_pool) { + _cumu_compaction_thread_pool->shutdown(); + } + LOG(INFO) << "Cloud storage engine is stopped."; } bool CloudStorageEngine::stopped() { diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 17ec1fe22b0d852..7f308ddb7bec7e1 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -246,6 +246,38 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ }, .download_done {}, }); + + auto download_idx_file = [&](const io::Path& idx_path) { + io::DownloadFileMeta meta { + .path = idx_path, + .file_size = -1, + .file_system = storage_resource.value()->fs, + .ctx = + { + .expiration_time = expiration_time, + }, + .download_done {}, + }; + _engine.file_cache_block_downloader().submit_download_task(std::move(meta)); + }; + auto schema_ptr = rowset_meta->tablet_schema(); + auto idx_version = schema_ptr->get_inverted_index_storage_format(); + if (idx_version == InvertedIndexStorageFormatPB::V1) { + for (const auto& index : schema_ptr->indexes()) { + if (index.index_type() == IndexType::INVERTED) { + auto idx_path = storage_resource.value()->remote_idx_v1_path( + *rowset_meta, seg_id, index.index_id(), + index.get_index_suffix()); + download_idx_file(idx_path); + } + } + } else if (idx_version == InvertedIndexStorageFormatPB::V2) { + if (schema_ptr->has_inverted_index()) { + auto idx_path = storage_resource.value()->remote_idx_v2_path( + *rowset_meta, seg_id); + download_idx_file(idx_path); + } + } } #endif } @@ -648,8 +680,13 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t tx RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap( *this, txn_id, COMPACTION_DELETE_BITMAP_LOCK_ID, new_delete_bitmap.get())); + + // store the delete bitmap with sentinel marks in txn_delete_bitmap_cache because if the txn is retried for some reason, + // it will use the delete bitmap from txn_delete_bitmap_cache when re-calculating the delete bitmap, during which it will do + // delete bitmap correctness check. 
If we store the new_delete_bitmap, the delete bitmap correctness check will fail + _engine.txn_delete_bitmap_cache().update_tablet_txn_info(txn_id, tablet_id(), delete_bitmap, + cur_rowset_ids, PublishStatus::SUCCEED, + txn_info->publish_info); return Status::OK(); } diff --git a/be/src/cloud/cloud_tablet_hotspot.cpp b/be/src/cloud/cloud_tablet_hotspot.cpp index ae8b3a54d2b6cfb..dd197268646fbc2 100644 --- a/be/src/cloud/cloud_tablet_hotspot.cpp +++ b/be/src/cloud/cloud_tablet_hotspot.cpp @@ -89,20 +89,20 @@ void TabletHotspot::get_top_n_hot_partition(std::vector* hot_t hot_partition.qpd = std::max(hot_partition.qpd, counter->qpd()); hot_partition.qpw = std::max(hot_partition.qpw, counter->qpw()); hot_partition.last_access_time = - std::max(hot_partition.last_access_time, - std::chrono::duration_cast( - counter->last_access_time.time_since_epoch()) - .count()); + std::max(hot_partition.last_access_time, + std::chrono::duration_cast( - counter->last_access_time.time_since_epoch()) - .count()); hmm + counter->last_access_time.time_since_epoch()) + .count()); } else if (counter->qpw() != 0) { auto& hot_partition = week_hot_partitions[std::make_pair( counter->table_id, counter->index_id)][counter->partition_id]; hot_partition.qpd = 0; hot_partition.qpw = std::max(hot_partition.qpw, counter->qpw()); hot_partition.last_access_time = - std::max(hot_partition.last_access_time, - std::chrono::duration_cast( - counter->last_access_time.time_since_epoch()) - .count()); + std::max(hot_partition.last_access_time, + std::chrono::duration_cast( + counter->last_access_time.time_since_epoch()) + .count()); } } }); diff --git a/be/src/cloud/cloud_tablets_channel.cpp b/be/src/cloud/cloud_tablets_channel.cpp index e063ab68116bb26..85b8e3ea33a8650 100644 --- a/be/src/cloud/cloud_tablets_channel.cpp +++ b/be/src/cloud/cloud_tablets_channel.cpp @@ -59,15 +59,20 @@ Status CloudTabletsChannel::add_batch(const PTabletWriterAddBlockRequest& reques _build_tablet_to_rowidxs(request, &tablet_to_rowidxs); std::unordered_set partition_ids; - for (auto& [tablet_id, _] : tablet_to_rowidxs) { - auto tablet_writer_it = _tablet_writers.find(tablet_id); - if (tablet_writer_it == _tablet_writers.end()) { - return Status::InternalError("unknown tablet to append data, tablet={}", tablet_id); + { + // add_batch may run concurrently with inc_open, which is not under _lock, + // so we need to protect it with _tablet_writers_lock. + std::lock_guard l(_tablet_writers_lock); + for (auto& [tablet_id, _] : tablet_to_rowidxs) { + auto tablet_writer_it = _tablet_writers.find(tablet_id); + if (tablet_writer_it == _tablet_writers.end()) { + return Status::InternalError("unknown tablet to append data, tablet={}", tablet_id); + } + partition_ids.insert(tablet_writer_it->second->partition_id()); + } + if (!partition_ids.empty()) { + RETURN_IF_ERROR(_init_writers_by_partition_ids(partition_ids)); } - partition_ids.insert(tablet_writer_it->second->partition_id()); - } - if (!partition_ids.empty()) { - RETURN_IF_ERROR(_init_writers_by_partition_ids(partition_ids)); - } } return _write_block_data(request, cur_seq, tablet_to_rowidxs, response); @@ -124,7 +129,7 @@ Status CloudTabletsChannel::close(LoadChannel* parent, const PTabletWriterAddBlo _state = kFinished; // All senders are closed - // 1. close all delta writers + // 1. close all delta writers, under _lock. 
std::vector writers_to_commit; writers_to_commit.reserve(_tablet_writers.size()); bool success = true; diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp index 583992e76f7aba7..c6a3b54edc3f67f 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp @@ -27,6 +27,7 @@ #include "cpp/sync_point.h" #include "olap/olap_common.h" #include "olap/tablet_meta.h" +#include "olap/txn_manager.h" namespace doris { @@ -54,7 +55,7 @@ Status CloudTxnDeleteBitmapCache::get_tablet_txn_info( TTransactionId transaction_id, int64_t tablet_id, RowsetSharedPtr* rowset, DeleteBitmapPtr* delete_bitmap, RowsetIdUnorderedSet* rowset_ids, int64_t* txn_expiration, std::shared_ptr<PartialUpdateInfo>* partial_update_info, - std::shared_ptr<PublishStatus>* publish_status) { + std::shared_ptr<PublishStatus>* publish_status, TxnPublishInfo* previous_publish_info) { { std::shared_lock rlock(_rwlock); TxnKey key(transaction_id, tablet_id); @@ -68,6 +69,7 @@ *txn_expiration = iter->second.txn_expiration; *partial_update_info = iter->second.partial_update_info; *publish_status = iter->second.publish_status; + *previous_publish_info = iter->second.publish_info; } RETURN_IF_ERROR( get_delete_bitmap(transaction_id, tablet_id, delete_bitmap, rowset_ids, nullptr)); @@ -96,7 +98,9 @@ handle == nullptr ? nullptr : reinterpret_cast(value(handle)); if (val) { *delete_bitmap = val->delete_bitmap; - *rowset_ids = val->rowset_ids; + if (rowset_ids) { + *rowset_ids = val->rowset_ids; + } // must call release handle to reduce the reference count, // otherwise there will be memory leak release(handle); @@ -153,12 +157,17 @@ void CloudTxnDeleteBitmapCache::update_tablet_txn_info(TTransactionId transactio int64_t tablet_id, DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& rowset_ids, - PublishStatus publish_status) { + PublishStatus publish_status, + TxnPublishInfo publish_info) { { std::unique_lock wlock(_rwlock); TxnKey txn_key(transaction_id, tablet_id); - CHECK(_txn_map.count(txn_key) > 0); - *(_txn_map[txn_key].publish_status.get()) = publish_status; + CHECK(_txn_map.contains(txn_key)); + TxnVal& txn_val = _txn_map[txn_key]; + *(txn_val.publish_status) = publish_status; + if (publish_status == PublishStatus::SUCCEED) { + txn_val.publish_info = publish_info; + } } std::string key_str = fmt::format("{}/{}", transaction_id, tablet_id); CacheKey key(key_str); diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.h b/be/src/cloud/cloud_txn_delete_bitmap_cache.h index 5012db6b8e5bf3f..75577ae2e3fee0a 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.h +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.h @@ -42,7 +42,8 @@ class CloudTxnDeleteBitmapCache : public LRUCachePolicyTrackingManual { RowsetSharedPtr* rowset, DeleteBitmapPtr* delete_bitmap, RowsetIdUnorderedSet* rowset_ids, int64_t* txn_expiration, std::shared_ptr<PartialUpdateInfo>* partial_update_info, - std::shared_ptr<PublishStatus>* publish_status); + std::shared_ptr<PublishStatus>* publish_status, + TxnPublishInfo* previous_publish_info); void set_tablet_txn_info(TTransactionId transaction_id, int64_t tablet_id, DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& rowset_ids, @@ -52,12 +53,16 @@ class CloudTxnDeleteBitmapCache : public LRUCachePolicyTrackingManual { void update_tablet_txn_info(TTransactionId transaction_id, int64_t tablet_id, DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& rowset_ids, - PublishStatus 
publish_status); + PublishStatus publish_status, TxnPublishInfo publish_info = {}); void remove_expired_tablet_txn_info(); void remove_unused_tablet_txn_info(TTransactionId transaction_id, int64_t tablet_id); + // !!!ATTENTION!!!: the delete bitmap stored in CloudTxnDeleteBitmapCache contains sentinel marks, + // and the version in BitmapKey is DeleteBitmap::TEMP_VERSION_COMMON. + // when using a delete bitmap from this cache, the caller should manually remove these marks if they are not needed, + // and should replace the versions in BitmapKey with the correct version Status get_delete_bitmap(TTransactionId transaction_id, int64_t tablet_id, DeleteBitmapPtr* delete_bitmap, RowsetIdUnorderedSet* rowset_ids, std::shared_ptr* publish_status); @@ -88,6 +93,8 @@ class CloudTxnDeleteBitmapCache : public LRUCachePolicyTrackingManual { int64_t txn_expiration; std::shared_ptr partial_update_info; std::shared_ptr publish_status = nullptr; + // used to determine if the retry needs to re-calculate the delete bitmap + TxnPublishInfo publish_info; TxnVal() : txn_expiration(0) {}; TxnVal(RowsetSharedPtr rowset_, int64_t txn_expiration_, std::shared_ptr partial_update_info_, diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 6324f7b23c9ea9d..07beeaeb078a464 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -49,6 +49,16 @@ CloudWarmUpManager::~CloudWarmUpManager() { } } +std::unordered_map snapshot_rs_metas(BaseTablet* tablet) { + std::unordered_map id_to_rowset_meta_map; + auto visitor = [&id_to_rowset_meta_map](const RowsetSharedPtr& r) { + id_to_rowset_meta_map.emplace(r->rowset_meta()->rowset_id().to_string(), r->rowset_meta()); + }; + constexpr bool include_stale = false; + tablet->traverse_rowsets(visitor, include_stale); + return id_to_rowset_meta_map; +} + void CloudWarmUpManager::handle_jobs() { #ifndef BE_TEST constexpr int WAIT_TIME_SECONDS = 600; @@ -78,7 +88,7 @@ void CloudWarmUpManager::handle_jobs() { std::shared_ptr wait = std::make_shared(0); auto tablet_meta = tablet->tablet_meta(); - auto rs_metas = tablet_meta->snapshot_rs_metas(); + auto rs_metas = snapshot_rs_metas(tablet.get()); for (auto& [_, rs] : rs_metas) { for (int64_t seg_id = 0; seg_id < rs->num_segments(); seg_id++) { auto storage_resource = rs->remote_storage_resource(); @@ -114,6 +124,45 @@ void CloudWarmUpManager::handle_jobs() { wait->signal(); }, }); + + auto download_idx_file = [&](const io::Path& idx_path) { + io::DownloadFileMeta meta { + .path = idx_path, + .file_size = -1, + .file_system = storage_resource.value()->fs, + .ctx = + { + .expiration_time = expiration_time, + }, + .download_done = + [wait](Status st) { + if (!st) { + LOG_WARNING("Warm up error ").error(st); + } + wait->signal(); + }, + }; + _engine.file_cache_block_downloader().submit_download_task(std::move(meta)); + }; + auto schema_ptr = rs->tablet_schema(); + auto idx_version = schema_ptr->get_inverted_index_storage_format(); + if (idx_version == InvertedIndexStorageFormatPB::V1) { + for (const auto& index : schema_ptr->indexes()) { + if (index.index_type() == IndexType::INVERTED) { + wait->add_count(); + auto idx_path = storage_resource.value()->remote_idx_v1_path( + *rs, seg_id, index.index_id(), index.get_index_suffix()); + download_idx_file(idx_path); + } + } + } else if (idx_version == InvertedIndexStorageFormatPB::V2) { + if (schema_ptr->has_inverted_index()) { + wait->add_count(); + auto idx_path = + storage_resource.value()->remote_idx_v2_path(*rs, seg_id); +
download_idx_file(idx_path); + } + } } } timespec time; diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp index 80522759b84b449..82c466120e94fb6 100644 --- a/be/src/cloud/config.cpp +++ b/be/src/cloud/config.cpp @@ -35,7 +35,7 @@ DEFINE_Int64(tablet_cache_shards, "16"); DEFINE_mInt32(tablet_sync_interval_s, "1800"); DEFINE_mInt64(min_compaction_failure_interval_ms, "5000"); -DEFINE_mInt64(base_compaction_freeze_interval_s, "86400"); +DEFINE_mInt64(base_compaction_freeze_interval_s, "7200"); DEFINE_mInt64(cu_compaction_freeze_interval_s, "1200"); DEFINE_mInt64(cumu_compaction_interval_s, "1800"); @@ -48,6 +48,7 @@ DEFINE_mDouble(cumu_compaction_thread_num_factor, "0.5"); DEFINE_mInt32(check_auto_compaction_interval_seconds, "5"); DEFINE_mInt32(max_base_compaction_task_num_per_disk, "2"); DEFINE_mBool(prioritize_query_perf_in_compaction, "false"); +DEFINE_mInt32(compaction_max_rowset_count, "10000"); DEFINE_mInt32(refresh_s3_info_interval_s, "60"); DEFINE_mInt32(vacuum_stale_rowsets_interval_s, "300"); diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h index bf041ba0fa6fc5a..02e7014801e5668 100644 --- a/be/src/cloud/config.h +++ b/be/src/cloud/config.h @@ -79,6 +79,7 @@ DECLARE_mDouble(cumu_compaction_thread_num_factor); DECLARE_mInt32(check_auto_compaction_interval_seconds); DECLARE_mInt32(max_base_compaction_task_num_per_disk); DECLARE_mBool(prioritize_query_perf_in_compaction); +DECLARE_mInt32(compaction_max_rowset_count); // CloudStorageEngine config DECLARE_mInt32(refresh_s3_info_interval_s); diff --git a/be/src/cloud/injection_point_action.cpp b/be/src/cloud/injection_point_action.cpp index d5a13238837b816..be90ee23afddae2 100644 --- a/be/src/cloud/injection_point_action.cpp +++ b/be/src/cloud/injection_point_action.cpp @@ -108,6 +108,22 @@ void register_suites() { sp->set_call_back("VOlapTableSink::close", [](auto&&) { std::this_thread::sleep_for(std::chrono::seconds(5)); }); }); + suite_map.emplace("test_file_segment_cache_corruption", [] { + auto* sp = SyncPoint::get_instance(); + sp->set_call_back("Segment::open:corruption", [](auto&& args) { + LOG(INFO) << "injection Segment::open:corruption"; + auto* arg0 = try_any_cast(args[0]); + *arg0 = Status::Corruption("test_file_segment_cache_corruption injection error"); + }); + }); + suite_map.emplace("test_file_segment_cache_corruption1", [] { + auto* sp = SyncPoint::get_instance(); + sp->set_call_back("Segment::open:corruption1", [](auto&& args) { + LOG(INFO) << "injection Segment::open:corruption1"; + auto* arg0 = try_any_cast(args[0]); + *arg0 = Status::Corruption("test_file_segment_cache_corruption injection error"); + }); + }); } void set_sleep(const std::string& point, HttpRequest* req) { @@ -215,6 +231,7 @@ void handle_set(HttpRequest* req) { void handle_clear(HttpRequest* req) { const auto& point = req->param("name"); auto* sp = SyncPoint::get_instance(); + LOG(INFO) << "clear injection point : " << (point.empty() ? 
"(all points)" : point); if (point.empty()) { // If point name is emtpy, clear all sp->clear_all_call_backs(); @@ -257,7 +274,7 @@ void handle_disable(HttpRequest* req) { InjectionPointAction::InjectionPointAction() = default; void InjectionPointAction::handle(HttpRequest* req) { - LOG(INFO) << req->debug_string(); + LOG(INFO) << "handle InjectionPointAction " << req->debug_string(); auto& op = req->param("op"); if (op == "set") { handle_set(req); diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index 24bdadead33c1e3..d5342186541ea42 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -82,6 +82,8 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in) } out->set_enable_segments_file_size(in.enable_segments_file_size()); out->set_has_variant_type_in_schema(in.has_has_variant_type_in_schema()); + out->set_enable_inverted_index_file_info(in.enable_inverted_index_file_info()); + out->mutable_inverted_index_file_info()->CopyFrom(in.inverted_index_file_info()); } void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { @@ -132,6 +134,8 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { } out->set_enable_segments_file_size(in.enable_segments_file_size()); out->set_has_variant_type_in_schema(in.has_variant_type_in_schema()); + out->set_enable_inverted_index_file_info(in.enable_inverted_index_file_info()); + out->mutable_inverted_index_file_info()->Swap(in.mutable_inverted_index_file_info()); } RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in) { @@ -190,6 +194,8 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) out->set_schema_version(in.schema_version()); } out->set_enable_segments_file_size(in.enable_segments_file_size()); + out->set_enable_inverted_index_file_info(in.enable_inverted_index_file_info()); + out->mutable_inverted_index_file_info()->CopyFrom(in.inverted_index_file_info()); } void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { @@ -237,6 +243,8 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { out->set_schema_version(in.schema_version()); } out->set_enable_segments_file_size(in.enable_segments_file_size()); + out->set_enable_inverted_index_file_info(in.enable_inverted_index_file_info()); + out->mutable_inverted_index_file_info()->Swap(in.mutable_inverted_index_file_info()); } TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB& in) { diff --git a/be/src/clucene b/be/src/clucene index 5db9db68e448b8c..fdbf2204031128b 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 5db9db68e448b8ccfd360d02666bbac44e6f8d1a +Subproject commit fdbf2204031128b2bd8505fc73c06403b7c1a815 diff --git a/be/src/common/cgroup_memory_ctl.cpp b/be/src/common/cgroup_memory_ctl.cpp new file mode 100644 index 000000000000000..a29432bdb4ede5e --- /dev/null +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CgroupsMemoryUsageObserver.cpp +// and modified by Doris + +#include "common/cgroup_memory_ctl.h" + +#include +#include +#include +#include + +#include "common/status.h" +#include "util/cgroup_util.h" + +namespace doris { + +// Is the memory controller of cgroups v2 enabled on the system? +// Assumes that cgroupsv2_enable() is enabled. +Status cgroupsv2_memory_controller_enabled(bool* ret) { +#if defined(OS_LINUX) + if (!CGroupUtil::cgroupsv2_enable()) { + return Status::CgroupError("cgroupsv2_enable is false"); + } + // According to https://docs.kernel.org/admin-guide/cgroup-v2.html, file "cgroup.controllers" defines which controllers are available + // for the current + child cgroups. The set of available controllers can be restricted from level to level using file + // "cgroups.subtree_control". It is therefore sufficient to check the bottom-most nested "cgroup.controllers" file. + std::string cgroup = CGroupUtil::cgroupv2_of_process(); + auto cgroup_dir = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); + std::ifstream controllers_file(cgroup_dir / "cgroup.controllers"); + if (!controllers_file.is_open()) { + *ret = false; + return Status::CgroupError("open cgroup.controllers failed"); + } + std::string controllers; + std::getline(controllers_file, controllers); + *ret = controllers.find("memory") != std::string::npos; + return Status::OK(); +#else + *ret = false; + return Status::CgroupError("cgroupsv2 only support Linux"); +#endif +} + +struct CgroupsV1Reader : CGroupMemoryCtl::ICgroupsReader { + explicit CgroupsV1Reader(std::filesystem::path mount_file_dir) + : _mount_file_dir(std::move(mount_file_dir)) {} + + Status read_memory_limit(int64_t* value) override { + RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file( + (_mount_file_dir / "memory.limit_in_bytes"), value)); + return Status::OK(); + } + + Status read_memory_usage(int64_t* value) override { + std::unordered_map metrics_map; + CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), + metrics_map); + *value = metrics_map["rss"]; + return Status::OK(); + } + +private: + std::filesystem::path _mount_file_dir; +}; + +struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { + explicit CgroupsV2Reader(std::filesystem::path mount_file_dir) + : _mount_file_dir(std::move(mount_file_dir)) {} + + Status read_memory_limit(int64_t* value) override { + RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file((_mount_file_dir / "memory.max"), + value)); + return Status::OK(); + } + + Status read_memory_usage(int64_t* value) override { + // memory.current contains a single number + // the reason why we subtract it described here: https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 + RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file( + (_mount_file_dir / "memory.current"), value)); + std::unordered_map metrics_map; + CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), + metrics_map); + if (*value < 
metrics_map["inactive_file"]) { + return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage"); + } + *value -= metrics_map["inactive_file"]; + return Status::OK(); + } + +private: + std::filesystem::path _mount_file_dir; +}; + +std::pair get_cgroups_path() { + bool enable_controller; + auto cgroupsv2_memory_controller_st = cgroupsv2_memory_controller_enabled(&enable_controller); + if (CGroupUtil::cgroupsv2_enable() && cgroupsv2_memory_controller_st.ok() && + enable_controller) { + auto v2_memory_stat_path = CGroupUtil::get_cgroupsv2_path("memory.stat"); + auto v2_memory_current_path = CGroupUtil::get_cgroupsv2_path("memory.current"); + auto v2_memory_max_path = CGroupUtil::get_cgroupsv2_path("memory.max"); + if (v2_memory_stat_path.has_value() && v2_memory_current_path.has_value() && + v2_memory_max_path.has_value() && v2_memory_stat_path == v2_memory_current_path && + v2_memory_current_path == v2_memory_max_path) { + return {*v2_memory_stat_path, CGroupUtil::CgroupsVersion::V2}; + } + } + + std::string cgroup_path; + auto st = CGroupUtil::find_abs_cgroupv1_path("memory", &cgroup_path); + if (st.ok()) { + return {cgroup_path, CGroupUtil::CgroupsVersion::V1}; + } + + return {"", CGroupUtil::CgroupsVersion::V1}; +} + +Status get_cgroups_reader(std::shared_ptr& reader) { + const auto [cgroup_path, version] = get_cgroups_path(); + if (cgroup_path.empty()) { + bool enable_controller; + auto st = cgroupsv2_memory_controller_enabled(&enable_controller); + return Status::CgroupError( + "Cannot find cgroups v1 or v2 current memory file, cgroupsv2_enable: {},{}, " + "cgroupsv2_memory_controller_enabled: {}, cgroupsv1_enable: {}", + CGroupUtil::cgroupsv2_enable(), enable_controller, st.to_string(), + CGroupUtil::cgroupsv1_enable()); + } + + if (version == CGroupUtil::CgroupsVersion::V2) { + reader = std::make_shared(cgroup_path); + } else { + reader = std::make_shared(cgroup_path); + } + return Status::OK(); +} + +Status CGroupMemoryCtl::find_cgroup_mem_limit(int64_t* bytes) { + std::shared_ptr reader; + RETURN_IF_ERROR(get_cgroups_reader(reader)); + RETURN_IF_ERROR(reader->read_memory_limit(bytes)); + return Status::OK(); +} + +Status CGroupMemoryCtl::find_cgroup_mem_usage(int64_t* bytes) { + std::shared_ptr reader; + RETURN_IF_ERROR(get_cgroups_reader(reader)); + RETURN_IF_ERROR(reader->read_memory_usage(bytes)); + return Status::OK(); +} + +std::string CGroupMemoryCtl::debug_string() { + const auto [cgroup_path, version] = get_cgroups_path(); + if (cgroup_path.empty()) { + bool enable_controller; + auto st = cgroupsv2_memory_controller_enabled(&enable_controller); + return fmt::format( + "Cannot find cgroups v1 or v2 current memory file, cgroupsv2_enable: {},{}, " + "cgroupsv2_memory_controller_enabled: {}, cgroupsv1_enable: {}", + CGroupUtil::cgroupsv2_enable(), enable_controller, st.to_string(), + CGroupUtil::cgroupsv1_enable()); + } + + int64_t mem_limit; + auto mem_limit_st = find_cgroup_mem_limit(&mem_limit); + + int64_t mem_usage; + auto mem_usage_st = find_cgroup_mem_usage(&mem_usage); + + return fmt::format( + "Process CGroup Memory Info (cgroups path: {}, cgroup version: {}): memory limit: " + "{}, " + "memory usage: {}", + cgroup_path, (version == CGroupUtil::CgroupsVersion::V1) ? "v1" : "v2", + mem_limit_st.ok() ? std::to_string(mem_limit) : mem_limit_st.to_string(), + mem_usage_st.ok() ? 
std::to_string(mem_usage) : mem_usage_st.to_string()); +} + +} // namespace doris diff --git a/be/src/common/cgroup_memory_ctl.h b/be/src/common/cgroup_memory_ctl.h new file mode 100644 index 000000000000000..83f33e03cda17e6 --- /dev/null +++ b/be/src/common/cgroup_memory_ctl.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "common/status.h" + +namespace doris { + +class CGroupMemoryCtl { +public: + // Inherited by cgroup v1 and v2 + struct ICgroupsReader { + virtual ~ICgroupsReader() = default; + + virtual Status read_memory_limit(int64_t* value) = 0; + + virtual Status read_memory_usage(int64_t* value) = 0; + }; + + // Determines the CGroup memory limit from the current processes' cgroup. + // If the limit is more than INT64_MAX, INT64_MAX is returned (since that is + // effectively unlimited anyway). Does not take into account memory limits + // set on any ancestor CGroups. + static Status find_cgroup_mem_limit(int64_t* bytes); + + // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command + static Status find_cgroup_mem_usage(int64_t* bytes); + + // Returns a human-readable string with information about CGroups. + static std::string debug_string(); +}; +} // namespace doris diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index f66a7dd17c5e093..03d4454ccdaa2f4 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -40,6 +40,7 @@ #include "common/config.h" #include "common/logging.h" #include "common/status.h" +#include "config.h" #include "io/fs/file_writer.h" #include "io/fs/local_file_system.h" #include "util/cpu_info.h" @@ -124,8 +125,8 @@ DEFINE_Int64(max_sys_mem_available_low_water_mark_bytes, "6871947673"); DEFINE_Int64(memtable_limiter_reserved_memory_bytes, "838860800"); // The size of the memory that gc wants to release each time, as a percentage of the mem limit. 
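// Illustrative only: how a percentage-valued config such as process_minor_gc_size
// ("5%") can be resolved into a byte count against the process mem limit.
// parse_percent_bytes is a hypothetical helper, not the actual Doris parser.
#include <cstdint>
#include <string>

int64_t parse_percent_bytes(const std::string& conf, int64_t mem_limit_bytes) {
    // e.g. "5%" against a 64 GiB limit -> roughly 3.2 GiB released per minor gc round
    double percent = std::stod(conf.substr(0, conf.size() - 1)) / 100.0;
    return static_cast<int64_t>(static_cast<double>(mem_limit_bytes) * percent);
}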
-DEFINE_mString(process_minor_gc_size, "10%"); -DEFINE_mString(process_full_gc_size, "20%"); +DEFINE_mString(process_minor_gc_size, "5%"); +DEFINE_mString(process_full_gc_size, "10%"); // If true, when the process does not exceed the soft mem limit, the query memory will not be limited; // when the process memory exceeds the soft mem limit, the query with the largest ratio between the currently @@ -137,9 +138,9 @@ DEFINE_mBool(disable_memory_gc, "false"); DEFINE_mBool(enable_stacktrace, "true"); -DEFINE_mBool(enable_stacktrace_in_allocator_check_failed, "false"); +DEFINE_mInt64(stacktrace_in_alloc_large_memory_bytes, "2147483648"); -DEFINE_mInt64(large_memory_check_bytes, "2147483648"); +DEFINE_mInt64(crash_in_alloc_large_memory_bytes, "-1"); DEFINE_mBool(enable_memory_orphan_check, "false"); @@ -199,8 +200,6 @@ DEFINE_Int32(release_snapshot_worker_count, "5"); DEFINE_mBool(report_random_wait, "true"); // the interval time(seconds) for agent report tasks signature to FE DEFINE_mInt32(report_task_interval_seconds, "10"); -// the interval time(seconds) for refresh storage policy from FE -DEFINE_mInt32(storage_refresh_storage_policy_task_interval_seconds, "5"); // the interval time(seconds) for agent report disk state to FE DEFINE_mInt32(report_disk_state_interval_seconds, "60"); // the interval time(seconds) for agent report olap table to FE @@ -232,14 +231,8 @@ DEFINE_String(log_buffer_level, ""); // number of threads available to serve backend execution requests DEFINE_Int32(be_service_threads, "64"); -// interval between profile reports; in seconds -DEFINE_mInt32(status_report_interval, "5"); // The pipeline task has a high concurrency, therefore reducing its report frequency DEFINE_mInt32(pipeline_status_report_interval, "10"); -// if true, each disk will have a separate thread pool for scanner -DEFINE_Bool(doris_enable_scanner_thread_pool_per_disk, "true"); -// the timeout of a work thread to wait the blocking priority queue to get a task -DEFINE_mInt64(doris_blocking_priority_queue_wait_timeout_ms, "500"); // number of scanner thread pool size for olap table // and the min thread num of remote scanner thread pool DEFINE_Int32(doris_scanner_thread_pool_thread_num, "-1"); @@ -264,33 +257,18 @@ DEFINE_mInt64(thrift_client_retry_interval_ms, "1000"); // max message size of thrift request // default: 100 * 1024 * 1024 DEFINE_mInt64(thrift_max_message_size, "104857600"); -// max row count number for single scan range, used in segmentv1 -DEFINE_mInt32(doris_scan_range_row_count, "524288"); // max bytes number for single scan range, used in segmentv2 DEFINE_mInt32(doris_scan_range_max_mb, "1024"); -// max bytes number for single scan block, used in segmentv2 -DEFINE_mInt32(doris_scan_block_max_mb, "67108864"); -// size of scanner queue between scanner thread and compute thread -DEFINE_mInt32(doris_scanner_queue_size, "1024"); // single read execute fragment row number DEFINE_mInt32(doris_scanner_row_num, "16384"); // single read execute fragment row bytes DEFINE_mInt32(doris_scanner_row_bytes, "10485760"); -DEFINE_mInt32(min_bytes_in_scanner_queue, "67108864"); -// number of max scan keys -DEFINE_mInt32(doris_max_scan_key_num, "48"); -// the max number of push down values of a single column. -// if exceed, no conditions will be pushed down for that column. 
-DEFINE_mInt32(max_pushdown_conditions_per_column, "1024"); // (Advanced) Maximum size of per-query receive-side buffer DEFINE_mInt32(exchg_node_buffer_size_bytes, "20485760"); DEFINE_mInt32(exchg_buffer_queue_capacity_factor, "64"); -DEFINE_mInt64(column_dictionary_key_ratio_threshold, "0"); -DEFINE_mInt64(column_dictionary_key_size_threshold, "0"); // memory_limitation_per_thread_for_schema_change_bytes unit bytes DEFINE_mInt64(memory_limitation_per_thread_for_schema_change_bytes, "2147483648"); -DEFINE_mInt64(memory_limitation_per_thread_for_storage_migration_bytes, "100000000"); DEFINE_mInt32(cache_prune_interval_sec, "10"); DEFINE_mInt32(cache_periodic_prune_stale_sweep_sec, "300"); @@ -346,7 +324,6 @@ DEFINE_mBool(disable_storage_page_cache, "false"); DEFINE_mBool(disable_storage_row_cache, "true"); // whether to disable pk page cache feature in storage DEFINE_Bool(disable_pk_storage_page_cache, "false"); -DEFINE_Bool(enable_non_pipeline, "false"); // Cache for mow primary key storage page size DEFINE_String(pk_storage_page_cache_limit, "10%"); @@ -555,14 +532,12 @@ DEFINE_mInt32(olap_table_sink_send_interval_microseconds, "1000"); DEFINE_mDouble(olap_table_sink_send_interval_auto_partition_factor, "0.001"); // Fragment thread pool -DEFINE_Int32(fragment_pool_thread_num_min, "64"); -DEFINE_Int32(fragment_pool_thread_num_max, "2048"); -DEFINE_Int32(fragment_pool_queue_size, "4096"); +DEFINE_Int32(fragment_mgr_asynic_work_pool_thread_num_min, "16"); +DEFINE_Int32(fragment_mgr_asynic_work_pool_thread_num_max, "512"); +DEFINE_Int32(fragment_mgr_asynic_work_pool_queue_size, "4096"); // Control the number of disks on the machine. If 0, this comes from the system settings. DEFINE_Int32(num_disks, "0"); -// The maximum number of the threads per disk is also the max queue depth per disk. -DEFINE_Int32(num_threads_per_disk, "0"); // The read size is the size of the reads sent to os. // There is a trade off of latency and throughput, trying to keep disks busy but // not introduce seeks. The literature seems to agree that with 8 MB reads, random @@ -596,7 +571,7 @@ DEFINE_mInt32(memory_maintenance_sleep_time_ms, "100"); // After full gc, no longer full gc and minor gc during sleep. // After minor gc, no minor gc during sleep, but full gc is possible.
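// Sketch of the sleep semantics described in the two comments above
// (illustrative; the real daemon thread tracks timestamps differently):
// a full gc suppresses both gc kinds until the sleep expires, while a minor
// gc suppresses only further minor gc, leaving full gc possible.
#include <cstdint>

struct GcThrottle {
    int64_t now_ms = 0;
    int64_t full_gc_blocked_until_ms = 0;
    int64_t minor_gc_blocked_until_ms = 0;

    void on_full_gc(int64_t sleep_ms) {
        full_gc_blocked_until_ms = minor_gc_blocked_until_ms = now_ms + sleep_ms;
    }
    void on_minor_gc(int64_t sleep_ms) { minor_gc_blocked_until_ms = now_ms + sleep_ms; }

    bool may_full_gc() const { return now_ms >= full_gc_blocked_until_ms; }
    bool may_minor_gc() const { return now_ms >= minor_gc_blocked_until_ms; }
};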
-DEFINE_mInt32(memory_gc_sleep_time_ms, "1000"); +DEFINE_mInt32(memory_gc_sleep_time_ms, "500"); // Sleep time in milliseconds between memtable flush mgr refresh iterations DEFINE_mInt64(memtable_mem_tracker_refresh_interval_ms, "5"); @@ -634,6 +609,8 @@ DEFINE_Int32(load_process_safe_mem_permit_percent, "5"); // result buffer cancelled time (unit: second) DEFINE_mInt32(result_buffer_cancelled_interval_time, "300"); +DEFINE_mInt32(arrow_flight_result_sink_buffer_size_rows, "32768"); + // the increased frequency of priority for remaining tasks in BlockingPriorityQueue DEFINE_mInt32(priority_queue_remaining_tasks_increased_frequency, "512"); @@ -812,14 +789,6 @@ DEFINE_Int32(load_stream_eagain_wait_seconds, "600"); DEFINE_Int32(load_stream_flush_token_max_tasks, "15"); // max wait flush token time in load stream DEFINE_Int32(load_stream_max_wait_flush_token_time_ms, "600000"); - -// max send batch parallelism for OlapTableSink -// The value set by the user for send_batch_parallelism is not allowed to exceed max_send_batch_parallelism_per_job, -// if exceed, the value of send_batch_parallelism would be max_send_batch_parallelism_per_job -DEFINE_mInt32(max_send_batch_parallelism_per_job, "5"); -DEFINE_Validator(max_send_batch_parallelism_per_job, - [](const int config) -> bool { return config >= 1; }); - // number of send batch thread pool size DEFINE_Int32(send_batch_thread_pool_thread_num, "64"); // number of send batch thread pool queue size @@ -897,16 +866,9 @@ DEFINE_mInt32(string_type_length_soft_limit_bytes, "1048576"); DEFINE_Validator(string_type_length_soft_limit_bytes, [](const int config) -> bool { return config > 0 && config <= 2147483643; }); -DEFINE_mInt32(jsonb_type_length_soft_limit_bytes, "1048576"); - -DEFINE_Validator(jsonb_type_length_soft_limit_bytes, - [](const int config) -> bool { return config > 0 && config <= 2147483643; }); - // Threshold of reading a small file into memory DEFINE_mInt32(in_memory_file_size, "1048576"); // 1MB -// ParquetReaderWrap prefetch buffer size -DEFINE_Int32(parquet_reader_max_buffer_size, "50"); // Max size of parquet page header in bytes DEFINE_mInt32(parquet_header_max_size_mb, "1"); // Max buffer size for parquet row group @@ -922,9 +884,8 @@ DEFINE_mInt32(orc_natural_read_size_mb, "8"); DEFINE_mInt64(big_column_size_buffer, "65535"); DEFINE_mInt64(small_column_size_buffer, "100"); -// When the rows number reached this limit, will check the filter rate the of bloomfilter -// if it is lower than a specific threshold, the predicate will be disabled. -DEFINE_mInt32(rf_predicate_check_row_num, "204800"); +// rf will decide whether the next sampling_frequency blocks need to be filtered based on the filtering rate of the current block.
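// A sketch of the sampling idea in the comment above (hypothetical and
// simplified; the actual runtime filter bookkeeping lives elsewhere): the
// filter rate observed on a sampled block decides whether the next
// runtime_filter_sampling_frequency blocks are filtered at all.
struct RuntimeFilterSampler {
    int sampling_frequency = 64; // mirrors runtime_filter_sampling_frequency
    int blocks_until_resample = 0;
    bool filter_enabled = true;

    // Returns whether the runtime filter should be applied to this block.
    bool should_filter() {
        if (blocks_until_resample > 0) {
            --blocks_until_resample;
            return filter_enabled;
        }
        return true; // resample: apply the filter and measure its effect
    }

    // Called with the fraction of rows the filter removed on a sampled block.
    void on_sampled(double filtered_ratio) {
        filter_enabled = filtered_ratio > 0.3; // threshold is an assumption
        blocks_until_resample = sampling_frequency;
    }
};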
+DEFINE_mInt32(runtime_filter_sampling_frequency, "64"); // cooldown task configs DEFINE_Int32(cooldown_thread_num, "5"); @@ -936,7 +897,8 @@ DEFINE_mInt32(cold_data_compaction_interval_sec, "1800"); DEFINE_String(tmp_file_dir, "tmp"); -DEFINE_Int32(s3_transfer_executor_pool_size, "2"); +DEFINE_Int32(min_s3_file_system_thread_num, "16"); +DEFINE_Int32(max_s3_file_system_thread_num, "64"); DEFINE_Bool(enable_time_lut, "true"); DEFINE_mBool(enable_simdjson_reader, "true"); @@ -1031,8 +993,6 @@ DEFINE_mInt32(index_cache_entry_stay_time_after_lookup_s, "1800"); DEFINE_mInt32(inverted_index_cache_stale_sweep_time_sec, "600"); // inverted index searcher cache size DEFINE_String(inverted_index_searcher_cache_limit, "10%"); -// set `true` to enable insert searcher into cache when write inverted index data -DEFINE_Bool(enable_write_index_searcher_cache, "true"); DEFINE_Bool(enable_inverted_index_cache_check_timestamp, "true"); DEFINE_Int32(inverted_index_fd_number_limit_percent, "40"); // 40% DEFINE_Int32(inverted_index_query_cache_shards, "256"); @@ -1082,10 +1042,10 @@ DEFINE_mInt32(schema_cache_capacity, "1024"); DEFINE_mInt32(schema_cache_sweep_time_sec, "100"); // max number of segment cache, default -1 for backward compatibility fd_number*2/5 -DEFINE_mInt32(segment_cache_capacity, "-1"); -DEFINE_mInt32(estimated_num_columns_per_segment, "200"); +DEFINE_Int32(segment_cache_capacity, "-1"); +DEFINE_Int32(segment_cache_fd_percentage, "40"); DEFINE_mInt32(estimated_mem_per_column_reader, "1024"); -DEFINE_mInt32(segment_cache_memory_percentage, "2"); +DEFINE_Int32(segment_cache_memory_percentage, "2"); // enable feature binlog, default false DEFINE_Bool(enable_feature_binlog, "false"); @@ -1337,7 +1297,7 @@ DEFINE_mInt64(compaction_batch_size, "-1"); // If set to false, the parquet reader will not use page index to filter data. // This is only for debug purpose, in case sometimes the page index // filter wrong data. -DEFINE_mBool(enable_parquet_page_index, "true"); +DEFINE_mBool(enable_parquet_page_index, "false"); DEFINE_mBool(ignore_not_found_file_in_external_table, "true"); @@ -1345,6 +1305,8 @@ DEFINE_mBool(enable_hdfs_mem_limiter, "true"); DEFINE_mInt16(topn_agg_limit_multiplier, "2"); +DEFINE_mInt64(pipeline_task_leakage_detect_period_secs, "60"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index fd38924f47e74ec..c371ad7ef3b23cd 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -186,13 +186,15 @@ DECLARE_mBool(disable_memory_gc); // if false, turn off all stacktrace DECLARE_mBool(enable_stacktrace); -// Allocator check failed log stacktrace if not catch exception -DECLARE_mBool(enable_stacktrace_in_allocator_check_failed); - -// malloc or new large memory larger than large_memory_check_bytes, default 2G, -// will print a warning containing the stacktrace, but not prevent memory alloc. -// If is -1, disable large memory check. -DECLARE_mInt64(large_memory_check_bytes); +// when allocating memory larger than stacktrace_in_alloc_large_memory_bytes, default 2G, +// a successful alloc will print a warning with the stacktrace, but the alloc is not prevented. +// a failed alloc under the Doris Allocator will print the stacktrace in the error log. +// if -1, printing stacktraces for large allocs is disabled. +DECLARE_mInt64(stacktrace_in_alloc_large_memory_bytes); +// allocating memory larger than crash_in_alloc_large_memory_bytes will crash; default -1 means disabled.
+// if you need a core dump to analyze large memory allocations, +// setting this parameter so that large allocations crash will help +DECLARE_mInt64(crash_in_alloc_large_memory_bytes); // default is true. if any memory tracking in Orphan mem tracker will report error. DECLARE_mBool(enable_memory_orphan_check); @@ -254,8 +256,6 @@ DECLARE_Int32(release_snapshot_worker_count); DECLARE_mBool(report_random_wait); // the interval time(seconds) for agent report tasks signature to FE DECLARE_mInt32(report_task_interval_seconds); -// the interval time(seconds) for refresh storage policy from FE -DECLARE_mInt32(storage_refresh_storage_policy_task_interval_seconds); // the interval time(seconds) for agent report disk state to FE DECLARE_mInt32(report_disk_state_interval_seconds); // the interval time(seconds) for agent report olap table to FE @@ -290,12 +290,7 @@ DECLARE_String(log_buffer_level); // number of threads available to serve backend execution requests DECLARE_Int32(be_service_threads); // interval between profile reports; in seconds -DECLARE_mInt32(status_report_interval); DECLARE_mInt32(pipeline_status_report_interval); -// if true, each disk will have a separate thread pool for scanner -DECLARE_Bool(doris_enable_scanner_thread_pool_per_disk); -// the timeout of a work thread to wait the blocking priority queue to get a task -DECLARE_mInt64(doris_blocking_priority_queue_wait_timeout_ms); // number of scanner thread pool size for olap table // and the min thread num of remote scanner thread pool DECLARE_mInt32(doris_scanner_thread_pool_thread_num); @@ -315,33 +310,18 @@ DECLARE_mInt64(thrift_client_retry_interval_ms); // max message size of thrift request // default: 100 * 1024 * 1024 DECLARE_mInt64(thrift_max_message_size); -// max row count number for single scan range, used in segmentv1 -DECLARE_mInt32(doris_scan_range_row_count); // max bytes number for single scan range, used in segmentv2 DECLARE_mInt32(doris_scan_range_max_mb); -// max bytes number for single scan block, used in segmentv2 -DECLARE_mInt32(doris_scan_block_max_mb); -// size of scanner queue between scanner thread and compute thread -DECLARE_mInt32(doris_scanner_queue_size); // single read execute fragment row number DECLARE_mInt32(doris_scanner_row_num); // single read execute fragment row bytes DECLARE_mInt32(doris_scanner_row_bytes); -DECLARE_mInt32(min_bytes_in_scanner_queue); -// number of max scan keys -DECLARE_mInt32(doris_max_scan_key_num); -// the max number of push down values of a single column. -// if exceed, no conditions will be pushed down for that column. -DECLARE_mInt32(max_pushdown_conditions_per_column); // (Advanced) Maximum size of per-query receive-side buffer DECLARE_mInt32(exchg_node_buffer_size_bytes); DECLARE_mInt32(exchg_buffer_queue_capacity_factor); -DECLARE_mInt64(column_dictionary_key_ratio_threshold); -DECLARE_mInt64(column_dictionary_key_size_threshold); // memory_limitation_per_thread_for_schema_change_bytes unit bytes DECLARE_mInt64(memory_limitation_per_thread_for_schema_change_bytes); -DECLARE_mInt64(memory_limitation_per_thread_for_storage_migration_bytes); // all cache prune interval, used by GC and periodic thread.
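// Rough sketch of how the two allocation thresholds declared above
// (stacktrace_in_alloc_large_memory_bytes, crash_in_alloc_large_memory_bytes)
// could gate a large allocation (illustrative; the real hook sits inside the
// Doris Allocator):
#include <cstdint>
#include <cstdlib>

void on_large_alloc(int64_t bytes, int64_t stacktrace_threshold, int64_t crash_threshold) {
    if (crash_threshold > 0 && bytes > crash_threshold) {
        std::abort(); // crash_in_alloc_large_memory_bytes: die so a core dump can be analyzed
    }
    if (stacktrace_threshold > 0 && bytes > stacktrace_threshold) {
        // stacktrace_in_alloc_large_memory_bytes: log a warning with the
        // current stacktrace here, but let the allocation proceed
    }
}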
DECLARE_mInt32(cache_prune_interval_sec); @@ -400,7 +380,6 @@ DECLARE_Bool(disable_storage_page_cache); DECLARE_mBool(disable_storage_row_cache); // whether to disable pk page cache feature in storage DECLARE_Bool(disable_pk_storage_page_cache); -DECLARE_Bool(enable_non_pipeline); // Cache for mow primary key storage page size, it's separated from // storage_page_cache_limit @@ -610,14 +589,12 @@ DECLARE_mInt32(olap_table_sink_send_interval_microseconds); DECLARE_mDouble(olap_table_sink_send_interval_auto_partition_factor); // Fragment thread pool -DECLARE_Int32(fragment_pool_thread_num_min); -DECLARE_Int32(fragment_pool_thread_num_max); -DECLARE_Int32(fragment_pool_queue_size); +DECLARE_Int32(fragment_mgr_asynic_work_pool_thread_num_min); +DECLARE_Int32(fragment_mgr_asynic_work_pool_thread_num_max); +DECLARE_Int32(fragment_mgr_asynic_work_pool_queue_size); // Control the number of disks on the machine. If 0, this comes from the system settings. DECLARE_Int32(num_disks); -// The maximum number of the threads per disk is also the max queue depth per disk. -DECLARE_Int32(num_threads_per_disk); // The read size is the size of the reads sent to os. // There is a trade off of latency and throughput, trying to keep disks busy but // not introduce seeks. The literature seems to agree that with 8 MB reads, random @@ -692,6 +669,9 @@ DECLARE_Int32(load_process_safe_mem_permit_percent); // result buffer cancelled time (unit: second) DECLARE_mInt32(result_buffer_cancelled_interval_time); +// arrow flight result sink buffer rows size, default 4096 * 8 +DECLARE_mInt32(arrow_flight_result_sink_buffer_size_rows); + // the increased frequency of priority for remaining tasks in BlockingPriorityQueue DECLARE_mInt32(priority_queue_remaining_tasks_increased_frequency); @@ -873,12 +853,6 @@ DECLARE_Int32(load_stream_eagain_wait_seconds); DECLARE_Int32(load_stream_flush_token_max_tasks); // max wait flush token time in load stream DECLARE_Int32(load_stream_max_wait_flush_token_time_ms); - -// max send batch parallelism for OlapTableSink -// The value set by the user for send_batch_parallelism is not allowed to exceed max_send_batch_parallelism_per_job, -// if exceed, the value of send_batch_parallelism would be max_send_batch_parallelism_per_job -DECLARE_mInt32(max_send_batch_parallelism_per_job); - // number of send batch thread pool size DECLARE_Int32(send_batch_thread_pool_thread_num); // number of send batch thread pool queue size @@ -955,13 +929,9 @@ DECLARE_String(rpc_load_balancer); // so we set a soft limit, default is 1MB DECLARE_mInt32(string_type_length_soft_limit_bytes); -DECLARE_mInt32(jsonb_type_length_soft_limit_bytes); - // Threshold of reading a small file into memory DECLARE_mInt32(in_memory_file_size); -// ParquetReaderWrap prefetch buffer size -DECLARE_Int32(parquet_reader_max_buffer_size); // Max size of parquet page header in bytes DECLARE_mInt32(parquet_header_max_size_mb); // Max buffer size for parquet row group @@ -980,9 +950,7 @@ DECLARE_mInt32(orc_natural_read_size_mb); DECLARE_mInt64(big_column_size_buffer); DECLARE_mInt64(small_column_size_buffer); -// When the rows number reached this limit, will check the filter rate the of bloomfilter -// if it is lower than a specific threshold, the predicate will be disabled.
-DECLARE_mInt32(rf_predicate_check_row_num); +DECLARE_mInt32(runtime_filter_sampling_frequency); // cooldown task configs DECLARE_Int32(cooldown_thread_num); @@ -992,7 +960,8 @@ DECLARE_mInt32(confirm_unused_remote_files_interval_sec); DECLARE_Int32(cold_data_compaction_thread_num); DECLARE_mInt32(cold_data_compaction_interval_sec); -DECLARE_Int32(s3_transfer_executor_pool_size); +DECLARE_Int32(min_s3_file_system_thread_num); +DECLARE_Int32(max_s3_file_system_thread_num); DECLARE_Bool(enable_time_lut); DECLARE_mBool(enable_simdjson_reader); @@ -1083,8 +1052,6 @@ DECLARE_mInt32(index_cache_entry_stay_time_after_lookup_s); DECLARE_mInt32(inverted_index_cache_stale_sweep_time_sec); // inverted index searcher cache size DECLARE_String(inverted_index_searcher_cache_limit); -// set `true` to enable insert searcher into cache when write inverted index data -DECLARE_Bool(enable_write_index_searcher_cache); DECLARE_Bool(enable_inverted_index_cache_check_timestamp); DECLARE_Int32(inverted_index_fd_number_limit_percent); // 50% DECLARE_Int32(inverted_index_query_cache_shards); @@ -1133,10 +1100,10 @@ DECLARE_mInt32(schema_cache_capacity); DECLARE_mInt32(schema_cache_sweep_time_sec); // max number of segment cache -DECLARE_mInt32(segment_cache_capacity); -DECLARE_mInt32(estimated_num_columns_per_segment); -DECLARE_mInt32(estimated_mem_per_column_reader); +DECLARE_Int32(segment_cache_capacity); +DECLARE_Int32(segment_cache_fd_percentage); DECLARE_Int32(segment_cache_memory_percentage); +DECLARE_mInt32(estimated_mem_per_column_reader); // enable binlog DECLARE_Bool(enable_feature_binlog); @@ -1440,6 +1407,8 @@ DECLARE_mBool(enable_hdfs_mem_limiter); // we should do agg limit opt DECLARE_mInt16(topn_agg_limit_multiplier); +DECLARE_mInt64(pipeline_task_leakage_detect_period_secs); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 7667820b83f84fc..d8245f4045ce81f 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -228,6 +228,7 @@ void Daemon::memory_maintenance_thread() { DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); } #endif + MemInfo::refresh_memory_bvar(); // Update and print memory stat when the memory changes by 256M. 
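// For reference: 268435456 == 256 * 1024 * 1024, i.e. the 256 MiB delta used
// in the check below.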
if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { @@ -387,16 +388,34 @@ void Daemon::je_purge_dirty_pages_thread() const { if (_stop_background_threads_latch.count() == 0) { break; } + if (config::disable_memory_gc) { + continue; + } doris::MemInfo::je_purge_all_arena_dirty_pages(); doris::MemInfo::je_purge_dirty_pages_notify.store(false, std::memory_order_relaxed); } while (true); } +void Daemon::cache_prune_stale_thread() { + int32_t interval = config::cache_periodic_prune_stale_sweep_sec; + while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { + if (interval <= 0) { + LOG(WARNING) << "config of cache clean interval is illegal: [" << interval + << "], force set to 3600 "; + interval = 3600; + } + if (config::disable_memory_gc) { + continue; + } + CacheManager::instance()->for_each_cache_prune_stale(); + } +} + void Daemon::wg_weighted_memory_ratio_refresh_thread() { // Refresh weighted memory ratio of workload groups while (!_stop_background_threads_latch.wait_for( std::chrono::milliseconds(config::wg_weighted_memory_ratio_refresh_interval_ms))) { - doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_ratio(); + doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); } } @@ -435,6 +454,11 @@ void Daemon::start() { st = Thread::create( "Daemon", "je_purge_dirty_pages_thread", [this]() { this->je_purge_dirty_pages_thread(); }, &_threads.emplace_back()); + CHECK(st.ok()) << st; + st = Thread::create( + "Daemon", "cache_prune_stale_thread", [this]() { this->cache_prune_stale_thread(); }, + &_threads.emplace_back()); + CHECK(st.ok()) << st; st = Thread::create( "Daemon", "query_runtime_statistics_thread", [this]() { this->report_runtime_query_statistics_thread(); }, &_threads.emplace_back()); diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h index 2a8adf20e4681ae..64c9f0c8993ae38 100644 --- a/be/src/common/daemon.h +++ b/be/src/common/daemon.h @@ -43,6 +43,7 @@ class Daemon { void memtable_memory_refresh_thread(); void calculate_metrics_thread(); void je_purge_dirty_pages_thread() const; + void cache_prune_stale_thread(); void report_runtime_query_statistics_thread(); void wg_weighted_memory_ratio_refresh_thread(); void be_proc_monitor_thread(); diff --git a/be/src/common/exception.h b/be/src/common/exception.h index b35ef7e8ff8fced..8d35ae4cb5fb695 100644 --- a/be/src/common/exception.h +++ b/be/src/common/exception.h @@ -20,10 +20,8 @@ #include #include -#include #include #include -#include #include #include #include @@ -131,3 +129,26 @@ inline const std::string& Exception::to_string() const { } \ } \ } while (0); + +#define HANDLE_EXCEPTION_IF_CATCH_EXCEPTION(stmt, exception_handler) \ + do { \ + try { \ + doris::enable_thread_catch_bad_alloc++; \ + Defer defer {[&]() { doris::enable_thread_catch_bad_alloc--; }}; \ + { \ + Status _status_ = (stmt); \ + if (UNLIKELY(!_status_.ok())) { \ + exception_handler(doris::Exception()); \ + return _status_; \ + } \ + } \ + } catch (const doris::Exception& e) { \ + exception_handler(e); \ + if (e.code() == doris::ErrorCode::MEM_ALLOC_FAILED) { \ + return Status::MemoryLimitExceeded(fmt::format( \ + "PreCatch error code:{}, {}, __FILE__:{}, __LINE__:{}, __FUNCTION__:{}", \ + e.code(), e.to_string(), __FILE__, __LINE__, __PRETTY_FUNCTION__)); \ + } \ + return Status::Error(e.code(), e.to_string()); \ + } \ + } while (0); diff --git a/be/src/common/status.cpp b/be/src/common/status.cpp index d17e18951c56154..cc6c10c29414de7 100644 
--- a/be/src/common/status.cpp +++ b/be/src/common/status.cpp @@ -34,6 +34,13 @@ void Status::to_thrift(TStatus* s) const { // << "The error code has to > 0 because TStatusCode need it > 0, it's actual value is " // << _code; s->status_code = (int16_t)_code > 0 ? (TStatusCode::type)_code : TStatusCode::INTERNAL_ERROR; + + if (_code == ErrorCode::VERSION_ALREADY_MERGED) { + s->status_code = TStatusCode::OLAP_ERR_VERSION_ALREADY_MERGED; + } else if (_code == ErrorCode::TABLE_NOT_FOUND) { + s->status_code = TStatusCode::TABLET_MISSING; + } + s->error_msgs.push_back(fmt::format("({})[{}]{}", BackendOptions::get_localhost(), code_as_string(), _err_msg ? _err_msg->_msg : "")); s->__isset.error_msgs = true; diff --git a/be/src/common/status.h b/be/src/common/status.h index 11c7c42ac994965..4e3aaf74d7bee0c 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -290,7 +290,8 @@ namespace ErrorCode { E(KEY_ALREADY_EXISTS, -7001, false); \ E(ENTRY_NOT_FOUND, -7002, false); \ E(INVALID_TABLET_STATE, -7211, false); \ - E(ROWSETS_EXPIRED, -7311, false); + E(ROWSETS_EXPIRED, -7311, false); \ + E(CGROUP_ERROR, -7411, false); // Define constexpr int error_code_name = error_code_value #define M(NAME, ERRORCODE, ENABLESTACKTRACE) constexpr int NAME = ERRORCODE; @@ -314,8 +315,8 @@ extern ErrorCodeState error_states[MAX_ERROR_CODE_DEFINE_NUM]; class ErrorCodeInitializer { public: ErrorCodeInitializer(int temp) : signal_value(temp) { - for (int i = 0; i < MAX_ERROR_CODE_DEFINE_NUM; ++i) { - error_states[i].error_code = 0; + for (auto& error_state : error_states) { + error_state.error_code = 0; } #define M(NAME, ENABLESTACKTRACE) \ error_states[TStatusCode::NAME].stacktrace = ENABLESTACKTRACE; \ @@ -338,7 +339,7 @@ class ErrorCodeInitializer { #undef M } - void check_init() { + void check_init() const { //the signal value is 0, it means the global error states not inited, it's logical error // DO NOT use dcheck here, because dcheck depend on glog, and glog maybe not inited at this time. if (signal_value == 0) { @@ -441,41 +442,50 @@ class [[nodiscard]] Status { return status; } - static Status OK() { return Status(); } + static Status OK() { return {}; } +// has a stacktrace by default; can be disabled manually. #define ERROR_CTOR(name, code) \ template \ static Status name(std::string_view msg, Args&&... args) { \ return Error(msg, std::forward(args)...); \ } +// has no stacktrace by default; can be enabled manually. +#define ERROR_CTOR_NOSTACK(name, code) \ + template \ + static Status name(std::string_view msg, Args&&...
args) { \ + return Error(msg, std::forward(args)...); \ + } + ERROR_CTOR(PublishTimeout, PUBLISH_TIMEOUT) ERROR_CTOR(MemoryAllocFailed, MEM_ALLOC_FAILED) ERROR_CTOR(BufferAllocFailed, BUFFER_ALLOCATION_FAILED) - ERROR_CTOR(InvalidArgument, INVALID_ARGUMENT) - ERROR_CTOR(InvalidJsonPath, INVALID_JSON_PATH) + ERROR_CTOR_NOSTACK(InvalidArgument, INVALID_ARGUMENT) + ERROR_CTOR_NOSTACK(InvalidJsonPath, INVALID_JSON_PATH) ERROR_CTOR(MinimumReservationUnavailable, MINIMUM_RESERVATION_UNAVAILABLE) ERROR_CTOR(Corruption, CORRUPTION) ERROR_CTOR(IOError, IO_ERROR) ERROR_CTOR(NotFound, NOT_FOUND) - ERROR_CTOR(AlreadyExist, ALREADY_EXIST) + ERROR_CTOR_NOSTACK(AlreadyExist, ALREADY_EXIST) ERROR_CTOR(NotSupported, NOT_IMPLEMENTED_ERROR) - ERROR_CTOR(EndOfFile, END_OF_FILE) + ERROR_CTOR_NOSTACK(EndOfFile, END_OF_FILE) ERROR_CTOR(InternalError, INTERNAL_ERROR) - ERROR_CTOR(WaitForRf, PIP_WAIT_FOR_RF) - ERROR_CTOR(WaitForScannerContext, PIP_WAIT_FOR_SC) + ERROR_CTOR_NOSTACK(WaitForRf, PIP_WAIT_FOR_RF) + ERROR_CTOR_NOSTACK(WaitForScannerContext, PIP_WAIT_FOR_SC) ERROR_CTOR(RuntimeError, RUNTIME_ERROR) - ERROR_CTOR(Cancelled, CANCELLED) + ERROR_CTOR_NOSTACK(Cancelled, CANCELLED) ERROR_CTOR(MemoryLimitExceeded, MEM_LIMIT_EXCEEDED) ERROR_CTOR(RpcError, THRIFT_RPC_ERROR) ERROR_CTOR(TimedOut, TIMEOUT) - ERROR_CTOR(TooManyTasks, TOO_MANY_TASKS) + ERROR_CTOR_NOSTACK(TooManyTasks, TOO_MANY_TASKS) ERROR_CTOR(Uninitialized, UNINITIALIZED) ERROR_CTOR(Aborted, ABORTED) - ERROR_CTOR(DataQualityError, DATA_QUALITY_ERROR) - ERROR_CTOR(NotAuthorized, NOT_AUTHORIZED) + ERROR_CTOR_NOSTACK(DataQualityError, DATA_QUALITY_ERROR) + ERROR_CTOR_NOSTACK(NotAuthorized, NOT_AUTHORIZED) ERROR_CTOR(HttpError, HTTP_ERROR) - ERROR_CTOR(NeedSendAgain, NEED_SEND_AGAIN) + ERROR_CTOR_NOSTACK(NeedSendAgain, NEED_SEND_AGAIN) + ERROR_CTOR_NOSTACK(CgroupError, CGROUP_ERROR) #undef ERROR_CTOR template @@ -584,15 +594,15 @@ class AtomicStatus { return error_st_; } + AtomicStatus(const AtomicStatus&) = delete; + void operator=(const AtomicStatus&) = delete; + private: std::atomic_int16_t error_code_ = 0; Status error_st_; // mutex's lock is not a const method, but we will use this mutex in // some const method, so that it should be mutable. 
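// The standard C++ idiom behind the `mutable` noted above: a mutable mutex
// lets a logically-const reader still take the lock. Minimal illustration:
#include <mutex>

class GuardedValue {
public:
    int value() const {
        std::lock_guard<std::mutex> l(mutex_); // ok only because mutex_ is mutable
        return value_;
    }

private:
    int value_ = 0;
    mutable std::mutex mutex_;
};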
mutable std::mutex mutex_; - - AtomicStatus(const AtomicStatus&) = delete; - void operator=(const AtomicStatus&) = delete; }; inline std::ostream& operator<<(std::ostream& ostr, const Status& status) { @@ -638,9 +648,6 @@ inline std::string Status::to_string_no_stack() const { } \ } while (false) -#define RETURN_ERROR_IF_NON_VEC \ - return Status::NotSupported("Non-vectorized engine is not supported since Doris 2.0."); - #define RETURN_IF_STATUS_ERROR(status, stmt) \ do { \ status = (stmt); \ diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp index f3c8dc57ebac4b1..3d02e335787d8c7 100644 --- a/be/src/exec/es/es_scroll_parser.cpp +++ b/be/src/exec/es/es_scroll_parser.cpp @@ -100,9 +100,9 @@ static const std::string ERROR_COL_DATA_IS_ARRAY = static const std::string INVALID_NULL_VALUE = "Invalid null value occurs: Non-null column `$0` contains NULL"; -#define RETURN_ERROR_IF_COL_IS_ARRAY(col, type) \ +#define RETURN_ERROR_IF_COL_IS_ARRAY(col, type, is_array) \ do { \ - if (col.IsArray()) { \ + if (col.IsArray() == is_array) { \ std::stringstream ss; \ ss << "Expected value of type: " << type_to_string(type) \ << "; but found type: " << json_type_to_string(col.GetType()) \ @@ -167,7 +167,7 @@ Status get_int_value(const rapidjson::Value& col, PrimitiveType type, void* slot return Status::OK(); } - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type); StringParser::ParseResult result; @@ -294,7 +294,7 @@ Status get_date_int(const rapidjson::Value& col, PrimitiveType type, bool pure_d return get_date_value_int(col[0], type, false, slot, time_zone); } else { // this would happened just only when `enable_docvalue_scan = false`, and field has string format date from _source - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type); return get_date_value_int(col, type, true, slot, time_zone); } @@ -322,7 +322,7 @@ Status get_float_value(const rapidjson::Value& col, PrimitiveType type, void* sl return Status::OK(); } - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type); StringParser::ParseResult result; @@ -351,7 +351,7 @@ Status insert_float_value(const rapidjson::Value& col, PrimitiveType type, return Status::OK(); } - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type); StringParser::ParseResult result; @@ -390,7 +390,7 @@ Status insert_int_value(const rapidjson::Value& col, PrimitiveType type, return Status::OK(); } - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type); StringParser::ParseResult result; @@ -543,7 +543,7 @@ Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc, val = col[0].GetString(); } } else { - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); if (!col.IsString()) { val = json_value_to_string(col); } else { @@ -623,7 +623,7 @@ Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc, const rapidjson::Value& str_col = is_nested_str ? 
col[0] : col; - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); const std::string& val = str_col.GetString(); size_t val_size = str_col.GetStringLength(); @@ -649,7 +649,7 @@ Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc, val = col[0].GetString(); } } else { - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true); if (!col.IsString()) { val = json_value_to_string(col); } else { @@ -679,13 +679,14 @@ Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc, case TYPE_ARRAY: { vectorized::Array array; const auto& sub_type = tuple_desc->slots()[i]->type().children[0].type; - for (auto& sub_col : col.GetArray()) { + RETURN_ERROR_IF_COL_IS_ARRAY(col, type, false); + for (const auto& sub_col : col.GetArray()) { switch (sub_type) { case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_STRING: { std::string val; - RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, sub_type); + RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, sub_type, true); if (!sub_col.IsString()) { val = json_value_to_string(sub_col); } else { diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index c2199cd6be93601..76d5ec83d7d04b5 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -770,6 +770,7 @@ bool ColumnValueRange::convert_to_avg_range_value( if (step_size > MAX_STEP_SIZE) { return no_split(); } + size_t real_step_size = 0; // Add null key if contain null, must do after no_split check if (contain_null()) { @@ -797,6 +798,15 @@ bool ColumnValueRange::convert_to_avg_range_value( break; } ++min_value; + ++real_step_size; + if (real_step_size > MAX_STEP_SIZE) { + throw Exception(Status::InternalError( + "convert_to_avg_range_value meet error. type={}, step_size={}, " + "min_value={}, max_value={}", + int(primitive_type), step_size, + cast_to_string(min_value, scale()), + cast_to_string(max_value, scale()))); + } } return step_size != 0; diff --git a/be/src/exec/rowid_fetcher.cpp b/be/src/exec/rowid_fetcher.cpp index 0ec1c7ce3a34a47..beb8c2f0962a5a9 100644 --- a/be/src/exec/rowid_fetcher.cpp +++ b/be/src/exec/rowid_fetcher.cpp @@ -381,6 +381,7 @@ Status RowIdStorageReader::read_by_rowids(const PMultiGetRequest& request, << ", row_size:" << row_size; *response->add_row_locs() = row_loc; }); + // TODO: support session variable enable_page_cache and disable_file_cache if necessary.
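// The convert_to_avg_range_value change above adds a second line of defence:
// besides the precomputed step_size check, the loop re-counts its actual
// iterations and aborts once they exceed the bound. Minimal shape of that
// guard pattern (illustrative, simplified):
#include <cstddef>
#include <stdexcept>

template <typename T>
void enumerate_range(T min_value, const T& max_value, std::size_t max_steps) {
    std::size_t real_step_size = 0;
    while (min_value < max_value) {
        ++min_value;
        if (++real_step_size > max_steps) {
            throw std::runtime_error("range enumeration exceeded max_steps");
        }
    }
}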
SegmentCacheHandle segment_cache; RETURN_IF_ERROR(scope_timer_run( [&]() { diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index de9857bad2ce831..be0bd8eff72c4c2 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -43,15 +43,21 @@ #include "exec/schema_scanner/schema_schemata_scanner.h" #include "exec/schema_scanner/schema_table_options_scanner.h" #include "exec/schema_scanner/schema_table_privileges_scanner.h" +#include "exec/schema_scanner/schema_table_properties_scanner.h" #include "exec/schema_scanner/schema_tables_scanner.h" #include "exec/schema_scanner/schema_user_privileges_scanner.h" #include "exec/schema_scanner/schema_user_scanner.h" #include "exec/schema_scanner/schema_variables_scanner.h" #include "exec/schema_scanner/schema_views_scanner.h" +#include "exec/schema_scanner/schema_workload_group_privileges.h" +#include "exec/schema_scanner/schema_workload_group_resource_usage_scanner.h" #include "exec/schema_scanner/schema_workload_groups_scanner.h" #include "exec/schema_scanner/schema_workload_sched_policy_scanner.h" #include "olap/hll.h" +#include "pipeline/dependency.h" #include "runtime/define_primitive_type.h" +#include "runtime/fragment_mgr.h" +#include "runtime/types.h" #include "util/string_util.h" #include "util/types.h" #include "vec/columns/column.h" @@ -65,6 +71,7 @@ #include "vec/core/column_with_type_and_name.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" namespace doris { class ObjectPool; @@ -85,7 +92,60 @@ Status SchemaScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaScanner::get_next_block(RuntimeState* state, vectorized::Block* block, bool* eos) { + if (_data_block == nullptr) { + return Status::InternalError("No data left!"); + } + DCHECK(_async_thread_running == false); + RETURN_IF_ERROR(_scanner_status.status()); + for (size_t i = 0; i < block->columns(); i++) { + std::move(*block->get_by_position(i).column) + .mutate() + ->insert_range_from(*_data_block->get_by_position(i).column, 0, + _data_block->rows()); + } + _data_block->clear_column_data(); + *eos = _eos; + if (!*eos) { + RETURN_IF_ERROR(get_next_block_async(state)); + } + return Status::OK(); +} + +Status SchemaScanner::get_next_block_async(RuntimeState* state) { + _dependency->block(); + auto task_ctx = state->get_task_execution_context(); + RETURN_IF_ERROR(ExecEnv::GetInstance()->fragment_mgr()->get_thread_pool()->submit_func( + [this, task_ctx, state]() { + DCHECK(_async_thread_running == false); + auto task_lock = task_ctx.lock(); + if (task_lock == nullptr) { + _scanner_status.update(Status::InternalError("Task context not exists!")); + return; + } + SCOPED_ATTACH_TASK(state); + _dependency->block(); + _async_thread_running = true; + _finish_dependency->block(); + if (!_opened) { + _data_block = vectorized::Block::create_unique(); + _init_block(_data_block.get()); + _scanner_status.update(start(state)); + _opened = true; + } + bool eos = false; + _scanner_status.update(get_next_block_internal(_data_block.get(), &eos)); + _eos = eos; + _async_thread_running = false; + _dependency->set_ready(); + if (eos) { + _finish_dependency->set_ready(); + } + })); + return Status::OK(); +} + +Status SchemaScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("used before initialized."); } @@ -170,12 +230,28 @@ 
std::unique_ptr<SchemaScanner> SchemaScanner::create(TSchemaTableType::type type) return SchemaWorkloadSchedulePolicyScanner::create_unique(); case TSchemaTableType::SCH_TABLE_OPTIONS: return SchemaTableOptionsScanner::create_unique(); + case TSchemaTableType::SCH_WORKLOAD_GROUP_PRIVILEGES: + return SchemaWorkloadGroupPrivilegesScanner::create_unique(); + case TSchemaTableType::SCH_WORKLOAD_GROUP_RESOURCE_USAGE: + return SchemaBackendWorkloadGroupResourceUsage::create_unique(); + case TSchemaTableType::SCH_TABLE_PROPERTIES: + return SchemaTablePropertiesScanner::create_unique(); default: return SchemaDummyScanner::create_unique(); break; } } +void SchemaScanner::_init_block(vectorized::Block* src_block) { + const std::vector<ColumnDesc>& columns_desc(get_column_desc()); + for (int i = 0; i < columns_desc.size(); ++i) { + TypeDescriptor descriptor(columns_desc[i].type); + auto data_type = vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + src_block->insert(vectorized::ColumnWithTypeAndName(data_type->create_column(), data_type, + columns_desc[i].name)); + } +} + Status SchemaScanner::fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector<void*>& datas) { const ColumnDesc& col_desc = _columns[pos]; diff --git a/be/src/exec/schema_scanner.h b/be/src/exec/schema_scanner.h index a23706ac6a440a3..da61d58b943fc4b 100644 --- a/be/src/exec/schema_scanner.h +++ b/be/src/exec/schema_scanner.h @@ -22,6 +22,7 @@ #include #include +#include <set> #include #include #include @@ -43,6 +44,10 @@ namespace vectorized { class Block; } +namespace pipeline { +class Dependency; +} + struct SchemaScannerCommonParam { SchemaScannerCommonParam() : db(nullptr), @@ -64,6 +69,7 @@ struct SchemaScannerCommonParam { int32_t port; // frontend thrift port int64_t thread_id; const std::string* catalog = nullptr; + std::set<TNetworkAddress> fe_addr_list; }; // scanner parameter from frontend @@ -94,15 +100,23 @@ class SchemaScanner { // init object need information, schema etc.
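The async flow above re-arms itself: get_next_block drains the cached _data_block, and get_next_block_async blocks the dependency before submitting the fill task, which sets it ready again once get_next_block_internal has produced the next batch. A toy sketch of that blocked/ready handshake; ToyDependency and async_produce are illustrative stand-ins, not the pipeline engine's Dependency API:

#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>

// Illustrative stand-in for the pipeline dependency the scan operator
// waits on; the real class lives in pipeline/dependency.h.
class ToyDependency {
public:
    void block() {
        std::lock_guard<std::mutex> lock(_mutex);
        _ready = false;
    }
    void set_ready() {
        {
            std::lock_guard<std::mutex> lock(_mutex);
            _ready = true;
        }
        _cv.notify_all();
    }
    void wait_ready() {
        std::unique_lock<std::mutex> lock(_mutex);
        _cv.wait(lock, [this] { return _ready; });
    }

private:
    std::mutex _mutex;
    std::condition_variable _cv;
    bool _ready = false;
};

// Shape of the new flow: block *before* submitting so the operator cannot
// observe a half-filled block, fill off-thread, then wake the operator.
// The caller must guarantee `dep` outlives the task; the real code does
// this by locking the task execution context before touching any state.
void async_produce(ToyDependency* dep, std::function<void()> fill_block) {
    dep->block();
    std::thread([dep, fill_block = std::move(fill_block)] {
        fill_block(); // start() + get_next_block_internal() in the patch
        dep->set_ready();
    }).detach();
}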
virtual Status init(SchemaScannerParam* param, ObjectPool* pool); + Status get_next_block(RuntimeState* state, vectorized::Block* block, bool* eos); // Start to work virtual Status start(RuntimeState* state); - virtual Status get_next_block(vectorized::Block* block, bool* eos); + virtual Status get_next_block_internal(vectorized::Block* block, bool* eos); const std::vector<ColumnDesc>& get_column_desc() const { return _columns; } // factory function static std::unique_ptr<SchemaScanner> create(TSchemaTableType::type type); TSchemaTableType::type type() const { return _schema_table_type; } + void set_dependency(std::shared_ptr<pipeline::Dependency> dep, + std::shared_ptr<pipeline::Dependency> fin_dep) { + _dependency = dep; + _finish_dependency = fin_dep; + } + Status get_next_block_async(RuntimeState* state); protected: + void _init_block(vectorized::Block* src_block); Status fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector<void*>& datas); @@ -125,6 +139,15 @@ class SchemaScanner { RuntimeProfile::Counter* _get_table_timer = nullptr; RuntimeProfile::Counter* _get_describe_timer = nullptr; RuntimeProfile::Counter* _fill_block_timer = nullptr; + + std::shared_ptr<pipeline::Dependency> _dependency = nullptr; + std::shared_ptr<pipeline::Dependency> _finish_dependency = nullptr; + + std::unique_ptr<vectorized::Block> _data_block; + AtomicStatus _scanner_status; + std::atomic<bool> _eos = false; + std::atomic<bool> _opened = false; + std::atomic<bool> _async_thread_running = false; }; } // namespace doris diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp index 2115a38a6ebce3a..46522a36242fc13 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp @@ -137,7 +137,7 @@ Status SchemaActiveQueriesScanner::_get_active_queries_block_from_fe() { return Status::OK(); } -Status SchemaActiveQueriesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaActiveQueriesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.h b/be/src/exec/schema_scanner/schema_active_queries_scanner.h index 1df5b1f9d7402dd..7e9ae4b80340836 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.h +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.h @@ -36,7 +36,7 @@ class SchemaActiveQueriesScanner : public SchemaScanner { ~SchemaActiveQueriesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector<SchemaScanner::ColumnDesc> _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp index f1155796ed434d3..74e95f4203217cb 100644 --- a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp +++ b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp @@ -27,7 +27,7 @@ namespace doris { std::vector<SchemaScanner::ColumnDesc> SchemaBackendActiveTasksScanner::_s_tbls_columns = { // name, type, size - {"BE_ID", TYPE_BIGINT, sizeof(StringRef), false}, + {"BE_ID", TYPE_BIGINT, sizeof(int64_t), false}, {"FE_HOST", TYPE_VARCHAR, sizeof(StringRef), false}, {"QUERY_ID", TYPE_VARCHAR, sizeof(StringRef), false}, {"TASK_TIME_MS", TYPE_BIGINT, sizeof(int64_t), false}, @@ -51,7 +51,8 @@ Status SchemaBackendActiveTasksScanner::start(RuntimeState* state) { return
Status::OK(); } -Status SchemaBackendActiveTasksScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaBackendActiveTasksScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_backend_active_tasks.h b/be/src/exec/schema_scanner/schema_backend_active_tasks.h index d8a2a1ffa3f96a7..43819818b57f69f 100644 --- a/be/src/exec/schema_scanner/schema_backend_active_tasks.h +++ b/be/src/exec/schema_scanner/schema_backend_active_tasks.h @@ -36,7 +36,7 @@ class SchemaBackendActiveTasksScanner : public SchemaScanner { ~SchemaBackendActiveTasksScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_charsets_scanner.cpp b/be/src/exec/schema_scanner/schema_charsets_scanner.cpp index 534f045341b7e37..d06cd8fa7456342 100644 --- a/be/src/exec/schema_scanner/schema_charsets_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_charsets_scanner.cpp @@ -48,7 +48,7 @@ SchemaCharsetsScanner::SchemaCharsetsScanner() SchemaCharsetsScanner::~SchemaCharsetsScanner() {} -Status SchemaCharsetsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaCharsetsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_charsets_scanner.h b/be/src/exec/schema_scanner/schema_charsets_scanner.h index 1f01070875ccf64..d5089c62826b0b7 100644 --- a/be/src/exec/schema_scanner/schema_charsets_scanner.h +++ b/be/src/exec/schema_scanner/schema_charsets_scanner.h @@ -36,7 +36,7 @@ class SchemaCharsetsScanner : public SchemaScanner { SchemaCharsetsScanner(); ~SchemaCharsetsScanner() override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct CharsetStruct { diff --git a/be/src/exec/schema_scanner/schema_collations_scanner.cpp b/be/src/exec/schema_scanner/schema_collations_scanner.cpp index 9d50b5216303d84..8592eb7575c387d 100644 --- a/be/src/exec/schema_scanner/schema_collations_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_collations_scanner.cpp @@ -50,7 +50,7 @@ SchemaCollationsScanner::SchemaCollationsScanner() SchemaCollationsScanner::~SchemaCollationsScanner() {} -Status SchemaCollationsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaCollationsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_collations_scanner.h b/be/src/exec/schema_scanner/schema_collations_scanner.h index f0f60538cacce02..2fe200da78d04d7 100644 --- a/be/src/exec/schema_scanner/schema_collations_scanner.h +++ b/be/src/exec/schema_scanner/schema_collations_scanner.h @@ -36,7 +36,7 @@ class SchemaCollationsScanner : public SchemaScanner { SchemaCollationsScanner(); ~SchemaCollationsScanner() override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct CollationStruct { diff --git 
a/be/src/exec/schema_scanner/schema_columns_scanner.cpp b/be/src/exec/schema_scanner/schema_columns_scanner.cpp index deda8af7d8de587..f4e15d2aef0af2d 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_columns_scanner.cpp @@ -347,7 +347,7 @@ Status SchemaColumnsScanner::_get_new_table() { return Status::OK(); } -Status SchemaColumnsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaColumnsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("use this class before inited."); } diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.h b/be/src/exec/schema_scanner/schema_columns_scanner.h index 2499db7ed82a2b3..99150c36d109a2d 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.h +++ b/be/src/exec/schema_scanner/schema_columns_scanner.h @@ -38,7 +38,7 @@ class SchemaColumnsScanner : public SchemaScanner { SchemaColumnsScanner(); ~SchemaColumnsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_dummy_scanner.cpp b/be/src/exec/schema_scanner/schema_dummy_scanner.cpp index 1d5956f390ea262..9e3a703d9fb5d6a 100644 --- a/be/src/exec/schema_scanner/schema_dummy_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_dummy_scanner.cpp @@ -40,7 +40,7 @@ Status SchemaDummyScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaDummyScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaDummyScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { *eos = true; return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_dummy_scanner.h b/be/src/exec/schema_scanner/schema_dummy_scanner.h index a67f6fa25c16480..0c5e4aabe357e46 100644 --- a/be/src/exec/schema_scanner/schema_dummy_scanner.h +++ b/be/src/exec/schema_scanner/schema_dummy_scanner.h @@ -33,7 +33,7 @@ class SchemaDummyScanner : public SchemaScanner { SchemaDummyScanner(); ~SchemaDummyScanner() override; Status start(RuntimeState* state = nullptr) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; }; } // namespace doris diff --git a/be/src/exec/schema_scanner/schema_files_scanner.cpp b/be/src/exec/schema_scanner/schema_files_scanner.cpp index 55b7a338c319e8e..20aa07fa69116c2 100644 --- a/be/src/exec/schema_scanner/schema_files_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_files_scanner.cpp @@ -113,7 +113,7 @@ Status SchemaFilesScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaFilesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaFilesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_files_scanner.h b/be/src/exec/schema_scanner/schema_files_scanner.h index 6805a04be4aacc7..bb3b2d684931475 100644 --- a/be/src/exec/schema_scanner/schema_files_scanner.h +++ b/be/src/exec/schema_scanner/schema_files_scanner.h @@ -38,7 +38,7 @@ class SchemaFilesScanner : public SchemaScanner { ~SchemaFilesScanner() override; Status start(RuntimeState* state) override; - Status 
get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; int _db_index; int _table_index; diff --git a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp index 928567a2e4a99ea..1267c32c8d8dfba 100644 --- a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp @@ -225,7 +225,7 @@ Status SchemaMetadataNameIdsScanner::_fill_block_impl(vectorized::Block* block) return Status::OK(); } -Status SchemaMetadataNameIdsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaMetadataNameIdsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h index 9981d441d856aa1..c3beea7769754d4 100644 --- a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h +++ b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h @@ -39,7 +39,7 @@ class SchemaMetadataNameIdsScanner : public SchemaScanner { ~SchemaMetadataNameIdsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp index f1ad1f594f883f2..ea7394e15e12d2f 100644 --- a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp @@ -101,7 +101,7 @@ Status SchemaPartitionsScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaPartitionsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaPartitionsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_partitions_scanner.h b/be/src/exec/schema_scanner/schema_partitions_scanner.h index 47e1d1fcf87d159..87e55db984a3dee 100644 --- a/be/src/exec/schema_scanner/schema_partitions_scanner.h +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.h @@ -38,7 +38,7 @@ class SchemaPartitionsScanner : public SchemaScanner { ~SchemaPartitionsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; int _db_index; int _table_index; diff --git a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp index 0f270a6a8c17770..185ef2ab44237fe 100644 --- a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp @@ -56,14 +56,19 @@ Status SchemaProcessListScanner::start(RuntimeState* state) { TShowProcessListRequest request; request.__set_show_full_sql(true); - RETURN_IF_ERROR(SchemaHelper::show_process_list(*(_param->common_param->ip), - _param->common_param->port, request, - &_process_list_result)); + for (const auto& fe_addr : _param->common_param->fe_addr_list) { + TShowProcessListResult 
tmp_ret; + RETURN_IF_ERROR( + SchemaHelper::show_process_list(fe_addr.hostname, fe_addr.port, request, &tmp_ret)); + _process_list_result.process_list.insert(_process_list_result.process_list.end(), + tmp_ret.process_list.begin(), + tmp_ret.process_list.end()); + } return Status::OK(); } -Status SchemaProcessListScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaProcessListScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_processlist_scanner.h b/be/src/exec/schema_scanner/schema_processlist_scanner.h index 8aae87e1ef6d0fa..c0b0a47f6154eee 100644 --- a/be/src/exec/schema_scanner/schema_processlist_scanner.h +++ b/be/src/exec/schema_scanner/schema_processlist_scanner.h @@ -40,7 +40,7 @@ class SchemaProcessListScanner : public SchemaScanner { ~SchemaProcessListScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_processlist_columns; diff --git a/be/src/exec/schema_scanner/schema_profiling_scanner.cpp b/be/src/exec/schema_scanner/schema_profiling_scanner.cpp index 2f71eb96f2613ab..0a2a64330bb018d 100644 --- a/be/src/exec/schema_scanner/schema_profiling_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_profiling_scanner.cpp @@ -88,7 +88,7 @@ Status SchemaProfilingScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaProfilingScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaProfilingScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_profiling_scanner.h b/be/src/exec/schema_scanner/schema_profiling_scanner.h index 5399cb14eb43f5d..6b969a478aca699 100644 --- a/be/src/exec/schema_scanner/schema_profiling_scanner.h +++ b/be/src/exec/schema_scanner/schema_profiling_scanner.h @@ -38,7 +38,7 @@ class SchemaProfilingScanner : public SchemaScanner { ~SchemaProfilingScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; }; diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.cpp b/be/src/exec/schema_scanner/schema_routine_scanner.cpp index 3d55addee6c093b..8c263c99d2d6c81 100644 --- a/be/src/exec/schema_scanner/schema_routine_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_routine_scanner.cpp @@ -141,7 +141,7 @@ Status SchemaRoutinesScanner::get_block_from_fe() { return Status::OK(); } -Status SchemaRoutinesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaRoutinesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.h b/be/src/exec/schema_scanner/schema_routine_scanner.h index 543f9e8e8f684a9..c60d72340e1104d 100644 --- a/be/src/exec/schema_scanner/schema_routine_scanner.h +++ b/be/src/exec/schema_scanner/schema_routine_scanner.h @@ -36,7 +36,7 @@ class SchemaRoutinesScanner : public SchemaScanner { ~SchemaRoutinesScanner() override = default; Status start(RuntimeState* 
state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp index 6ece8e22331e382..16d5f2daba61e74 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp @@ -97,7 +97,7 @@ Status SchemaRowsetsScanner::_get_all_rowsets() { return Status::OK(); } -Status SchemaRowsetsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaRowsetsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.h b/be/src/exec/schema_scanner/schema_rowsets_scanner.h index b975cc4231bc208..cad34fc04945e47 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.h +++ b/be/src/exec/schema_scanner/schema_rowsets_scanner.h @@ -40,7 +40,7 @@ class SchemaRowsetsScanner : public SchemaScanner { ~SchemaRowsetsScanner() override = default; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_all_rowsets(); diff --git a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp index 9789b6c72d6f302..f529821e5a54e27 100644 --- a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp @@ -82,7 +82,7 @@ Status SchemaSchemaPrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaSchemaPrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaSchemaPrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h index af2ad49634bd493..9522fba908bb2af 100644 --- a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaSchemaPrivilegesScanner : public SchemaScanner { ~SchemaSchemaPrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_schemata_scanner.cpp b/be/src/exec/schema_scanner/schema_schemata_scanner.cpp index 1854e4f2b54af16..618e831c90e2194 100644 --- a/be/src/exec/schema_scanner/schema_schemata_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_schemata_scanner.cpp @@ -81,7 +81,7 @@ Status SchemaSchemataScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaSchemataScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaSchemataScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before Initialized."); } diff --git a/be/src/exec/schema_scanner/schema_schemata_scanner.h 
b/be/src/exec/schema_scanner/schema_schemata_scanner.h index 46fad31af1fd5ec..39a5ddda495bddd 100644 --- a/be/src/exec/schema_scanner/schema_schemata_scanner.h +++ b/be/src/exec/schema_scanner/schema_schemata_scanner.h @@ -38,7 +38,7 @@ class SchemaSchemataScanner : public SchemaScanner { ~SchemaSchemataScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _fill_block_impl(vectorized::Block* block); diff --git a/be/src/exec/schema_scanner/schema_table_options_scanner.cpp b/be/src/exec/schema_scanner/schema_table_options_scanner.cpp index 604da59b6377b69..f4b636be68ff6ac 100644 --- a/be/src/exec/schema_scanner/schema_table_options_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_table_options_scanner.cpp @@ -17,6 +17,7 @@ #include "exec/schema_scanner/schema_table_options_scanner.h" +#include "exec/schema_scanner/schema_helper.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" @@ -27,28 +28,48 @@ namespace doris { std::vector<SchemaScanner::ColumnDesc> SchemaTableOptionsScanner::_s_tbls_columns = { - {"TABLE_NAME", TYPE_VARCHAR, sizeof(StringRef), true}, {"TABLE_CATALOG", TYPE_VARCHAR, sizeof(StringRef), true}, {"TABLE_SCHEMA", TYPE_VARCHAR, sizeof(StringRef), true}, + {"TABLE_NAME", TYPE_VARCHAR, sizeof(StringRef), true}, {"TABLE_MODEL", TYPE_STRING, sizeof(StringRef), true}, {"TABLE_MODEL_KEY", TYPE_STRING, sizeof(StringRef), true}, {"DISTRIBUTE_KEY", TYPE_STRING, sizeof(StringRef), true}, {"DISTRIBUTE_TYPE", TYPE_STRING, sizeof(StringRef), true}, {"BUCKETS_NUM", TYPE_INT, sizeof(int32_t), true}, {"PARTITION_NUM", TYPE_INT, sizeof(int32_t), true}, - {"PROPERTIES", TYPE_STRING, sizeof(StringRef), true}, }; SchemaTableOptionsScanner::SchemaTableOptionsScanner() : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_TABLE_OPTIONS) {} Status SchemaTableOptionsScanner::start(RuntimeState* state) { + if (!_is_init) { + return Status::InternalError("used before initialized."); + } + + // first get all the databases in the current catalog + SCOPED_TIMER(_get_db_timer); + TGetDbsParams db_params; + + if (_param->common_param->catalog) { + db_params.__set_catalog(*(_param->common_param->catalog)); + } + if (_param->common_param->current_user_ident) { + db_params.__set_current_user_ident(*(_param->common_param->current_user_ident)); + } + + if (_param->common_param->ip && 0 != _param->common_param->port) { + RETURN_IF_ERROR(SchemaHelper::get_db_names( + *(_param->common_param->ip), _param->common_param->port, db_params, &_db_result)); + } else { + return Status::InternalError("IP or port doesn't exist"); + } _block_rows_limit = state->batch_size(); _rpc_timeout_ms = state->execution_timeout() * 1000; return Status::OK(); } -Status SchemaTableOptionsScanner::get_block_from_fe() { +Status SchemaTableOptionsScanner::get_onedb_info_from_fe(int64_t dbId) { TNetworkAddress master_addr = ExecEnv::GetInstance()->master_info()->network_address; TSchemaTableRequestParams schema_table_request_params; @@ -57,6 +78,8 @@ Status SchemaTableOptionsScanner::get_block_from_fe() { schema_table_request_params.columns_name.emplace_back(_s_tbls_columns[i].name); } schema_table_request_params.__set_current_user_ident(*_param->common_param->current_user_ident); + schema_table_request_params.__set_catalog(*_param->common_param->catalog); + schema_table_request_params.__set_dbId(dbId); TFetchSchemaTableDataRequest
request; request.__set_schema_table_name(TSchemaTableName::TABLE_OPTIONS); @@ -103,7 +126,18 @@ Status SchemaTableOptionsScanner::get_block_from_fe() { return Status::OK(); } -Status SchemaTableOptionsScanner::get_next_block(vectorized::Block* block, bool* eos) { +bool SchemaTableOptionsScanner::check_and_mark_eos(bool* eos) const { + if (_row_idx == _total_rows) { + *eos = true; + if (_db_index < _db_result.db_ids.size()) { + *eos = false; + } + return true; + } + return false; +} + +Status SchemaTableOptionsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } @@ -112,13 +146,16 @@ Status SchemaTableOptionsScanner::get_next_block(vectorized::Block* block, bool* return Status::InternalError("input pointer is nullptr."); } - if (_tableoptions_block == nullptr) { - RETURN_IF_ERROR(get_block_from_fe()); - _total_rows = _tableoptions_block->rows(); + if ((_tableoptions_block == nullptr) || (_row_idx == _total_rows)) { + if (_db_index < _db_result.db_ids.size()) { + RETURN_IF_ERROR(get_onedb_info_from_fe(_db_result.db_ids[_db_index])); + _row_idx = 0; // reset row index so that it starts filling the next block. + _total_rows = _tableoptions_block->rows(); + _db_index++; + } } - if (_row_idx == _total_rows) { - *eos = true; + if (check_and_mark_eos(eos)) { return Status::OK(); } @@ -127,7 +164,9 @@ Status SchemaTableOptionsScanner::get_next_block(vectorized::Block* block, bool* RETURN_IF_ERROR(mblock.add_rows(_tableoptions_block.get(), _row_idx, current_batch_rows)); _row_idx += current_batch_rows; - *eos = _row_idx == _total_rows; + if (!check_and_mark_eos(eos)) { + *eos = false; + } return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_table_options_scanner.h b/be/src/exec/schema_scanner/schema_table_options_scanner.h index d40f1b73c633c5a..92d2e4572f9886e 100644 --- a/be/src/exec/schema_scanner/schema_table_options_scanner.h +++ b/be/src/exec/schema_scanner/schema_table_options_scanner.h @@ -16,6 +16,7 @@ // under the License.
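check_and_mark_eos encodes a two-level cursor: EOS is reported only when the cached per-database block is drained and no database ids remain; otherwise the next call refills from the next database. The same shape in isolation, with DbPager and fetch_db as hypothetical stand-ins for the scanner and get_onedb_info_from_fe:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Outer cursor walks database ids, inner cursor walks rows of the block
// fetched for the current database; "eos" only when both are exhausted,
// mirroring check_and_mark_eos above.
class DbPager {
public:
    explicit DbPager(std::vector<int64_t> db_ids) : _db_ids(std::move(db_ids)) {}

    // Returns false once fully exhausted. A returned batch may be empty
    // when a database has no rows; callers just ask again.
    bool next_batch(std::vector<int>* out, size_t batch_size) {
        if (_row_idx == _rows.size()) {                    // current block drained
            if (_db_index == _db_ids.size()) return false; // no databases left
            _rows = fetch_db(_db_ids[_db_index]);          // refill from next db
            _row_idx = 0;
            ++_db_index;
        }
        size_t n = std::min(batch_size, _rows.size() - _row_idx);
        out->assign(_rows.begin() + _row_idx, _rows.begin() + _row_idx + n);
        _row_idx += n;
        return true;
    }

private:
    // Hypothetical stand-in for the per-database FE RPC.
    static std::vector<int> fetch_db(int64_t /*db_id*/) { return {1, 2, 3}; }

    std::vector<int64_t> _db_ids;
    std::vector<int> _rows;
    size_t _db_index = 0;
    size_t _row_idx = 0;
};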
#pragma once +#include #include @@ -36,14 +37,16 @@ class SchemaTableOptionsScanner : public SchemaScanner { ~SchemaTableOptionsScanner() override = default; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; private: - Status get_block_from_fe(); - + Status get_onedb_info_from_fe(int64_t dbId); + bool check_and_mark_eos(bool* eos) const; int _block_rows_limit = 4096; + int _db_index = 0; + TGetDbsResult _db_result; int _row_idx = 0; int _total_rows = 0; std::unique_ptr _tableoptions_block = nullptr; diff --git a/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp index fe8aa725b73b803..cdeac2b70dcadde 100644 --- a/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp @@ -84,7 +84,7 @@ Status SchemaTablePrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaTablePrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaTablePrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_table_privileges_scanner.h b/be/src/exec/schema_scanner/schema_table_privileges_scanner.h index aa79c88304b7c58..4cfcc16d3583ce3 100644 --- a/be/src/exec/schema_scanner/schema_table_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_table_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaTablePrivilegesScanner : public SchemaScanner { ~SchemaTablePrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_table_properties_scanner.cpp b/be/src/exec/schema_scanner/schema_table_properties_scanner.cpp new file mode 100644 index 000000000000000..749113da1b507f0 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_table_properties_scanner.cpp @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
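Across all of these files the rename follows one pattern: subclasses used to override get_next_block directly; now they override get_next_block_internal, and the base class owns the public driver that drains the cached block and re-arms the async fetch. Reduced to a skeleton (ToyScanner is illustrative, not the actual class; the real driver schedules the fill asynchronously and arms the first fill when the operator opens):

#include <vector>

// Skeleton of the template-method split introduced by this patch: the
// base class owns the draining/rescheduling protocol, subclasses only
// know how to fill one batch.
class ToyScanner {
public:
    virtual ~ToyScanner() = default;

    // Non-virtual public driver (get_next_block in the patch): hand the
    // cached batch out, then schedule the next fill if more data remains.
    bool get_next_batch(std::vector<int>* out) {
        out->swap(_cached);
        bool eos = _eos;
        if (!eos) {
            schedule_next_fill();
        }
        return !eos;
    }

protected:
    // Subclass hook (get_next_block_internal in the patch).
    virtual void fill(std::vector<int>* batch, bool* eos) = 0;

private:
    void schedule_next_fill() { fill(&_cached, &_eos); } // async in the real code

    std::vector<int> _cached;
    bool _eos = false;
};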
+ +#include "exec/schema_scanner/schema_table_properties_scanner.h" + +#include "exec/schema_scanner/schema_helper.h" +#include "runtime/client_cache.h" +#include "runtime/exec_env.h" +#include "runtime/runtime_state.h" +#include "util/thrift_rpc_helper.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" + +namespace doris { +std::vector SchemaTablePropertiesScanner::_s_tbls_columns = { + {"TABLE_CATALOG", TYPE_VARCHAR, sizeof(StringRef), true}, + {"TABLE_SCHEMA", TYPE_VARCHAR, sizeof(StringRef), true}, + {"TABLE_NAME", TYPE_VARCHAR, sizeof(StringRef), true}, + {"PROPERTY_NAME", TYPE_STRING, sizeof(StringRef), true}, + {"PROPERTY_VALUE", TYPE_STRING, sizeof(StringRef), true}, +}; + +SchemaTablePropertiesScanner::SchemaTablePropertiesScanner() + : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_TABLE_PROPERTIES) {} + +Status SchemaTablePropertiesScanner::start(RuntimeState* state) { + if (!_is_init) { + return Status::InternalError("used before initialized."); + } + + // first get the all the database specific to current catalog + SCOPED_TIMER(_get_db_timer); + TGetDbsParams db_params; + + if (_param->common_param->catalog) { + db_params.__set_catalog(*(_param->common_param->catalog)); + } + if (_param->common_param->current_user_ident) { + db_params.__set_current_user_ident(*(_param->common_param->current_user_ident)); + } + + if (_param->common_param->ip && 0 != _param->common_param->port) { + RETURN_IF_ERROR(SchemaHelper::get_db_names( + *(_param->common_param->ip), _param->common_param->port, db_params, &_db_result)); + } else { + return Status::InternalError("IP or port doesn't exists"); + } + _block_rows_limit = state->batch_size(); + _rpc_timeout_ms = state->execution_timeout() * 1000; + + return Status::OK(); +} + +Status SchemaTablePropertiesScanner::get_onedb_info_from_fe(int64_t dbId) { + TNetworkAddress master_addr = ExecEnv::GetInstance()->master_info()->network_address; + + TSchemaTableRequestParams schema_table_request_params; + for (int i = 0; i < _s_tbls_columns.size(); i++) { + schema_table_request_params.__isset.columns_name = true; + schema_table_request_params.columns_name.emplace_back(_s_tbls_columns[i].name); + } + + schema_table_request_params.__set_current_user_ident(*_param->common_param->current_user_ident); + schema_table_request_params.__set_catalog(*_param->common_param->catalog); + schema_table_request_params.__set_dbId(dbId); + + TFetchSchemaTableDataRequest request; + request.__set_schema_table_name(TSchemaTableName::TABLE_PROPERTIES); + request.__set_schema_table_params(schema_table_request_params); + + TFetchSchemaTableDataResult result; + + RETURN_IF_ERROR(ThriftRpcHelper::rpc( + master_addr.hostname, master_addr.port, + [&request, &result](FrontendServiceConnection& client) { + client->fetchSchemaTableData(result, request); + }, + _rpc_timeout_ms)); + + Status status(Status::create(result.status)); + if (!status.ok()) { + LOG(WARNING) << "fetch table options from FE failed, errmsg=" << status; + return status; + } + std::vector result_data = result.data_batch; + + _tableproperties_block = vectorized::Block::create_unique(); + for (int i = 0; i < _s_tbls_columns.size(); ++i) { + TypeDescriptor descriptor(_s_tbls_columns[i].type); + auto data_type = vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + _tableproperties_block->insert(vectorized::ColumnWithTypeAndName( + data_type->create_column(), data_type, _s_tbls_columns[i].name)); + } + 
_tableproperties_block->reserve(_block_rows_limit); + if (result_data.size() > 0) { + int col_size = result_data[0].column_value.size(); + if (col_size != _s_tbls_columns.size()) { + return Status::InternalError("table properties schema does not match between FE and BE"); + } + } + + for (int i = 0; i < result_data.size(); i++) { + TRow row = result_data[i]; + for (int j = 0; j < _s_tbls_columns.size(); j++) { + RETURN_IF_ERROR(insert_block_column( + row.column_value[j], j, _tableproperties_block.get(), _s_tbls_columns[j].type)); + } + } + return Status::OK(); +} + +bool SchemaTablePropertiesScanner::check_and_mark_eos(bool* eos) const { + if (_row_idx == _total_rows) { + *eos = true; + if (_db_index < _db_result.db_ids.size()) { + *eos = false; + } + return true; + } + return false; +} + +Status SchemaTablePropertiesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { + if (!_is_init) { + return Status::InternalError("Used before initialized."); + } + + if (nullptr == block || nullptr == eos) { + return Status::InternalError("input pointer is nullptr."); + } + + if ((_tableproperties_block == nullptr) || (_row_idx == _total_rows)) { + if (_db_index < _db_result.db_ids.size()) { + RETURN_IF_ERROR(get_onedb_info_from_fe(_db_result.db_ids[_db_index])); + _row_idx = 0; // reset row index so that it starts filling the next block. + _total_rows = _tableproperties_block->rows(); + _db_index++; + } + } + + if (check_and_mark_eos(eos)) { + return Status::OK(); + } + + int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); + vectorized::MutableBlock mblock = vectorized::MutableBlock::build_mutable_block(block); + RETURN_IF_ERROR(mblock.add_rows(_tableproperties_block.get(), _row_idx, current_batch_rows)); + _row_idx += current_batch_rows; + + if (!check_and_mark_eos(eos)) { + *eos = false; + } + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/exec/schema_scanner/schema_table_properties_scanner.h b/be/src/exec/schema_scanner/schema_table_properties_scanner.h new file mode 100644 index 000000000000000..0820fee96287c95 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_table_properties_scanner.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#pragma once +#include + +#include + +#include "common/status.h" +#include "exec/schema_scanner.h" + +namespace doris { +class RuntimeState; +namespace vectorized { +class Block; +} // namespace vectorized + +class SchemaTablePropertiesScanner : public SchemaScanner { + ENABLE_FACTORY_CREATOR(SchemaTablePropertiesScanner); + +public: + SchemaTablePropertiesScanner(); + ~SchemaTablePropertiesScanner() override = default; + + Status start(RuntimeState* state) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; + + static std::vector _s_tbls_columns; + +private: + Status get_onedb_info_from_fe(int64_t dbId); + bool check_and_mark_eos(bool* eos) const; + int _block_rows_limit = 4096; + int _row_idx = 0; + int _total_rows = 0; + int _db_index = 0; + TGetDbsResult _db_result; + std::unique_ptr _tableproperties_block = nullptr; + int _rpc_timeout_ms = 3000; +}; +}; // namespace doris diff --git a/be/src/exec/schema_scanner/schema_tables_scanner.cpp b/be/src/exec/schema_scanner/schema_tables_scanner.cpp index 093acf9cecbcb1f..23710b81971c151 100644 --- a/be/src/exec/schema_scanner/schema_tables_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_tables_scanner.cpp @@ -342,7 +342,7 @@ Status SchemaTablesScanner::_fill_block_impl(vectorized::Block* block) { return Status::OK(); } -Status SchemaTablesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaTablesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_tables_scanner.h b/be/src/exec/schema_scanner/schema_tables_scanner.h index 11a96bf65d5271b..7f8eb11f397e060 100644 --- a/be/src/exec/schema_scanner/schema_tables_scanner.h +++ b/be/src/exec/schema_scanner/schema_tables_scanner.h @@ -39,7 +39,7 @@ class SchemaTablesScanner : public SchemaScanner { ~SchemaTablesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp index 6a12d846fbd560a..3eeabc0e4a09177 100644 --- a/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp @@ -81,7 +81,7 @@ Status SchemaUserPrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaUserPrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaUserPrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_user_privileges_scanner.h b/be/src/exec/schema_scanner/schema_user_privileges_scanner.h index eb8f3c63f1433bf..ffc3840db676c48 100644 --- a/be/src/exec/schema_scanner/schema_user_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_user_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaUserPrivilegesScanner : public SchemaScanner { ~SchemaUserPrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git 
a/be/src/exec/schema_scanner/schema_user_scanner.cpp b/be/src/exec/schema_scanner/schema_user_scanner.cpp index 9b153414380350a..e56f18f05aea930 100644 --- a/be/src/exec/schema_scanner/schema_user_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_user_scanner.cpp @@ -76,7 +76,7 @@ Status SchemaUserScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaUserScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaUserScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_user_scanner.h b/be/src/exec/schema_scanner/schema_user_scanner.h index c55f216804d5ddc..bdc618eb5a03327 100644 --- a/be/src/exec/schema_scanner/schema_user_scanner.h +++ b/be/src/exec/schema_scanner/schema_user_scanner.h @@ -40,7 +40,7 @@ class SchemaUserScanner : public SchemaScanner { ~SchemaUserScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_user_columns; diff --git a/be/src/exec/schema_scanner/schema_variables_scanner.cpp b/be/src/exec/schema_scanner/schema_variables_scanner.cpp index 546a0a471cfb016..ad4d5d072cb03f4 100644 --- a/be/src/exec/schema_scanner/schema_variables_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_variables_scanner.cpp @@ -40,7 +40,8 @@ std::vector SchemaVariablesScanner::_s_vars_columns = // name, type, size {"VARIABLE_NAME", TYPE_VARCHAR, sizeof(StringRef), false}, {"VARIABLE_VALUE", TYPE_VARCHAR, sizeof(StringRef), false}, -}; + {"DEFAULT_VALUE", TYPE_VARCHAR, sizeof(StringRef), false}, + {"CHANGED", TYPE_VARCHAR, sizeof(StringRef), false}}; SchemaVariablesScanner::SchemaVariablesScanner(TVarType::type type) : SchemaScanner(_s_vars_columns, TSchemaTableType::SCH_VARIABLES), _type(type) {} @@ -70,7 +71,7 @@ Status SchemaVariablesScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaVariablesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaVariablesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } @@ -94,7 +95,7 @@ Status SchemaVariablesScanner::_fill_block_impl(vectorized::Block* block) { std::vector strs(row_num); int idx = 0; for (auto& it : _var_result.variables) { - strs[idx] = StringRef(it.first.c_str(), it.first.size()); + strs[idx] = StringRef(it[0].c_str(), it[0].size()); datas[idx] = strs.data() + idx; ++idx; } @@ -105,12 +106,34 @@ Status SchemaVariablesScanner::_fill_block_impl(vectorized::Block* block) { std::vector strs(row_num); int idx = 0; for (auto& it : _var_result.variables) { - strs[idx] = StringRef(it.second.c_str(), it.second.size()); + strs[idx] = StringRef(it[1].c_str(), it[1].size()); datas[idx] = strs.data() + idx; ++idx; } RETURN_IF_ERROR(fill_dest_column_for_range(block, 1, datas)); } + // default value + { + std::vector strs(row_num); + int idx = 0; + for (auto& it : _var_result.variables) { + strs[idx] = StringRef(it[2].c_str(), it[2].size()); + datas[idx] = strs.data() + idx; + ++idx; + } + RETURN_IF_ERROR(fill_dest_column_for_range(block, 2, datas)); + } + // changed + { + std::vector strs(row_num); + int idx = 0; + for (auto& it : _var_result.variables) { + strs[idx] = StringRef(it[3].c_str(), it[3].size()); + datas[idx] = strs.data() + idx; + 
++idx; + } + RETURN_IF_ERROR(fill_dest_column_for_range(block, 3, datas)); + } return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_variables_scanner.h b/be/src/exec/schema_scanner/schema_variables_scanner.h index 2d207ff8b2e6c22..31bbacf713be0f9 100644 --- a/be/src/exec/schema_scanner/schema_variables_scanner.h +++ b/be/src/exec/schema_scanner/schema_variables_scanner.h @@ -40,7 +40,7 @@ class SchemaVariablesScanner : public SchemaScanner { ~SchemaVariablesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct VariableStruct { diff --git a/be/src/exec/schema_scanner/schema_views_scanner.cpp b/be/src/exec/schema_scanner/schema_views_scanner.cpp index 6c3b5f2e21bc3a4..f47766ef3567adb 100644 --- a/be/src/exec/schema_scanner/schema_views_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_views_scanner.cpp @@ -113,7 +113,7 @@ Status SchemaViewsScanner::_get_new_table() { return Status::OK(); } -Status SchemaViewsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaViewsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_views_scanner.h b/be/src/exec/schema_scanner/schema_views_scanner.h index bc473057905a129..b86ad922e5e76a7 100644 --- a/be/src/exec/schema_scanner/schema_views_scanner.h +++ b/be/src/exec/schema_scanner/schema_views_scanner.h @@ -38,7 +38,7 @@ class SchemaViewsScanner : public SchemaScanner { ~SchemaViewsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp b/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp new file mode 100644 index 000000000000000..a1d4568d9053cd4 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_workload_group_privileges.cpp @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
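The variables result rows changed from a name/value pair to a four-entry row (name, value, default value, changed flag), and the four fill blocks above differ only in the column index. A loop over the column index would keep them in sync if more columns are added; a sketch with hypothetical names (fill_columns is not the scanner's API, and `fill` stands in for fill_dest_column_for_range):

#include <functional>
#include <string>
#include <vector>

// Each row carries num_cols strings (name, value, default, changed).
// Filling column-by-column in one loop replaces the four copy-pasted
// blocks in the patch.
void fill_columns(
        const std::vector<std::vector<std::string>>& rows, int num_cols,
        const std::function<void(int, const std::vector<const std::string*>&)>& fill) {
    for (int col = 0; col < num_cols; ++col) {
        std::vector<const std::string*> datas;
        datas.reserve(rows.size());
        for (const auto& row : rows) {
            datas.push_back(&row[col]); // each row is assumed to have num_cols entries
        }
        fill(col, datas);
    }
}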
+ +#include "exec/schema_scanner/schema_workload_group_privileges.h" + +#include "runtime/client_cache.h" +#include "runtime/exec_env.h" +#include "runtime/runtime_state.h" +#include "util/thrift_rpc_helper.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" + +namespace doris { +std::vector SchemaWorkloadGroupPrivilegesScanner::_s_tbls_columns = { + {"GRANTEE", TYPE_VARCHAR, sizeof(StringRef), true}, + {"WORKLOAD_GROUP_NAME", TYPE_VARCHAR, sizeof(StringRef), true}, + {"PRIVILEGE_TYPE", TYPE_VARCHAR, sizeof(StringRef), true}, + {"IS_GRANTABLE", TYPE_VARCHAR, sizeof(StringRef), true}, +}; + +SchemaWorkloadGroupPrivilegesScanner::SchemaWorkloadGroupPrivilegesScanner() + : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_WORKLOAD_GROUPS) {} + +SchemaWorkloadGroupPrivilegesScanner::~SchemaWorkloadGroupPrivilegesScanner() {} + +Status SchemaWorkloadGroupPrivilegesScanner::start(RuntimeState* state) { + _block_rows_limit = state->batch_size(); + _rpc_timeout = state->execution_timeout() * 1000; + return Status::OK(); +} + +Status SchemaWorkloadGroupPrivilegesScanner::_get_workload_group_privs_block_from_fe() { + TNetworkAddress master_addr = ExecEnv::GetInstance()->master_info()->network_address; + + TSchemaTableRequestParams schema_table_request_params; + for (int i = 0; i < _s_tbls_columns.size(); i++) { + schema_table_request_params.__isset.columns_name = true; + schema_table_request_params.columns_name.emplace_back(_s_tbls_columns[i].name); + } + schema_table_request_params.__set_current_user_ident(*_param->common_param->current_user_ident); + + TFetchSchemaTableDataRequest request; + request.__set_schema_table_name(TSchemaTableName::WORKLOAD_GROUP_PRIVILEGES); + request.__set_schema_table_params(schema_table_request_params); + + TFetchSchemaTableDataResult result; + + RETURN_IF_ERROR(ThriftRpcHelper::rpc( + master_addr.hostname, master_addr.port, + [&request, &result](FrontendServiceConnection& client) { + client->fetchSchemaTableData(result, request); + }, + _rpc_timeout)); + + Status status(Status::create(result.status)); + if (!status.ok()) { + LOG(WARNING) << "fetch workload group privileges from FE failed, errmsg=" << status; + return status; + } + std::vector result_data = result.data_batch; + + _workload_groups_privs_block = vectorized::Block::create_unique(); + for (int i = 0; i < _s_tbls_columns.size(); ++i) { + TypeDescriptor descriptor(_s_tbls_columns[i].type); + auto data_type = vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + _workload_groups_privs_block->insert(vectorized::ColumnWithTypeAndName( + data_type->create_column(), data_type, _s_tbls_columns[i].name)); + } + + if (result_data.size() > 0) { + int col_size = result_data[0].column_value.size(); + if (col_size != _s_tbls_columns.size()) { + return Status::InternalError( + "workload group privileges schema is not match for FE and BE"); + } + } + + _workload_groups_privs_block->reserve(result_data.size()); + + for (int i = 0; i < result_data.size(); i++) { + TRow row = result_data[i]; + + for (int j = 0; j < _s_tbls_columns.size(); j++) { + RETURN_IF_ERROR(insert_block_column(row.column_value[j], j, + _workload_groups_privs_block.get(), + _s_tbls_columns[j].type)); + } + } + return Status::OK(); +} + +Status SchemaWorkloadGroupPrivilegesScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { + if (!_is_init) { + return Status::InternalError("Used before initialized."); + } + + if (nullptr == block || nullptr 
== eos) { + return Status::InternalError("input pointer is nullptr."); + } + + if (_workload_groups_privs_block == nullptr) { + RETURN_IF_ERROR(_get_workload_group_privs_block_from_fe()); + _total_rows = _workload_groups_privs_block->rows(); + } + + if (_row_idx == _total_rows) { + *eos = true; + return Status::OK(); + } + + int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); + vectorized::MutableBlock mblock = vectorized::MutableBlock::build_mutable_block(block); + RETURN_IF_ERROR( + mblock.add_rows(_workload_groups_privs_block.get(), _row_idx, current_batch_rows)); + _row_idx += current_batch_rows; + + *eos = _row_idx == _total_rows; + return Status::OK(); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/exec/schema_scanner/schema_workload_group_privileges.h b/be/src/exec/schema_scanner/schema_workload_group_privileges.h new file mode 100644 index 000000000000000..0a7bf1258eed1f1 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_workload_group_privileges.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "common/status.h" +#include "exec/schema_scanner.h" + +namespace doris { +class RuntimeState; +namespace vectorized { +class Block; +} // namespace vectorized + +class SchemaWorkloadGroupPrivilegesScanner : public SchemaScanner { + ENABLE_FACTORY_CREATOR(SchemaWorkloadGroupPrivilegesScanner); + +public: + SchemaWorkloadGroupPrivilegesScanner(); + ~SchemaWorkloadGroupPrivilegesScanner() override; + + Status start(RuntimeState* state) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; + + static std::vector _s_tbls_columns; + +private: + Status _get_workload_group_privs_block_from_fe(); + + int _block_rows_limit = 4096; + int _row_idx = 0; + int _total_rows = 0; + std::unique_ptr _workload_groups_privs_block = nullptr; + int _rpc_timeout = 3000; +}; +}; // namespace doris \ No newline at end of file diff --git a/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.cpp new file mode 100644 index 000000000000000..ca339044e98a5f5 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.cpp @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/schema_scanner/schema_workload_group_resource_usage_scanner.h" + +#include +#include + +#include "runtime/exec_env.h" +#include "runtime/runtime_state.h" +#include "runtime/workload_group/workload_group_manager.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" + +namespace doris { +std::vector SchemaBackendWorkloadGroupResourceUsage::_s_tbls_columns = { + // name, type, size + {"BE_ID", TYPE_BIGINT, sizeof(int64_t), false}, + {"WORKLOAD_GROUP_ID", TYPE_BIGINT, sizeof(int64_t), false}, + {"MEMORY_USAGE_BYTES", TYPE_BIGINT, sizeof(int64_t), false}, + {"CPU_USAGE_PERCENT", TYPE_DOUBLE, sizeof(double), false}, + {"LOCAL_SCAN_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), false}, + {"REMOTE_SCAN_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), false}, +}; + +SchemaBackendWorkloadGroupResourceUsage::SchemaBackendWorkloadGroupResourceUsage() + : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_WORKLOAD_GROUP_RESOURCE_USAGE) {} + +SchemaBackendWorkloadGroupResourceUsage::~SchemaBackendWorkloadGroupResourceUsage() {} + +Status SchemaBackendWorkloadGroupResourceUsage::start(RuntimeState* state) { + _block_rows_limit = state->batch_size(); + return Status::OK(); +} + +Status SchemaBackendWorkloadGroupResourceUsage::get_next_block_internal(vectorized::Block* block, + bool* eos) { + if (!_is_init) { + return Status::InternalError("Used before initialized."); + } + + if (nullptr == block || nullptr == eos) { + return Status::InternalError("input pointer is nullptr."); + } + + if (_block == nullptr) { + _block = vectorized::Block::create_unique(); + + for (int i = 0; i < _s_tbls_columns.size(); ++i) { + TypeDescriptor descriptor(_s_tbls_columns[i].type); + auto data_type = + vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + _block->insert(vectorized::ColumnWithTypeAndName(data_type->create_column(), data_type, + _s_tbls_columns[i].name)); + } + + ExecEnv::GetInstance()->workload_group_mgr()->get_wg_resource_usage(_block.get()); + _total_rows = _block->rows(); + } + + if (_row_idx == _total_rows) { + *eos = true; + return Status::OK(); + } + + int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); + vectorized::MutableBlock mblock = vectorized::MutableBlock::build_mutable_block(block); + RETURN_IF_ERROR(mblock.add_rows(_block.get(), _row_idx, current_batch_rows)); + _row_idx += current_batch_rows; + + *eos = _row_idx == _total_rows; + return Status::OK(); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.h b/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.h new file mode 100644 index 000000000000000..236dd69999fbb37 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_workload_group_resource_usage_scanner.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <vector> + +#include "common/status.h" +#include "exec/schema_scanner.h" + +namespace doris { +class RuntimeState; +namespace vectorized { +class Block; +} // namespace vectorized + +class SchemaBackendWorkloadGroupResourceUsage : public SchemaScanner { + ENABLE_FACTORY_CREATOR(SchemaBackendWorkloadGroupResourceUsage); + +public: + SchemaBackendWorkloadGroupResourceUsage(); + ~SchemaBackendWorkloadGroupResourceUsage() override; + + Status start(RuntimeState* state) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; + + static std::vector<SchemaScanner::ColumnDesc> _s_tbls_columns; + +private: + int _block_rows_limit = 4096; + int _row_idx = 0; + int _total_rows = 0; + std::unique_ptr<vectorized::Block> _block = nullptr; +}; +}; // namespace doris \ No newline at end of file diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp index ad9be85ad2e832f..dd81a3ecb267dac 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp @@ -116,7 +116,7 @@ Status SchemaWorkloadGroupsScanner::_get_workload_groups_block_from_fe() { return Status::OK(); } -Status SchemaWorkloadGroupsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaWorkloadGroupsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.h b/be/src/exec/schema_scanner/schema_workload_groups_scanner.h index bf7a103526dc803..3121c4dbac149e2 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.h +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.h @@ -36,7 +36,7 @@ class SchemaWorkloadGroupsScanner : public SchemaScanner { ~SchemaWorkloadGroupsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector<SchemaScanner::ColumnDesc> _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp index 035d3bfe217aece..2d91f151f5f2bb7 100644 --- a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp @@ -106,7 +106,8 @@ Status SchemaWorkloadSchedulePolicyScanner::_get_workload_schedule_policy_block_ return Status::OK(); } -Status SchemaWorkloadSchedulePolicyScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaWorkloadSchedulePolicyScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { if
(!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h index 5284975fe66b314..da8d9f15c4989ec 100644 --- a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h +++ b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h @@ -36,7 +36,7 @@ class SchemaWorkloadSchedulePolicyScanner : public SchemaScanner { ~SchemaWorkloadSchedulePolicyScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector<SchemaScanner::ColumnDesc> _s_tbls_columns; diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index a8efbd338a32aaf..3d73bf1bd886de8 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -515,6 +515,11 @@ static Status _create_partition_key(const TExprNode& t_expr, BlockRow* part_key, } case TExprNodeType::NULL_LITERAL: { // insert a null literal + if (!column->is_nullable()) { + // https://github.com/apache/doris/pull/39449 has already forbidden this case; keep this check as a protective measure + return Status::InternalError("The column {} is not nullable, can't insert a NULL value.", + part_key->first->get_by_position(pos).name); + } column->insert_data(nullptr, 0); break; } diff --git a/be/src/exprs/block_bloom_filter.hpp b/be/src/exprs/block_bloom_filter.hpp index f31d7f7d4c0517a..b7d488a3003c3d4 100644 --- a/be/src/exprs/block_bloom_filter.hpp +++ b/be/src/exprs/block_bloom_filter.hpp @@ -124,6 +124,39 @@ class BlockBloomFilter { return false; } +#ifdef __ARM_NEON + void make_find_mask(uint32_t key, uint32x4_t* masks) const noexcept { + uint32x4_t hash_data_1 = vdupq_n_u32(key); + uint32x4_t hash_data_2 = vdupq_n_u32(key); + + uint32x4_t rehash_1 = vld1q_u32(&kRehash[0]); + uint32x4_t rehash_2 = vld1q_u32(&kRehash[4]); + + // masks[i] = key * kRehash[i]; + hash_data_1 = vmulq_u32(rehash_1, hash_data_1); + hash_data_2 = vmulq_u32(rehash_2, hash_data_2); + // masks[i] = masks[i] >> shift_num; + hash_data_1 = vshrq_n_u32(hash_data_1, shift_num); + hash_data_2 = vshrq_n_u32(hash_data_2, shift_num); + + const uint32x4_t ones = vdupq_n_u32(1); + + // masks[i] = 0x1 << masks[i]; + masks[0] = vshlq_u32(ones, reinterpret_cast<int32x4_t>(hash_data_1)); + masks[1] = vshlq_u32(ones, reinterpret_cast<int32x4_t>(hash_data_2)); + } +#else + void make_find_mask(uint32_t key, uint32_t* masks) const noexcept { + for (int i = 0; i < kBucketWords; ++i) { + masks[i] = key * kRehash[i]; + + masks[i] = masks[i] >> shift_num; + + masks[i] = 0x1 << masks[i]; + } + } +#endif + // Computes the logical OR of this filter with 'other' and stores the result in this // filter. // Notes: @@ -163,7 +196,8 @@ class BlockBloomFilter { // log2(number of bits in a BucketWord) static constexpr int kLogBucketWordBits = 5; static constexpr BucketWord kBucketWordMask = (1 << kLogBucketWordBits) - 1; - + // (>> 27) keeps the top 5 bits of the 32-bit product, i.e. a bit index in [0, 32); it is equivalent to (mod 32) + static constexpr auto shift_num = ((1 << kLogBucketWordBits) - kLogBucketWordBits); // log2(number of bytes in a bucket) static constexpr int kLogBucketByteSize = 5; // Bucket size in bytes.
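For reference, the scalar path above is equivalent to the following minimal, self-contained sketch of a split-block Bloom filter probe. This is an illustrative aside, not part of the patch: the kRehash constants below are stand-ins for the real table defined elsewhere in block_bloom_filter.hpp, and the names live in a throwaway namespace.

#include <cstdint>

namespace sketch {
constexpr int kBucketWords = 8;
constexpr int kLogBucketWordBits = 5;              // a BucketWord has 32 bits
constexpr int shift_num = 32 - kLogBucketWordBits; // 27: keep the top 5 bits of the product

// Stand-in odd multipliers; the real kRehash table is defined in block_bloom_filter.hpp.
constexpr uint32_t kRehash[kBucketWords] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
                                            0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};

// One mask per 32-bit word: multiply-shift yields a bit index in [0, 32),
// which is what the "(>> 27) is equivalent to (mod 32)" comment refers to.
inline void make_find_mask(uint32_t key, uint32_t* masks) {
    for (int i = 0; i < kBucketWords; ++i) {
        masks[i] = 1U << ((key * kRehash[i]) >> shift_num);
    }
}

// The key can be present only if every selected bit is set in its word;
// the NEON path computes the same predicate four words at a time.
inline bool bucket_find(const uint32_t (&bucket)[kBucketWords], uint32_t hash) {
    uint32_t masks[kBucketWords];
    make_find_mask(hash, masks);
    for (int i = 0; i < kBucketWords; ++i) {
        if ((bucket[i] & masks[i]) == 0) {
            return false;
        }
    }
    return true;
}
} // namespace sketch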
diff --git a/be/src/exprs/block_bloom_filter_impl.cc b/be/src/exprs/block_bloom_filter_impl.cc index d285edcb310e1b4..e89b91422662a95 100644 --- a/be/src/exprs/block_bloom_filter_impl.cc +++ b/be/src/exprs/block_bloom_filter_impl.cc @@ -138,14 +138,37 @@ void BlockBloomFilter::bucket_insert(const uint32_t bucket_idx, const uint32_t h } bool BlockBloomFilter::bucket_find(const uint32_t bucket_idx, const uint32_t hash) const noexcept { +#if defined(__ARM_NEON) + uint32x4_t masks[2]; + + uint32x4_t directory_1 = vld1q_u32(&_directory[bucket_idx][0]); + uint32x4_t directory_2 = vld1q_u32(&_directory[bucket_idx][4]); + + make_find_mask(hash, masks); + // The condition for returning true is that all the bits in _directory[bucket_idx][i] specified by masks[i] are 1. + // This can be equivalently expressed as all the bits in not( _directory[bucket_idx][i]) specified by masks[i] are 0. + // vbicq_u32(vec1, vec2) : Result of (vec1 AND NOT vec2) + // If true is returned, out_1 and out_2 should be all zeros. + uint32x4_t out_1 = vbicq_u32(masks[0], directory_1); + uint32x4_t out_2 = vbicq_u32(masks[1], directory_2); + + out_1 = vorrq_u32(out_1, out_2); + + uint32x2_t low = vget_low_u32(out_1); + uint32x2_t high = vget_high_u32(out_1); + low = vorr_u32(low, high); + uint32_t res = vget_lane_u32(low, 0) | vget_lane_u32(low, 1); + return !(res); +#else + uint32_t masks[kBucketWords]; + make_find_mask(hash, masks); for (int i = 0; i < kBucketWords; ++i) { - BucketWord hval = (kRehash[i] * hash) >> ((1 << kLogBucketWordBits) - kLogBucketWordBits); - hval = 1U << hval; - if (!(DCHECK_NOTNULL(_directory)[bucket_idx][i] & hval)) { + if ((DCHECK_NOTNULL(_directory)[bucket_idx][i] & masks[i]) == 0) { return false; } } return true; +#endif } void BlockBloomFilter::insert_no_avx2(const uint32_t hash) noexcept { diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index e88f692a23db8a6..6d452bbe9922dc4 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -100,14 +100,14 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { virtual ~BloomFilterFuncBase() = default; void init_params(const RuntimeFilterParams* params) { - _bloom_filter_length = - params->runtime_bloom_filter_min_size > 0 - ? std::max(params->bloom_filter_size, params->runtime_bloom_filter_min_size) - : params->bloom_filter_size; + _bloom_filter_length = params->bloom_filter_size; + _build_bf_exactly = params->build_bf_exactly; _runtime_bloom_filter_min_size = params->runtime_bloom_filter_min_size; + _runtime_bloom_filter_max_size = params->runtime_bloom_filter_max_size; _null_aware = params->null_aware; _bloom_filter_size_calculated_by_ndv = params->bloom_filter_size_calculated_by_ndv; + _limit_length(); } Status init_with_fixed_length() { return init_with_fixed_length(_bloom_filter_length); } @@ -128,17 +128,11 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { // if FE do use ndv stat to predict the bf size, BE only use the row count. FE have more // exactly row count stat. which one is min is more correctly. if (_bloom_filter_size_calculated_by_ndv) { - _bloom_filter_length = - _runtime_bloom_filter_min_size > 0 - ? std::max(_runtime_bloom_filter_min_size, - std::min(be_calculate_size, _bloom_filter_length)) - : std::min(be_calculate_size, _bloom_filter_length); + _bloom_filter_length = std::min(be_calculate_size, _bloom_filter_length); } else { - _bloom_filter_length = - _runtime_bloom_filter_min_size > 0 - ? 
std::max(_runtime_bloom_filter_min_size, be_calculate_size) - : be_calculate_size; + _bloom_filter_length = be_calculate_size; } + _limit_length(); } return init_with_fixed_length(_bloom_filter_length); } @@ -194,6 +188,7 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { } _bloom_filter_alloced = data_size; + _inited = true; return _bloom_filter->init(data, data_size); } @@ -228,6 +223,16 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { uint16_t* offsets, int number, bool is_parse_column) = 0; +private: + void _limit_length() { + if (_runtime_bloom_filter_min_size > 0) { + _bloom_filter_length = std::max(_bloom_filter_length, _runtime_bloom_filter_min_size); + } + if (_runtime_bloom_filter_max_size > 0) { + _bloom_filter_length = std::min(_bloom_filter_length, _runtime_bloom_filter_max_size); + } + } + protected: // bloom filter size int32_t _bloom_filter_alloced; @@ -235,6 +240,7 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { bool _inited = false; int64_t _bloom_filter_length; int64_t _runtime_bloom_filter_min_size; + int64_t _runtime_bloom_filter_max_size; bool _build_bf_exactly = false; bool _bloom_filter_size_calculated_by_ndv = false; }; diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h index b75cc81ebf1f144..f0977a652b1cbe6 100644 --- a/be/src/exprs/hybrid_set.h +++ b/be/src/exprs/hybrid_set.h @@ -17,7 +17,13 @@ #pragma once +#include + +#include + +#include "common/exception.h" #include "common/object_pool.h" +#include "common/status.h" #include "exprs/runtime_filter.h" #include "runtime/decimalv2_value.h" #include "runtime/define_primitive_type.h" @@ -60,8 +66,16 @@ class FixedContainer { } } + void check_size() { + if (N != _size) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "invalid size of FixedContainer<{}>: {}", N, _size); + } + } + // Use '|' instead of '||' has better performance by test. ALWAYS_INLINE bool find(const T& value) const { + DCHECK_EQ(N, _size); if constexpr (N == 0) { return false; } @@ -144,6 +158,12 @@ class FixedContainer { size_t _size {}; }; +template +struct IsFixedContainer : std::false_type {}; + +template +struct IsFixedContainer> : std::true_type {}; + /** * Dynamic Container uses phmap::flat_hash_set. 
* @tparam T Element Type @@ -354,6 +374,11 @@ class HybridSet : public HybridSetBase { if constexpr (is_nullable) { null_map_data = null_map->data(); } + + if constexpr (IsFixedContainer::value) { + _set.check_size(); + } + auto* __restrict result_data = results.data(); for (size_t i = 0; i < rows; ++i) { if constexpr (!is_nullable && !is_negative) { @@ -507,6 +532,11 @@ class StringSet : public HybridSetBase { if constexpr (is_nullable) { null_map_data = null_map->data(); } + + if constexpr (IsFixedContainer::value) { + _set.check_size(); + } + auto* __restrict result_data = results.data(); for (size_t i = 0; i < rows; ++i) { const auto& string_data = col.get_data_at(i).to_string(); @@ -675,6 +705,11 @@ class StringValueSet : public HybridSetBase { if constexpr (is_nullable) { null_map_data = null_map->data(); } + + if constexpr (IsFixedContainer::value) { + _set.check_size(); + } + auto* __restrict result_data = results.data(); for (size_t i = 0; i < rows; ++i) { uint32_t len = offset[i] - offset[i - 1]; diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 205ee5a5d20b926..7bbb5493d8127e7 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include // IWYU pragma: keep #include @@ -259,13 +260,17 @@ Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, const std::vector& jsonpath, simdjson::ondemand::value* value) noexcept { // Return DataQualityError when it's a malformed json. -// Otherwise the path was not found, due to array out of bound or not exist +// Otherwise the path was not found, due to +// 1. array out of bound +// 2. not exist such field in object +// 3. the input type is not object but could be null or other types and lead to simdjson::INCORRECT_TYPE #define HANDLE_SIMDJSON_ERROR(err, msg) \ do { \ const simdjson::error_code& _err = err; \ const std::string& _msg = msg; \ if (UNLIKELY(_err)) { \ - if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS) { \ + if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS || \ + _err == simdjson::INCORRECT_TYPE) { \ return Status::NotFound( \ fmt::format("Not found target filed, err: {}, msg: {}", \ simdjson::error_message(_err), _msg)); \ @@ -353,4 +358,9 @@ void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value } } +// root path "$." 
+bool JsonFunctions::is_root_path(const std::vector& json_path) { + return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty(); +} + } // namespace doris diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h index 72aa522ff374fa9..11970eb8c46c565 100644 --- a/be/src/exprs/json_functions.h +++ b/be/src/exprs/json_functions.h @@ -116,6 +116,8 @@ class JsonFunctions { static std::string print_json_value(const rapidjson::Value& value); + static bool is_root_path(const std::vector& json_path); + private: static rapidjson::Value* match_value(const std::vector& parsed_paths, rapidjson::Value* document, diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index f61cebc8c054bcd..93d9c159759b0e8 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -694,8 +694,10 @@ class RuntimePredicateWrapper { case TYPE_CHAR: case TYPE_STRING: { batch_assign(in_filter, [](std::shared_ptr& set, PColumnValue& column) { - const auto& string_val_ref = column.stringval(); - set->insert(&string_val_ref); + const std::string& string_value = column.stringval(); + // string_value is std::string, call insert(data, size) function in StringSet will not cast as StringRef + // so could avoid some cast error at different class object. + set->insert((void*)string_value.data(), string_value.size()); }); break; } @@ -1005,7 +1007,10 @@ Status IRuntimeFilter::publish(bool publish_local) { class SyncSizeClosure : public AutoReleaseClosure> { std::shared_ptr _dependency; - RuntimeFilterContextSPtr _rf_context; + // Should use weak ptr here, because when query context deconstructs, should also delete runtime filter + // context, it not the memory is not released. And rpc is in another thread, it will hold rf context + // after query context because the rpc is not returned. + std::weak_ptr _rf_context; std::string _rf_debug_info; using Base = AutoReleaseClosure>; @@ -1021,7 +1026,13 @@ class SyncSizeClosure : public AutoReleaseClosuresub(); if (status.is()) { // rf merger backend may finished before rf's send_filter_size, we just ignore filter in this case. - _rf_context->ignored = true; + auto ctx = _rf_context.lock(); + if (ctx) { + ctx->ignored = true; + } else { + LOG(WARNING) << "sync filter size returned but context is released, filter=" + << _rf_debug_info; + } } else { LOG(WARNING) << "sync filter size meet error status, filter=" << _rf_debug_info; Base::_process_if_meet_error_status(status); @@ -1172,33 +1183,6 @@ Status IRuntimeFilter::get_push_expr_ctxs(std::listexecution_timeout * 1000; - auto runtime_filter_wait_time_ms = _state->runtime_filter_wait_time_ms; - // bitmap filter is precise filter and only filter once, so it must be applied. - int64_t wait_times_ms = _wrapper->get_real_type() == RuntimeFilterType::BITMAP_FILTER - ? execution_timeout - : runtime_filter_wait_time_ms; - auto expected = _rf_state_atomic.load(std::memory_order_acquire); - if (expected == RuntimeFilterState::NOT_READY) { - if (!_rf_state_atomic.compare_exchange_strong( - expected, - MonotonicMillis() - registration_time_ >= wait_times_ms - ? 
RuntimeFilterState::TIME_OUT - : RuntimeFilterState::NOT_READY, - std::memory_order_acq_rel)) { - DCHECK(expected == RuntimeFilterState::READY || - expected == RuntimeFilterState::TIME_OUT); - return (expected == RuntimeFilterState::READY); - } - return false; - } else if (expected == RuntimeFilterState::TIME_OUT) { - return false; - } - return true; -} - void IRuntimeFilter::update_state() { DCHECK(is_consumer()); auto execution_timeout = _state->execution_timeout * 1000; @@ -1307,6 +1291,9 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue params.runtime_bloom_filter_min_size = options->__isset.runtime_bloom_filter_min_size ? options->runtime_bloom_filter_min_size : 0; + params.runtime_bloom_filter_max_size = options->__isset.runtime_bloom_filter_max_size + ? options->runtime_bloom_filter_max_size + : 0; // We build runtime filter by exact distinct count iff three conditions are met: // 1. Only 1 join key // 2. Do not have remote target (e.g. do not need to merge), or broadcast join @@ -1645,8 +1632,10 @@ void IRuntimeFilter::to_protobuf(PInFilter* filter) { case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_STRING: { - batch_copy(filter, it, [](PColumnValue* column, const std::string* value) { - column->set_stringval(*value); + //const void* void_value = it->get_value(); + //Now the get_value return void* is StringRef + batch_copy(filter, it, [](PColumnValue* column, const StringRef* value) { + column->set_stringval(value->to_string()); }); return; } @@ -1666,8 +1655,8 @@ void IRuntimeFilter::to_protobuf(PMinMaxFilter* filter) { switch (_wrapper->column_type()) { case TYPE_BOOLEAN: { - filter->mutable_min_val()->set_boolval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_boolval(*reinterpret_cast(max_data)); + filter->mutable_min_val()->set_boolval(*reinterpret_cast(min_data)); + filter->mutable_max_val()->set_boolval(*reinterpret_cast(max_data)); return; } case TYPE_TINYINT: { @@ -1844,7 +1833,9 @@ Status RuntimePredicateWrapper::get_push_exprs( node.__set_is_nullable(false); auto in_pred = vectorized::VDirectInPredicate::create_shared(node, _context->hybrid_set); in_pred->add_child(probe_ctx->root()); - auto wrapper = vectorized::VRuntimeFilterWrapper::create_shared(node, in_pred, null_aware); + auto wrapper = vectorized::VRuntimeFilterWrapper::create_shared( + node, in_pred, get_in_list_ignore_thredhold(_context->hybrid_set->size()), + null_aware); container.push_back(wrapper); break; } @@ -1859,8 +1850,8 @@ Status RuntimePredicateWrapper::get_push_exprs( min_literal)); min_pred->add_child(probe_ctx->root()); min_pred->add_child(min_literal); - container.push_back( - vectorized::VRuntimeFilterWrapper::create_shared(min_pred_node, min_pred)); + container.push_back(vectorized::VRuntimeFilterWrapper::create_shared( + min_pred_node, min_pred, get_comparison_ignore_thredhold())); break; } case RuntimeFilterType::MAX_FILTER: { @@ -1874,8 +1865,8 @@ Status RuntimePredicateWrapper::get_push_exprs( max_literal)); max_pred->add_child(probe_ctx->root()); max_pred->add_child(max_literal); - container.push_back( - vectorized::VRuntimeFilterWrapper::create_shared(max_pred_node, max_pred)); + container.push_back(vectorized::VRuntimeFilterWrapper::create_shared( + max_pred_node, max_pred, get_comparison_ignore_thredhold())); break; } case RuntimeFilterType::MINMAX_FILTER: { @@ -1889,8 +1880,8 @@ Status RuntimePredicateWrapper::get_push_exprs( max_literal)); max_pred->add_child(probe_ctx->root()); max_pred->add_child(max_literal); - 
container.push_back(vectorized::VRuntimeFilterWrapper::create_shared(max_pred_node, - max_pred, null_aware)); + container.push_back(vectorized::VRuntimeFilterWrapper::create_shared( + max_pred_node, max_pred, get_comparison_ignore_thredhold(), null_aware)); vectorized::VExprContextSPtr new_probe_ctx; RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(probe_expr, new_probe_ctx)); @@ -1906,8 +1897,8 @@ Status RuntimePredicateWrapper::get_push_exprs( _context->minmax_func->get_min(), min_literal)); min_pred->add_child(new_probe_ctx->root()); min_pred->add_child(min_literal); - container.push_back(vectorized::VRuntimeFilterWrapper::create_shared(min_pred_node, - min_pred, null_aware)); + container.push_back(vectorized::VRuntimeFilterWrapper::create_shared( + min_pred_node, min_pred, get_comparison_ignore_thredhold(), null_aware)); break; } case RuntimeFilterType::BLOOM_FILTER: { @@ -1922,7 +1913,8 @@ Status RuntimePredicateWrapper::get_push_exprs( auto bloom_pred = vectorized::VBloomPredicate::create_shared(node); bloom_pred->set_filter(_context->bloom_filter_func); bloom_pred->add_child(probe_ctx->root()); - auto wrapper = vectorized::VRuntimeFilterWrapper::create_shared(node, bloom_pred); + auto wrapper = vectorized::VRuntimeFilterWrapper::create_shared( + node, bloom_pred, get_bloom_filter_ignore_thredhold()); container.push_back(wrapper); break; } @@ -1938,7 +1930,7 @@ Status RuntimePredicateWrapper::get_push_exprs( auto bitmap_pred = vectorized::VBitmapPredicate::create_shared(node); bitmap_pred->set_filter(_context->bitmap_filter_func); bitmap_pred->add_child(probe_ctx->root()); - auto wrapper = vectorized::VRuntimeFilterWrapper::create_shared(node, bitmap_pred); + auto wrapper = vectorized::VRuntimeFilterWrapper::create_shared(node, bitmap_pred, 0); container.push_back(wrapper); break; } diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 9bf27025876f155..c4a38517ab4ba04 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -128,6 +128,7 @@ struct RuntimeFilterParams { int64_t bloom_filter_size; int32_t max_in_num; int64_t runtime_bloom_filter_min_size; + int64_t runtime_bloom_filter_max_size; int32_t filter_id; bool bitmap_filter_not_in; bool build_bf_exactly; @@ -254,12 +255,6 @@ class IRuntimeFilter { void set_role(const RuntimeFilterRole role) { _role = role; } int expr_order() const { return _expr_order; } - // only used for consumer - // if filter is not ready for filter data scan_node - // will wait util it ready or timeout - // This function will wait at most config::runtime_filter_shuffle_wait_time_ms - // if return true , filter is ready to use - bool await(); void update_state(); // this function will be called if a runtime filter sent by rpc // it will notify all wait threads diff --git a/be/src/http/action/check_rpc_channel_action.cpp b/be/src/http/action/check_rpc_channel_action.cpp index 4949b21b8f51c93..7b98db510e52b03 100644 --- a/be/src/http/action/check_rpc_channel_action.cpp +++ b/be/src/http/action/check_rpc_channel_action.cpp @@ -39,6 +39,7 @@ namespace doris { CheckRPCChannelAction::CheckRPCChannelAction(ExecEnv* exec_env, TPrivilegeHier::type hier, TPrivilegeType::type type) : HttpHandlerWithAuth(exec_env, hier, type) {} + void CheckRPCChannelAction::handle(HttpRequest* req) { std::string req_ip = req->param("ip"); std::string req_port = req->param("port"); diff --git a/be/src/http/action/check_rpc_channel_action.h b/be/src/http/action/check_rpc_channel_action.h index 883180f02dfa493..07969c80f08e1d9 
100644 --- a/be/src/http/action/check_rpc_channel_action.h +++ b/be/src/http/action/check_rpc_channel_action.h @@ -31,8 +31,5 @@ class CheckRPCChannelAction : public HttpHandlerWithAuth { ~CheckRPCChannelAction() override = default; void handle(HttpRequest* req) override; - -private: - ExecEnv* _exec_env; }; } // namespace doris diff --git a/be/src/http/action/clear_cache_action.cpp b/be/src/http/action/clear_cache_action.cpp index f42499090c42ae0..cb183a99cf15029 100644 --- a/be/src/http/action/clear_cache_action.cpp +++ b/be/src/http/action/clear_cache_action.cpp @@ -30,10 +30,37 @@ namespace doris { const static std::string HEADER_JSON = "application/json"; -void ClearDataCacheAction::handle(HttpRequest* req) { +void ClearCacheAction::handle(HttpRequest* req) { req->add_output_header(HttpHeaders::CONTENT_TYPE, "text/plain; version=0.0.4"); - CacheManager::instance()->clear_once(); - HttpChannel::send_reply(req, HttpStatus::OK, ""); + std::string cache_type_str = req->param("type"); + fmt::memory_buffer return_string_buffer; + int64_t freed_size = 0; + if (cache_type_str == "all") { + freed_size = CacheManager::instance()->for_each_cache_prune_all(nullptr, true); + } else { + CachePolicy::CacheType cache_type = CachePolicy::string_to_type(cache_type_str); + if (cache_type == CachePolicy::CacheType::NONE) { + fmt::format_to(return_string_buffer, + "ClearCacheAction not match type:{} of cache policy", cache_type_str); + LOG(WARNING) << fmt::to_string(return_string_buffer); + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + fmt::to_string(return_string_buffer)); + return; + } + freed_size = CacheManager::instance()->cache_prune_all(cache_type, true); + if (freed_size == -1) { + fmt::format_to(return_string_buffer, + "ClearCacheAction cache:{} is not allowed to be pruned", cache_type_str); + LOG(WARNING) << fmt::to_string(return_string_buffer); + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + fmt::to_string(return_string_buffer)); + return; + } + } + fmt::format_to(return_string_buffer, "ClearCacheAction cache:{} prune win, freed size {}", + cache_type_str, freed_size); + LOG(WARNING) << fmt::to_string(return_string_buffer); + HttpChannel::send_reply(req, HttpStatus::OK, fmt::to_string(return_string_buffer)); } } // end namespace doris diff --git a/be/src/http/action/clear_cache_action.h b/be/src/http/action/clear_cache_action.h index 3840f63593f98f5..3795a87b5d76ffc 100644 --- a/be/src/http/action/clear_cache_action.h +++ b/be/src/http/action/clear_cache_action.h @@ -23,11 +23,11 @@ namespace doris { class HttpRequest; -class ClearDataCacheAction : public HttpHandler { +class ClearCacheAction : public HttpHandler { public: - ClearDataCacheAction() = default; + ClearCacheAction() = default; - ~ClearDataCacheAction() override = default; + ~ClearCacheAction() override = default; void handle(HttpRequest* req) override; }; diff --git a/be/src/http/action/download_action.cpp b/be/src/http/action/download_action.cpp index 284314f421d2070..80a7bc28c588aa4 100644 --- a/be/src/http/action/download_action.cpp +++ b/be/src/http/action/download_action.cpp @@ -199,8 +199,10 @@ Status DownloadAction::check_token(HttpRequest* req) { return Status::NotAuthorized("token is not specified."); } - if (token_str != _exec_env->token()) { - return Status::NotAuthorized("invalid token."); + const std::string& local_token = _exec_env->token(); + if (token_str != local_token) { + LOG(WARNING) << "invalid download token: " << token_str << ", local token: " << local_token; + return 
Status::NotAuthorized("invalid token {}", token_str); } return Status::OK(); diff --git a/be/src/http/action/download_binlog_action.cpp b/be/src/http/action/download_binlog_action.cpp index 589932b116f72a0..e263112da26a447 100644 --- a/be/src/http/action/download_binlog_action.cpp +++ b/be/src/http/action/download_binlog_action.cpp @@ -244,8 +244,10 @@ Status DownloadBinlogAction::_check_token(HttpRequest* req) { return Status::InternalError("token is not specified."); } - if (token_str != _exec_env->token()) { - return Status::InternalError("invalid token."); + const std::string& local_token = _exec_env->token(); + if (token_str != local_token) { + LOG(WARNING) << "invalid download token: " << token_str << ", local token: " << local_token; + return Status::NotAuthorized("invalid token {}", token_str); } return Status::OK(); diff --git a/be/src/http/action/file_cache_action.cpp b/be/src/http/action/file_cache_action.cpp index acad2b3b7bf96c2..659be2537997f3f 100644 --- a/be/src/http/action/file_cache_action.cpp +++ b/be/src/http/action/file_cache_action.cpp @@ -63,7 +63,6 @@ Status FileCacheAction::_handle_header(HttpRequest* req, std::string* json_metri const std::string& sync = req->param(SYNC.data()); auto ret = io::FileCacheFactory::instance()->clear_file_caches(to_lower(sync) == "true"); } else if (operation == RESET) { - Status st; std::string capacity = req->param(CAPACITY.data()); int64_t new_capacity = 0; bool parse = true; diff --git a/be/src/http/action/http_stream.cpp b/be/src/http/action/http_stream.cpp index 87cc2f694eb102e..afeb251ca415557 100644 --- a/be/src/http/action/http_stream.cpp +++ b/be/src/http/action/http_stream.cpp @@ -18,9 +18,7 @@ #include "http/action/http_stream.h" #include -#include #include -#include #include // use string iequal @@ -30,10 +28,8 @@ #include #include -#include "cloud/cloud_storage_engine.h" #include "cloud/config.h" #include "common/config.h" -#include "common/consts.h" #include "common/logging.h" #include "common/status.h" #include "common/utils.h" @@ -44,7 +40,6 @@ #include "http/http_common.h" #include "http/http_headers.h" #include "http/http_request.h" -#include "http/http_response.h" #include "http/utils.h" #include "io/fs/stream_load_pipe.h" #include "olap/storage_engine.h" @@ -58,9 +53,7 @@ #include "runtime/stream_load/stream_load_executor.h" #include "runtime/stream_load/stream_load_recorder.h" #include "util/byte_buffer.h" -#include "util/debug_util.h" #include "util/doris_metrics.h" -#include "util/load_util.h" #include "util/metrics.h" #include "util/string_util.h" #include "util/thrift_rpc_helper.h" @@ -133,7 +126,7 @@ Status HttpStreamAction::_handle(HttpRequest* http_req, std::shared_ptrbody_bytes > 0 && ctx->receive_bytes != ctx->body_bytes) { LOG(WARNING) << "recevie body don't equal with body bytes, body_bytes=" << ctx->body_bytes << ", receive_bytes=" << ctx->receive_bytes << ", id=" << ctx->id; - return Status::InternalError("receive body don't equal with body bytes"); + return Status::Error("receive body don't equal with body bytes"); } RETURN_IF_ERROR(ctx->body_sink->finish()); @@ -196,7 +189,7 @@ Status HttpStreamAction::_on_header(HttpRequest* http_req, std::shared_ptrauth)) { LOG(WARNING) << "parse basic authorization failed." 
<< ctx->brief(); - return Status::InternalError("no valid Basic authorization"); + return Status::NotAuthorized("no valid Basic authorization"); } // TODO(zs) : need Need to request an FE to obtain information such as format @@ -208,8 +201,10 @@ Status HttpStreamAction::_on_header(HttpRequest* http_req, std::shared_ptrbody_bytes > csv_max_body_bytes) { LOG(WARNING) << "body exceed max size." << ctx->brief(); - return Status::InternalError("body exceed max size: {}, data: {}", csv_max_body_bytes, - ctx->body_bytes); + return Status::Error( + "body size {} exceed BE's conf `streaming_load_max_mb` {}. increase it if you " + "are sure this load is reasonable", + ctx->body_bytes, csv_max_body_bytes); } } @@ -239,31 +234,40 @@ void HttpStreamAction::on_chunk_data(HttpRequest* req) { struct evhttp_request* ev_req = req->get_evhttp_request(); auto evbuf = evhttp_request_get_input_buffer(ev_req); + SCOPED_ATTACH_TASK(ExecEnv::GetInstance()->stream_load_pipe_tracker()); + int64_t start_read_data_time = MonotonicNanos(); while (evbuffer_get_length(evbuf) > 0) { - auto bb = ByteBuffer::allocate(128 * 1024); - auto remove_bytes = evbuffer_remove(evbuf, bb->ptr, bb->capacity); - bb->pos = remove_bytes; - bb->flip(); - auto st = ctx->body_sink->append(bb); - // schema_buffer stores 1M of data for parsing column information - // need to determine whether to cache for the first time - if (ctx->is_read_schema) { - if (ctx->schema_buffer->pos + remove_bytes < config::stream_tvf_buffer_size) { - ctx->schema_buffer->put_bytes(bb->ptr, remove_bytes); - } else { - LOG(INFO) << "use a portion of data to request fe to obtain column information"; - ctx->is_read_schema = false; - ctx->status = process_put(req, ctx); + try { + auto bb = ByteBuffer::allocate(128 * 1024); + auto remove_bytes = evbuffer_remove(evbuf, bb->ptr, bb->capacity); + bb->pos = remove_bytes; + bb->flip(); + auto st = ctx->body_sink->append(bb); + // schema_buffer stores 1M of data for parsing column information + // need to determine whether to cache for the first time + if (ctx->is_read_schema) { + if (ctx->schema_buffer->pos + remove_bytes < config::stream_tvf_buffer_size) { + ctx->schema_buffer->put_bytes(bb->ptr, remove_bytes); + } else { + LOG(INFO) << "use a portion of data to request fe to obtain column information"; + ctx->is_read_schema = false; + ctx->status = process_put(req, ctx); + } } + if (!st.ok() && !ctx->status.ok()) { + LOG(WARNING) << "append body content failed. errmsg=" << st << ", " << ctx->brief(); + ctx->status = st; + return; + } + ctx->receive_bytes += remove_bytes; + } catch (const doris::Exception& e) { + if (e.code() == doris::ErrorCode::MEM_ALLOC_FAILED) { + ctx->status = Status::MemoryLimitExceeded( + fmt::format("PreCatch error code:{}, {}, ", e.code(), e.to_string())); + } + ctx->status = Status::Error(e.code(), e.to_string()); } - - if (!st.ok() && !ctx->status.ok()) { - LOG(WARNING) << "append body content failed. 
errmsg=" << st << ", " << ctx->brief(); - ctx->status = st; - return; - } - ctx->receive_bytes += remove_bytes; } // after all the data has been read and it has not reached 1M, it will execute here if (ctx->is_read_schema) { @@ -386,7 +390,8 @@ Status HttpStreamAction::_handle_group_commit(HttpRequest* req, std::string group_commit_mode = req->header(HTTP_GROUP_COMMIT); if (!group_commit_mode.empty() && !iequal(group_commit_mode, "sync_mode") && !iequal(group_commit_mode, "async_mode") && !iequal(group_commit_mode, "off_mode")) { - return Status::InternalError("group_commit can only be [async_mode, sync_mode, off_mode]"); + return Status::InvalidArgument( + "group_commit can only be [async_mode, sync_mode, off_mode]"); } if (config::wait_internal_group_commit_finish) { group_commit_mode = "sync_mode"; @@ -399,7 +404,7 @@ Status HttpStreamAction::_handle_group_commit(HttpRequest* req, ss << "This http load content length <0 (" << content_length << "), please check your content length."; LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); + return Status::InvalidArgument(ss.str()); } // allow chunked stream load in flink auto is_chunk = @@ -421,7 +426,7 @@ Status HttpStreamAction::_handle_group_commit(HttpRequest* req, auto partitions = !req->header(HTTP_PARTITIONS).empty(); if (!partial_columns && !partitions && !temp_partitions && !ctx->two_phase_commit) { if (!config::wait_internal_group_commit_finish && !ctx->label.empty()) { - return Status::InternalError("label and group_commit can't be set at the same time"); + return Status::InvalidArgument("label and group_commit can't be set at the same time"); } ctx->group_commit = true; if (iequal(group_commit_mode, "async_mode")) { diff --git a/be/src/http/action/reset_rpc_channel_action.cpp b/be/src/http/action/reset_rpc_channel_action.cpp index e1b180a61d420ab..a9aa6ec950e0f29 100644 --- a/be/src/http/action/reset_rpc_channel_action.cpp +++ b/be/src/http/action/reset_rpc_channel_action.cpp @@ -35,6 +35,7 @@ namespace doris { ResetRPCChannelAction::ResetRPCChannelAction(ExecEnv* exec_env, TPrivilegeHier::type hier, TPrivilegeType::type type) : HttpHandlerWithAuth(exec_env, hier, type) {} + void ResetRPCChannelAction::handle(HttpRequest* req) { std::string endpoints = req->param("endpoints"); if (iequal(endpoints, "all")) { diff --git a/be/src/http/action/reset_rpc_channel_action.h b/be/src/http/action/reset_rpc_channel_action.h index 16efecfee2646ab..ba13c6be7c6333e 100644 --- a/be/src/http/action/reset_rpc_channel_action.h +++ b/be/src/http/action/reset_rpc_channel_action.h @@ -31,8 +31,5 @@ class ResetRPCChannelAction : public HttpHandlerWithAuth { ~ResetRPCChannelAction() override = default; void handle(HttpRequest* req) override; - -private: - ExecEnv* _exec_env; }; } // namespace doris diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 64becf8d7e3369d..d0c5dff2075c6fb 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -26,20 +26,17 @@ #include #include #include -#include -#include #include #include -#include -#include +#include +#include +#include #include -#include #include #include #include -#include "cloud/cloud_storage_engine.h" #include "cloud/config.h" #include "common/config.h" #include "common/consts.h" @@ -122,7 +119,7 @@ void StreamLoadAction::handle(HttpRequest* req) { _exec_env->stream_load_executor()->rollback_txn(ctx.get()); ctx->need_rollback = false; } - if (ctx->body_sink.get() != nullptr) { + if (ctx->body_sink != nullptr) { 
ctx->body_sink->cancel(ctx->status.to_string()); } } @@ -146,7 +143,7 @@ Status StreamLoadAction::_handle(std::shared_ptr ctx) { if (ctx->body_bytes > 0 && ctx->receive_bytes != ctx->body_bytes) { LOG(WARNING) << "recevie body don't equal with body bytes, body_bytes=" << ctx->body_bytes << ", receive_bytes=" << ctx->receive_bytes << ", id=" << ctx->id; - return Status::InternalError("receive body don't equal with body bytes"); + return Status::Error("receive body don't equal with body bytes"); } // if we use non-streaming, MessageBodyFileSink.finish will close the file @@ -210,7 +207,7 @@ int StreamLoadAction::on_header(HttpRequest* req) { _exec_env->stream_load_executor()->rollback_txn(ctx.get()); ctx->need_rollback = false; } - if (ctx->body_sink.get() != nullptr) { + if (ctx->body_sink != nullptr) { ctx->body_sink->cancel(ctx->status.to_string()); } auto str = ctx->to_json(); @@ -232,13 +229,13 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrauth)) { LOG(WARNING) << "parse basic authorization failed." << ctx->brief(); - return Status::InternalError("no valid Basic authorization"); + return Status::NotAuthorized("no valid Basic authorization"); } // get format of this put if (!http_req->header(HTTP_COMPRESS_TYPE).empty() && iequal(http_req->header(HTTP_FORMAT_KEY), "JSON")) { - return Status::InternalError("compress data of JSON format is not supported."); + return Status::NotSupported("compress data of JSON format is not supported."); } std::string format_str = http_req->header(HTTP_FORMAT_KEY); if (iequal(format_str, BeConsts::CSV_WITH_NAMES) || @@ -254,8 +251,8 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrheader(HTTP_COMPRESS_TYPE), &ctx->format, &ctx->compress_type); if (ctx->format == TFileFormatType::FORMAT_UNKNOWN) { - return Status::InternalError("unknown data format, format={}", - http_req->header(HTTP_FORMAT_KEY)); + return Status::Error("unknown data format, format={}", + http_req->header(HTTP_FORMAT_KEY)); } // check content length @@ -273,16 +270,18 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrformat == TFileFormatType::FORMAT_JSON) && (ctx->body_bytes > json_max_body_bytes) && !read_json_by_line) { - return Status::InternalError( - "The size of this batch exceed the max size [{}] of json type data " - " data [ {} ]. Split the file, or use 'read_json_by_line'", - json_max_body_bytes, ctx->body_bytes); + return Status::Error( + "json body size {} exceed BE's conf `streaming_load_json_max_mb` {}. increase " + "it if you are sure this load is reasonable", + ctx->body_bytes, json_max_body_bytes); } // csv max body size else if (ctx->body_bytes > csv_max_body_bytes) { LOG(WARNING) << "body exceed max size." << ctx->brief(); - return Status::InternalError("body exceed max size: {}, data: {}", - csv_max_body_bytes, ctx->body_bytes); + return Status::Error( + "body size {} exceed BE's conf `streaming_load_max_mb` {}. 
increase it if you " + "are sure this load is reasonable", + ctx->body_bytes, csv_max_body_bytes); } } else { #ifndef BE_TEST @@ -300,13 +299,13 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptris_chunked_transfer))) { LOG(WARNING) << "content_length is empty and transfer-encoding!=chunked, please set " "content_length or transfer-encoding=chunked"; - return Status::InternalError( + return Status::InvalidArgument( "content_length is empty and transfer-encoding!=chunked, please set content_length " "or transfer-encoding=chunked"); } else if (UNLIKELY(!http_req->header(HttpHeaders::CONTENT_LENGTH).empty() && ctx->is_chunked_transfer)) { LOG(WARNING) << "please do not set both content_length and transfer-encoding"; - return Status::InternalError( + return Status::InvalidArgument( "please do not set both content_length and transfer-encoding"); } @@ -341,19 +340,29 @@ void StreamLoadAction::on_chunk_data(HttpRequest* req) { struct evhttp_request* ev_req = req->get_evhttp_request(); auto evbuf = evhttp_request_get_input_buffer(ev_req); + SCOPED_ATTACH_TASK(ExecEnv::GetInstance()->stream_load_pipe_tracker()); + int64_t start_read_data_time = MonotonicNanos(); while (evbuffer_get_length(evbuf) > 0) { - auto bb = ByteBuffer::allocate(128 * 1024); - auto remove_bytes = evbuffer_remove(evbuf, bb->ptr, bb->capacity); - bb->pos = remove_bytes; - bb->flip(); - auto st = ctx->body_sink->append(bb); - if (!st.ok()) { - LOG(WARNING) << "append body content failed. errmsg=" << st << ", " << ctx->brief(); - ctx->status = st; - return; - } - ctx->receive_bytes += remove_bytes; + try { + auto bb = ByteBuffer::allocate(128 * 1024); + auto remove_bytes = evbuffer_remove(evbuf, bb->ptr, bb->capacity); + bb->pos = remove_bytes; + bb->flip(); + auto st = ctx->body_sink->append(bb); + if (!st.ok()) { + LOG(WARNING) << "append body content failed. 
errmsg=" << st << ", " << ctx->brief(); + ctx->status = st; + return; + } + ctx->receive_bytes += remove_bytes; + } catch (const doris::Exception& e) { + if (e.code() == doris::ErrorCode::MEM_ALLOC_FAILED) { + ctx->status = Status::MemoryLimitExceeded( + fmt::format("PreCatch error code:{}, {}, ", e.code(), e.to_string())); + } + ctx->status = Status::Error(e.code(), e.to_string()); + } } int64_t read_data_time = MonotonicNanos() - start_read_data_time; int64_t last_receive_and_read_data_cost_nanos = ctx->receive_and_read_data_cost_nanos; @@ -430,7 +439,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, if (!http_req->header(HTTP_LINE_DELIMITER).empty()) { request.__set_line_delimiter(http_req->header(HTTP_LINE_DELIMITER)); } - if (!http_req->header(HTTP_ENCLOSE).empty() && http_req->header(HTTP_ENCLOSE).size() > 0) { + if (!http_req->header(HTTP_ENCLOSE).empty() && !http_req->header(HTTP_ENCLOSE).empty()) { const auto& enclose_str = http_req->header(HTTP_ENCLOSE); if (enclose_str.length() != 1) { return Status::InvalidArgument("enclose must be single-char, actually is {}", @@ -438,7 +447,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, } request.__set_enclose(http_req->header(HTTP_ENCLOSE)[0]); } - if (!http_req->header(HTTP_ESCAPE).empty() && http_req->header(HTTP_ESCAPE).size() > 0) { + if (!http_req->header(HTTP_ESCAPE).empty() && !http_req->header(HTTP_ESCAPE).empty()) { const auto& escape_str = http_req->header(HTTP_ESCAPE); if (escape_str.length() != 1) { return Status::InvalidArgument("escape must be single-char, actually is {}", @@ -728,7 +737,7 @@ Status StreamLoadAction::_handle_group_commit(HttpRequest* req, std::string group_commit_mode = req->header(HTTP_GROUP_COMMIT); if (!group_commit_mode.empty() && !iequal(group_commit_mode, "sync_mode") && !iequal(group_commit_mode, "async_mode") && !iequal(group_commit_mode, "off_mode")) { - return Status::InternalError( + return Status::InvalidArgument( "group_commit can only be [async_mode, sync_mode, off_mode]"); } if (config::wait_internal_group_commit_finish) { @@ -742,7 +751,7 @@ Status StreamLoadAction::_handle_group_commit(HttpRequest* req, ss << "This stream load content length <0 (" << content_length << "), please check your content length."; LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); + return Status::InvalidArgument(ss.str()); } // allow chunked stream load in flink auto is_chunk = !req->header(HttpHeaders::TRANSFER_ENCODING).empty() && @@ -763,8 +772,7 @@ Status StreamLoadAction::_handle_group_commit(HttpRequest* req, auto partitions = !req->header(HTTP_PARTITIONS).empty(); if (!partial_columns && !partitions && !temp_partitions && !ctx->two_phase_commit) { if (!config::wait_internal_group_commit_finish && !ctx->label.empty()) { - return Status::InternalError( - "label and group_commit can't be set at the same time"); + return Status::InvalidArgument("label and group_commit can't be set at the same time"); } ctx->group_commit = true; if (iequal(group_commit_mode, "async_mode")) { diff --git a/be/src/http/http_client.cpp b/be/src/http/http_client.cpp index 218802878bd2c04..e94614788f52362 100644 --- a/be/src/http/http_client.cpp +++ b/be/src/http/http_client.cpp @@ -131,8 +131,11 @@ Status HttpClient::init(const std::string& url, bool set_fail_on_error) { LOG(WARNING) << "fail to set CURLOPT_WRITEDATA, msg=" << _to_errmsg(code); return Status::InternalError("fail to set CURLOPT_WRITEDATA"); } + + std::string escaped_url; + RETURN_IF_ERROR(_escape_url(url, 
&escaped_url)); // set url - code = curl_easy_setopt(_curl, CURLOPT_URL, url.c_str()); + code = curl_easy_setopt(_curl, CURLOPT_URL, escaped_url.c_str()); if (code != CURLE_OK) { LOG(WARNING) << "failed to set CURLOPT_URL, errmsg=" << _to_errmsg(code); return Status::InternalError("fail to set CURLOPT_URL"); } @@ -290,4 +293,59 @@ Status HttpClient::execute_with_retry(int retry_times, int sleep_time, return status; } +// http://example.com/page?param1=value1&param2=value+with+spaces#section +Status HttpClient::_escape_url(const std::string& url, std::string* escaped_url) { + size_t query_pos = url.find('?'); + if (query_pos == std::string::npos) { + *escaped_url = url; + return Status::OK(); + } + size_t fragment_pos = url.find('#'); + std::string query; + std::string fragment; + + if (fragment_pos == std::string::npos) { + query = url.substr(query_pos + 1, url.length() - query_pos - 1); + } else { + query = url.substr(query_pos + 1, fragment_pos - query_pos - 1); + fragment = url.substr(fragment_pos, url.length() - fragment_pos); + } + + std::string encoded_query; + size_t ampersand_pos = query.find('&'); + size_t equal_pos; + + if (ampersand_pos == std::string::npos) { + ampersand_pos = query.length(); + } + + while (true) { + equal_pos = query.find('='); + if (equal_pos != std::string::npos) { + std::string key = query.substr(0, equal_pos); + std::string value = query.substr(equal_pos + 1, ampersand_pos - equal_pos - 1); + + auto encoded_value = std::unique_ptr<char, decltype(&curl_free)>( + curl_easy_escape(_curl, value.c_str(), value.length()), &curl_free); + if (encoded_value) { + encoded_query += key + "=" + std::string(encoded_value.get()); + } else { + return Status::InternalError("escape url failed, url={}", url); + } + } else { + encoded_query += query.substr(0, ampersand_pos); + } + + if (ampersand_pos == query.length() || ampersand_pos == std::string::npos) { + break; + } + + encoded_query += "&"; + query = query.substr(ampersand_pos + 1); + ampersand_pos = query.find('&'); + } + *escaped_url = url.substr(0, query_pos + 1) + encoded_query + fragment; + return Status::OK(); +} + } // namespace doris diff --git a/be/src/http/http_client.h b/be/src/http/http_client.h index 9659de13cfcac5a..f6a1a17ec29e0f8 100644 --- a/be/src/http/http_client.h +++ b/be/src/http/http_client.h @@ -146,6 +146,15 @@ class HttpClient { size_t on_response_data(const void* data, size_t length); + // The file name of the variant column with the inverted index contains % + // such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx + // {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx + // We need to handle %, otherwise it will cause an HTTP 404 error. + // Because the percent ("%") character serves as the indicator for percent-encoded octets, + // it must be percent-encoded as "%25" for that octet to be used as data within a URI.
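To see that value-encoding rule in isolation, here is a small sketch assuming only libcurl; escape_query_value and the sample file name are illustrative, not part of the patch. curl_easy_escape leaves unreserved characters [A-Za-z0-9._~-] untouched and percent-encodes everything else, so a literal '%' becomes "%25", which is the per-value step HttpClient::_escape_url applies to each query parameter.

#include <curl/curl.h>

#include <iostream>
#include <memory>
#include <string>

// Encode a single query-parameter value, mirroring what _escape_url does per value.
static std::string escape_query_value(CURL* curl, const std::string& value) {
    std::unique_ptr<char, decltype(&curl_free)> escaped(
            curl_easy_escape(curl, value.c_str(), static_cast<int>(value.length())), &curl_free);
    return escaped ? std::string(escaped.get()) : value; // fall back to the raw value on failure
}

int main() {
    CURL* curl = curl_easy_init();
    if (curl == nullptr) {
        return 1;
    }
    // Prints "properties%252Emessage.idx": the literal '%' is re-encoded as "%25",
    // so the variant-column file name survives the HTTP round trip instead of 404ing.
    std::cout << escape_query_value(curl, "properties%2Emessage.idx") << "\n";
    curl_easy_cleanup(curl);
    return 0;
}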
+ // https://datatracker.ietf.org/doc/html/rfc3986 + Status _escape_url(const std::string& url, std::string* escaped_url); + private: const char* _to_errmsg(CURLcode code); diff --git a/be/src/http/http_handler_with_auth.h b/be/src/http/http_handler_with_auth.h index 178971560c015b0..894a3a81e50d287 100644 --- a/be/src/http/http_handler_with_auth.h +++ b/be/src/http/http_handler_with_auth.h @@ -51,8 +51,10 @@ class HttpHandlerWithAuth : public HttpHandler { return true; } -private: +protected: ExecEnv* _exec_env; + +private: TPrivilegeHier::type _hier; TPrivilegeType::type _type; }; diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 33858e9ac538eef..d7dfc743a8766e5 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -157,7 +157,7 @@ void BlockFileCache::remove_query_context(const TUniqueId& query_id) { std::lock_guard cache_lock(_mutex); const auto& query_iter = _query_map.find(query_id); - if (query_iter != _query_map.end() && query_iter->second.unique()) { + if (query_iter != _query_map.end() && query_iter->second.use_count() <= 1) { _query_map.erase(query_iter); } } @@ -280,6 +280,12 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte DCHECK(!file_blocks.empty()); // change to ttl if the blocks aren't ttl if (context.cache_type == FileCacheType::TTL && _key_to_time.find(hash) == _key_to_time.end()) { + for (auto& [_, cell] : file_blocks) { + Status st = cell.file_block->update_expiration_time(context.expiration_time); + if (!st.ok()) { + LOG_WARNING("Failed to change key meta").error(st); + } + } for (auto& [_, cell] : file_blocks) { FileCacheType origin_type = cell.file_block->cache_type(); if (origin_type == FileCacheType::TTL) continue; @@ -295,9 +301,7 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte } else { cell.queue_iterator.reset(); } - st = cell.file_block->update_expiration_time(context.expiration_time); - } - if (!st.ok()) { + } else { LOG_WARNING("Failed to change key meta").error(st); } } @@ -324,7 +328,10 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte } if (context.expiration_time == 0) { for (auto& [_, cell] : file_blocks) { - if (cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL)) { + auto cache_type = cell.file_block->cache_type(); + if (cache_type != FileCacheType::TTL) continue; + auto st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); + if (st.ok()) { if (config::enable_ttl_cache_evict_using_lru) { auto& ttl_queue = get_queue(FileCacheType::TTL); ttl_queue.remove(cell.queue_iterator.value(), cache_lock); @@ -333,6 +340,8 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte cell.queue_iterator = queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), cell.file_block->range().size(), cache_lock); + } else { + LOG_WARNING("Failed to change key meta").error(st); } } _key_to_time.erase(iter); @@ -681,10 +690,6 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha << ".\nCurrent cache structure: " << dump_structure_unlocked(hash, cache_lock); auto& offsets = _files[hash]; - DCHECK((context.expiration_time == 0 && context.cache_type != FileCacheType::TTL) || - (context.cache_type == FileCacheType::TTL && context.expiration_time != 0)) - << fmt::format("expiration time {}, cache type {}", context.expiration_time, - context.cache_type); FileCacheKey key; key.hash = hash; @@ 
-692,11 +697,23 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha key.meta.type = context.cache_type; key.meta.expiration_time = context.expiration_time; FileBlockCell cell(std::make_shared(key, size, this, state), cache_lock); - if (context.cache_type != FileCacheType::TTL || config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(context.cache_type); + Status st; + if (context.expiration_time == 0 && context.cache_type == FileCacheType::TTL) { + st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); + } else if (context.cache_type != FileCacheType::TTL && context.expiration_time != 0) { + st = cell.file_block->change_cache_type_by_mgr(FileCacheType::TTL); + } + if (!st.ok()) { + LOG(WARNING) << "Cannot change cache type. expiration_time=" << context.expiration_time + << " cache_type=" << cache_type_to_string(context.cache_type) + << " error=" << st.msg(); + } + if (cell.file_block->cache_type() != FileCacheType::TTL || + config::enable_ttl_cache_evict_using_lru) { + auto& queue = get_queue(cell.file_block->cache_type()); cell.queue_iterator = queue.add(hash, offset, size, cache_lock); } - if (context.cache_type == FileCacheType::TTL) { + if (cell.file_block->cache_type() == FileCacheType::TTL) { if (_key_to_time.find(hash) == _key_to_time.end()) { _key_to_time[hash] = context.expiration_time; _time_to_key.insert(std::make_pair(context.expiration_time, hash)); @@ -1005,19 +1022,18 @@ bool BlockFileCache::remove_if_ttl_file_unlock(const UInt128Wrapper& file_key, b } } for (auto& [_, cell] : _files[file_key]) { - if (cell.file_block->cache_type() == FileCacheType::TTL) { - auto st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); - if (st.ok()) { - if (config::enable_ttl_cache_evict_using_lru) { - ttl_queue.remove(cell.queue_iterator.value(), cache_lock); - } - auto& queue = get_queue(FileCacheType::NORMAL); - cell.queue_iterator = queue.add( - cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - LOG_WARNING("Failed to change cache type to normal").error(st); + if (cell.file_block->cache_type() == FileCacheType::NORMAL) continue; + auto st = cell.file_block->change_cache_type_by_mgr(FileCacheType::NORMAL); + if (st.ok()) { + if (config::enable_ttl_cache_evict_using_lru) { + ttl_queue.remove(cell.queue_iterator.value(), cache_lock); } + auto& queue = get_queue(FileCacheType::NORMAL); + cell.queue_iterator = + queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); + } else { + LOG_WARNING("Failed to change cache type to normal").error(st); } } } else { @@ -1579,6 +1595,7 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, for (auto& [_, cell] : _files[hash]) { Status st = cell.file_block->update_expiration_time(new_expiration_time); if (!st.ok()) { + LOG_WARNING("Failed to modify expiration time").error(st); } } @@ -1588,12 +1605,13 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, if (auto iter = _files.find(hash); iter != _files.end()) { for (auto& [_, cell] : iter->second) { Status st = cell.file_block->update_expiration_time(new_expiration_time); - if (!st.ok() && !st.is()) { + if (!st.ok()) { LOG_WARNING("").error(st); } } for (auto& [_, cell] : iter->second) { FileCacheType origin_type = cell.file_block->cache_type(); + if (origin_type == FileCacheType::TTL) continue; auto st = 
cell.file_block->change_cache_type_by_mgr(FileCacheType::TTL); if (st.ok()) { auto& queue = get_queue(origin_type); diff --git a/be/src/io/cache/block_file_cache_downloader.cpp b/be/src/io/cache/block_file_cache_downloader.cpp index 02e8f736828cb1d..026f7e2a01741da 100644 --- a/be/src/io/cache/block_file_cache_downloader.cpp +++ b/be/src/io/cache/block_file_cache_downloader.cpp @@ -130,6 +130,16 @@ void FileCacheBlockDownloader::check_download_task(const std::vector& t } } +std::unordered_map snapshot_rs_metas(BaseTablet* tablet) { + std::unordered_map id_to_rowset_meta_map; + auto visitor = [&id_to_rowset_meta_map](const RowsetSharedPtr& r) { + id_to_rowset_meta_map.emplace(r->rowset_meta()->rowset_id().to_string(), r->rowset_meta()); + }; + constexpr bool include_stale = false; + tablet->traverse_rowsets(visitor, include_stale); + return id_to_rowset_meta_map; +} + void FileCacheBlockDownloader::download_file_cache_block( const DownloadTask::FileCacheBlockMetaVec& metas) { std::ranges::for_each(metas, [&](const FileCacheBlockMeta& meta) { @@ -141,7 +151,7 @@ void FileCacheBlockDownloader::download_file_cache_block( tablet = std::move(res).value(); } - auto id_to_rowset_meta_map = tablet->tablet_meta()->snapshot_rs_metas(); + auto id_to_rowset_meta_map = snapshot_rs_metas(tablet.get()); auto find_it = id_to_rowset_meta_map.find(meta.rowset_id()); if (find_it == id_to_rowset_meta_map.end()) { return; @@ -171,7 +181,8 @@ void FileCacheBlockDownloader::download_file_cache_block( DownloadFileMeta download_meta { .path = storage_resource.value()->remote_segment_path(*find_it->second, meta.segment_id()), - .file_size = meta.offset() + meta.size(), // To avoid trigger get file size IO + .file_size = meta.has_file_size() ? meta.file_size() + : -1, // To avoid trigger get file size IO .offset = meta.offset(), .download_size = meta.size(), .file_system = storage_resource.value()->fs, diff --git a/be/src/io/cache/file_block.cpp b/be/src/io/cache/file_block.cpp index 5985aa95f7abdcd..6586dcf589bddee 100644 --- a/be/src/io/cache/file_block.cpp +++ b/be/src/io/cache/file_block.cpp @@ -25,6 +25,7 @@ #include #include "common/status.h" +#include "cpp/sync_point.h" #include "io/cache/block_file_cache.h" namespace doris { @@ -162,14 +163,14 @@ Status FileBlock::read(Slice buffer, size_t read_offset) { Status FileBlock::change_cache_type_by_mgr(FileCacheType new_type) { std::lock_guard block_lock(_mutex); - if (new_type == _key.meta.type) { - return Status::OK(); - } + DCHECK(new_type != _key.meta.type); if (_download_state == State::DOWNLOADED) { KeyMeta new_meta; new_meta.expiration_time = _key.meta.expiration_time; new_meta.type = new_type; - RETURN_IF_ERROR(_mgr->_storage->change_key_meta(_key, new_meta)); + auto st = _mgr->_storage->change_key_meta(_key, new_meta); + TEST_SYNC_POINT_CALLBACK("FileBlock::change_cache_type", &st); + if (!st.ok()) return st; } _key.meta.type = new_type; return Status::OK(); @@ -198,7 +199,10 @@ Status FileBlock::update_expiration_time(uint64_t expiration_time) { KeyMeta new_meta; new_meta.expiration_time = expiration_time; new_meta.type = _key.meta.type; - RETURN_IF_ERROR(_mgr->_storage->change_key_meta(_key, new_meta)); + auto st = _mgr->_storage->change_key_meta(_key, new_meta); + if (!st.ok() && !st.is()) { + return st; + } } _key.meta.expiration_time = expiration_time; return Status::OK(); diff --git a/be/src/io/fs/azure_obj_storage_client.cpp b/be/src/io/fs/azure_obj_storage_client.cpp index 043886672a2af3e..9f33db3400acdc1 100644 --- 
a/be/src/io/fs/azure_obj_storage_client.cpp +++ b/be/src/io/fs/azure_obj_storage_client.cpp @@ -42,6 +42,7 @@ #include "common/logging.h" #include "common/status.h" #include "io/fs/obj_storage_client.h" +#include "util/bvar_helper.h" #include "util/s3_util.h" using namespace Azure::Storage::Blobs; @@ -57,6 +58,28 @@ auto base64_encode_part_num(int part_num) { {reinterpret_cast(&part_num), sizeof(part_num)}); } +template +auto s3_rate_limit(doris::S3RateLimitType op, Func callback) -> decltype(callback()) { + if (!doris::config::enable_s3_rate_limiter) { + return callback(); + } + auto sleep_duration = doris::S3ClientFactory::instance().rate_limiter(op)->add(1); + if (sleep_duration < 0) { + throw std::runtime_error("Azure exceeds request limit"); + } + return callback(); +} + +template +auto s3_get_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::GET, std::move(callback)); +} + +template +auto s3_put_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::PUT, std::move(callback)); +} + constexpr char SAS_TOKEN_URL_TEMPLATE[] = "https://{}.blob.core.windows.net/{}/{}{}"; constexpr char BlobNotFound[] = "BlobNotFound"; } // namespace @@ -101,7 +124,14 @@ struct AzureBatchDeleter { if (deferred_resps.empty()) { return ObjectStorageResponse::OK(); } - auto resp = do_azure_client_call([&]() { _client->SubmitBatch(_batch); }, _opts); + auto resp = do_azure_client_call( + [&]() { + s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_objects_latency); + _client->SubmitBatch(_batch); + }); + }, + _opts); if (resp.status.code != ErrorCode::OK) { return resp; } @@ -156,7 +186,11 @@ ObjectStorageResponse AzureObjStorageClient::put_object(const ObjectStoragePathO auto client = _client->GetBlockBlobClient(opts.key); return do_azure_client_call( [&]() { - client.UploadFrom(reinterpret_cast(stream.data()), stream.size()); + s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_put_latency); + client.UploadFrom(reinterpret_cast(stream.data()), + stream.size()); + }); }, opts); } @@ -169,7 +203,10 @@ ObjectStorageUploadResponse AzureObjStorageClient::upload_part(const ObjectStora Azure::Core::IO::MemoryBodyStream memory_body( reinterpret_cast(stream.data()), stream.size()); // The blockId must be base64 encoded - client.StageBlock(base64_encode_part_num(part_num), memory_body); + s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); + client.StageBlock(base64_encode_part_num(part_num), memory_body); + }); } catch (Azure::Core::RequestFailedException& e) { auto msg = fmt::format( "Azure request failed because {}, error msg {}, http code {}, path msg {}", @@ -200,13 +237,22 @@ ObjectStorageResponse AzureObjStorageClient::complete_multipart_upload( std::ranges::transform( completed_parts, std::back_inserter(string_block_ids), [](const ObjectCompleteMultiPart& i) { return base64_encode_part_num(i.part_num); }); - return do_azure_client_call([&]() { client.CommitBlockList(string_block_ids); }, opts); + return do_azure_client_call( + [&]() { + s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); + client.CommitBlockList(string_block_ids); + }); + }, + opts); } ObjectStorageHeadResponse AzureObjStorageClient::head_object(const ObjectStoragePathOptions& opts) { try { - Models::BlobProperties properties = - _client->GetBlockBlobClient(opts.key).GetProperties().Value; + Models::BlobProperties properties = s3_get_rate_limit([&]() { + 
SCOPED_BVAR_LATENCY(s3_bvar::s3_head_latency); + return _client->GetBlockBlobClient(opts.key).GetProperties().Value; + }); return {.file_size = properties.BlobSize}; } catch (Azure::Core::RequestFailedException& e) { if (e.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { @@ -238,8 +284,11 @@ ObjectStorageResponse AzureObjStorageClient::get_object(const ObjectStoragePathO DownloadBlobToOptions download_opts; Azure::Core::Http::HttpRange range {static_cast(offset), bytes_read}; download_opts.Range = range; - auto resp = client.DownloadTo(reinterpret_cast(buffer), bytes_read, - download_opts); + auto resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_get_latency); + return client.DownloadTo(reinterpret_cast(buffer), bytes_read, + download_opts); + }); *size_return = resp.Value.ContentRange.Length.Value(); }, opts); @@ -257,11 +306,17 @@ ObjectStorageResponse AzureObjStorageClient::list_objects(const ObjectStoragePat [&]() { ListBlobsOptions list_opts; list_opts.Prefix = opts.prefix; - auto resp = _client->ListBlobs(list_opts); + auto resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); get_file_file(resp); while (!resp.NextPageToken->empty()) { list_opts.ContinuationToken = resp.NextPageToken; - resp = _client->ListBlobs(list_opts); + resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); get_file_file(resp); } }, @@ -297,7 +352,10 @@ ObjectStorageResponse AzureObjStorageClient::delete_objects(const ObjectStorageP ObjectStorageResponse AzureObjStorageClient::delete_object(const ObjectStoragePathOptions& opts) { return do_azure_client_call( [&]() { - auto resp = _client->DeleteBlob(opts.key); + auto resp = s3_put_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_object_latency); + return _client->DeleteBlob(opts.key); + }); if (!resp.Value.Deleted) { throw Exception(Status::IOError("Delete azure blob failed")); } @@ -321,14 +379,20 @@ ObjectStorageResponse AzureObjStorageClient::delete_objects_recursively( } return ObjectStorageResponse::OK(); }; - auto resp = _client->ListBlobs(list_opts); + auto resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); if (auto response = delete_func(resp.Blobs); response.status.code != ErrorCode::OK) { return response; } while (!resp.NextPageToken->empty()) { list_opts.ContinuationToken = resp.NextPageToken; - resp = _client->ListBlobs(list_opts); + resp = s3_get_rate_limit([&]() { + SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); + return _client->ListBlobs(list_opts); + }); if (auto response = delete_func(resp.Blobs); response.status.code != ErrorCode::OK) { return response; diff --git a/be/src/io/fs/buffered_reader.cpp b/be/src/io/fs/buffered_reader.cpp index 664997088d98357..43445ed42efd3b9 100644 --- a/be/src/io/fs/buffered_reader.cpp +++ b/be/src/io/fs/buffered_reader.cpp @@ -28,6 +28,8 @@ #include "common/config.h" #include "common/status.h" #include "runtime/exec_env.h" +#include "runtime/thread_context.h" +#include "runtime/workload_management/io_throttle.h" #include "util/runtime_profile.h" #include "util/threadpool.h" @@ -585,15 +587,19 @@ Status PrefetchBuffer::read_buffer(size_t off, const char* out, size_t buf_len, if (UNLIKELY(0 == _len || _offset + _len < off)) { return Status::OK(); } - // [0]: maximum len trying to read, [1] maximum length buffer can provide, [2] actual len buffer has - 
size_t read_len = std::min({buf_len, _offset + _size - off, _offset + _len - off}); + { - SCOPED_RAW_TIMER(&_statis.copy_time); - memcpy((void*)out, _buf.get() + (off - _offset), read_len); + LIMIT_REMOTE_SCAN_IO(bytes_read); + // [0]: maximum len trying to read, [1] maximum length buffer can provide, [2] actual len buffer has + size_t read_len = std::min({buf_len, _offset + _size - off, _offset + _len - off}); + { + SCOPED_RAW_TIMER(&_statis.copy_time); + memcpy((void*)out, _buf.get() + (off - _offset), read_len); + } + *bytes_read = read_len; + _statis.request_io += 1; + _statis.request_bytes += read_len; } - *bytes_read = read_len; - _statis.request_io += 1; - _statis.request_bytes += read_len; if (off + *bytes_read == _offset + _len) { reset_offset(_offset + _whole_buffer_size); } @@ -742,7 +748,8 @@ Status InMemoryFileReader::_close_internal() { Status InMemoryFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) { if (_data == nullptr) { - _data = std::make_unique(_size); + _data = std::make_unique_for_overwrite(_size); + size_t file_size = 0; RETURN_IF_ERROR(_reader->read_at(0, Slice(_data.get(), _size), &file_size, io_ctx)); DCHECK_EQ(file_size, _size); diff --git a/be/src/io/fs/file_reader.cpp b/be/src/io/fs/file_reader.cpp index 966df6ec7ee33ba..86596fd88f7020c 100644 --- a/be/src/io/fs/file_reader.cpp +++ b/be/src/io/fs/file_reader.cpp @@ -22,8 +22,6 @@ #include "io/cache/cached_remote_file_reader.h" #include "io/fs/file_system.h" -#include "runtime/thread_context.h" -#include "runtime/workload_management/io_throttle.h" #include "util/async_io.h" namespace doris::io { @@ -33,17 +31,7 @@ const std::string FileReader::VIRTUAL_REMOTE_DATA_DIR = "virtual_remote_data_dir Status FileReader::read_at(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) { DCHECK(bthread_self() == 0); - std::shared_ptr iot = nullptr; - if (auto* t_ctx = doris::thread_context(true)) { - iot = t_ctx->io_throttle(get_data_dir_path()); - } - if (iot) { - iot->acquire(-1); - } Status st = read_at_impl(offset, result, bytes_read, io_ctx); - if (iot) { - iot->update_next_io_time(*bytes_read); - } if (!st) { LOG(WARNING) << st; } diff --git a/be/src/io/fs/hdfs_file_reader.cpp b/be/src/io/fs/hdfs_file_reader.cpp index bf4e55be13692c0..d43cfae1c28228a 100644 --- a/be/src/io/fs/hdfs_file_reader.cpp +++ b/be/src/io/fs/hdfs_file_reader.cpp @@ -31,6 +31,8 @@ #include "cpp/sync_point.h" #include "io/fs/err_utils.h" #include "io/hdfs_util.h" +#include "runtime/thread_context.h" +#include "runtime/workload_management/io_throttle.h" #include "service/backend_options.h" #include "util/doris_metrics.h" @@ -132,6 +134,8 @@ Status HdfsFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r return Status::OK(); } + LIMIT_REMOTE_SCAN_IO(bytes_read); + size_t has_read = 0; while (has_read < bytes_req) { tSize loop_read = hdfsPread(_handle->fs(), _handle->file(), offset + has_read, @@ -196,6 +200,8 @@ Status HdfsFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r return Status::OK(); } + LIMIT_REMOTE_SCAN_IO(bytes_read); + size_t has_read = 0; while (has_read < bytes_req) { int64_t loop_read = diff --git a/be/src/io/fs/hdfs_file_writer.cpp b/be/src/io/fs/hdfs_file_writer.cpp index ceff2cc429a7d56..ff68d1c837ae921 100644 --- a/be/src/io/fs/hdfs_file_writer.cpp +++ b/be/src/io/fs/hdfs_file_writer.cpp @@ -50,6 +50,9 @@ bvar::Adder hdfs_file_writer_total("hdfs_file_writer_total_num"); bvar::Adder 
hdfs_bytes_written_total("hdfs_file_writer_bytes_written"); bvar::Adder hdfs_file_created_total("hdfs_file_writer_file_created"); bvar::Adder inflight_hdfs_file_writer("inflight_hdfs_file_writer"); +bvar::Adder hdfs_file_writer_async_close_queuing("hdfs_file_writer_async_close_queuing"); +bvar::Adder hdfs_file_writer_async_close_processing( + "hdfs_file_writer_async_close_processing"); static constexpr size_t MB = 1024 * 1024; #ifndef USE_LIBHDFS3 @@ -122,7 +125,11 @@ class HdfsWriteMemUsageRecorder { } private: - size_t max_jvm_heap_size() const { return JniUtil::get_max_jni_heap_memory_size(); } + // clang-format off + size_t max_jvm_heap_size() const { + return JniUtil::get_max_jni_heap_memory_size(); + } + // clang-format on [[maybe_unused]] std::size_t cur_memory_comsuption {0}; std::mutex cur_memory_latch; std::condition_variable cv; @@ -230,8 +237,13 @@ Status HdfsFileWriter::close(bool non_block) { _state = State::ASYNC_CLOSING; _async_close_pack = std::make_unique(); _async_close_pack->future = _async_close_pack->promise.get_future(); - return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func( - [&]() { _async_close_pack->promise.set_value(_close_impl()); }); + hdfs_file_writer_async_close_queuing << 1; + return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func([&]() { + hdfs_file_writer_async_close_queuing << -1; + hdfs_file_writer_async_close_processing << 1; + _async_close_pack->promise.set_value(_close_impl()); + hdfs_file_writer_async_close_processing << -1; + }); } _st = _close_impl(); _state = State::CLOSED; diff --git a/be/src/io/fs/local_file_reader.cpp b/be/src/io/fs/local_file_reader.cpp index 17937bcbd6f41cc..b4f144a633048e4 100644 --- a/be/src/io/fs/local_file_reader.cpp +++ b/be/src/io/fs/local_file_reader.cpp @@ -36,6 +36,8 @@ #include "io/fs/err_utils.h" #include "olap/olap_common.h" #include "olap/options.h" +#include "runtime/thread_context.h" +#include "runtime/workload_management/io_throttle.h" #include "util/async_io.h" #include "util/doris_metrics.h" @@ -58,12 +60,14 @@ void BeConfDataDirReader::get_data_dir_by_file_path(io::Path* file_path, void BeConfDataDirReader::init_be_conf_data_dir( const std::vector& store_paths, - const std::vector& spill_store_paths) { + const std::vector& spill_store_paths, + const std::vector& cache_paths) { for (int i = 0; i < store_paths.size(); i++) { DataDirInfo data_dir_info; data_dir_info.path = store_paths[i].path; data_dir_info.storage_medium = store_paths[i].storage_medium; data_dir_info.data_dir_type = DataDirType::OLAP_DATA_DIR; + data_dir_info.bvar_name = "local_data_dir_" + std::to_string(i); be_config_data_dir_list.push_back(data_dir_info); } @@ -72,6 +76,16 @@ void BeConfDataDirReader::init_be_conf_data_dir( data_dir_info.path = spill_store_paths[i].path; data_dir_info.storage_medium = spill_store_paths[i].storage_medium; data_dir_info.data_dir_type = doris::DataDirType::SPILL_DISK_DIR; + data_dir_info.bvar_name = "spill_data_dir_" + std::to_string(i); + be_config_data_dir_list.push_back(data_dir_info); + } + + for (int i = 0; i < cache_paths.size(); i++) { + doris::DataDirInfo data_dir_info; + data_dir_info.path = cache_paths[i].path; + data_dir_info.storage_medium = TStorageMedium::REMOTE_CACHE; + data_dir_info.data_dir_type = doris::DataDirType::DATA_CACHE_DIR; + data_dir_info.bvar_name = "local_cache_dir_" + std::to_string(i); be_config_data_dir_list.push_back(data_dir_info); } } @@ -120,6 +134,8 @@ Status LocalFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_ 
bytes_req = std::min(bytes_req, _file_size - offset); *bytes_read = 0; + LIMIT_LOCAL_SCAN_IO(get_data_dir_path(), bytes_read); + while (bytes_req != 0) { auto res = SYNC_POINT_HOOK_RETURN_VALUE(::pread(_fd, to, bytes_req, offset), "LocalFileReader::pread", _fd, to); diff --git a/be/src/io/fs/local_file_reader.h b/be/src/io/fs/local_file_reader.h index 4191374111b8af5..0ffd6ccde9e0295 100644 --- a/be/src/io/fs/local_file_reader.h +++ b/be/src/io/fs/local_file_reader.h @@ -29,6 +29,7 @@ namespace doris { struct StorePath; struct DataDirInfo; +struct CachePath; } // namespace doris namespace doris::io { @@ -39,7 +40,8 @@ struct BeConfDataDirReader { static void get_data_dir_by_file_path(Path* file_path, std::string* data_dir_arg); static void init_be_conf_data_dir(const std::vector& store_paths, - const std::vector& spill_store_paths); + const std::vector& spill_store_paths, + const std::vector& cache_paths); }; struct IOContext; diff --git a/be/src/io/fs/multi_table_pipe.cpp b/be/src/io/fs/multi_table_pipe.cpp index d7fdd8a738b2746..357abee9d0fdb1d 100644 --- a/be/src/io/fs/multi_table_pipe.cpp +++ b/be/src/io/fs/multi_table_pipe.cpp @@ -251,7 +251,7 @@ Status MultiTablePipe::exec_plans(ExecEnv* exec_env, std::vector para _inflight_cnt++; RETURN_IF_ERROR(exec_env->fragment_mgr()->exec_plan_fragment( - plan, [this, plan](RuntimeState* state, Status* status) { + plan, QuerySource::ROUTINE_LOAD, [this, plan](RuntimeState* state, Status* status) { DCHECK(state); auto pair = _planned_tables.find(plan.table_name); if (pair == _planned_tables.end()) { diff --git a/be/src/io/fs/s3_file_reader.cpp b/be/src/io/fs/s3_file_reader.cpp index a5c6ec09162cf45..86590d91632162a 100644 --- a/be/src/io/fs/s3_file_reader.cpp +++ b/be/src/io/fs/s3_file_reader.cpp @@ -35,6 +35,8 @@ #include "io/fs/err_utils.h" #include "io/fs/obj_storage_client.h" #include "io/fs/s3_common.h" +#include "runtime/thread_context.h" +#include "runtime/workload_management/io_throttle.h" #include "util/bvar_helper.h" #include "util/doris_metrics.h" #include "util/runtime_profile.h" @@ -120,24 +122,14 @@ Status S3FileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_rea if (!client) { return Status::InternalError("init s3 client error"); } - // // clang-format off - // auto resp = client->get_object( { .bucket = _bucket, .key = _key, }, - // to, offset, bytes_req, bytes_read); - // // clang-format on - // if (resp.status.code != ErrorCode::OK) { - // return std::move(Status(resp.status.code, std::move(resp.status.msg)) - // .append(fmt::format("failed to read from {}", _path.native()))); - // } - // if (*bytes_read != bytes_req) { - // return Status::InternalError("failed to read from {}(bytes read: {}, bytes req: {})", - // _path.native(), *bytes_read, bytes_req); - SCOPED_BVAR_LATENCY(s3_bvar::s3_get_latency); int retry_count = 0; const int base_wait_time = config::s3_read_base_wait_time_ms; // Base wait time in milliseconds const int max_wait_time = config::s3_read_max_wait_time_ms; // Maximum wait time in milliseconds const int max_retries = config::max_s3_client_retry; // wait 1s, 2s, 4s, 8s for each backoff + LIMIT_REMOTE_SCAN_IO(bytes_read); + int total_sleep_time = 0; while (retry_count <= max_retries) { s3_file_reader_read_counter << 1; diff --git a/be/src/io/fs/s3_file_system.cpp b/be/src/io/fs/s3_file_system.cpp index 93f36429485c328..3a5fffb2549938d 100644 --- a/be/src/io/fs/s3_file_system.cpp +++ b/be/src/io/fs/s3_file_system.cpp @@ -18,9 +18,8 @@ #include "io/fs/s3_file_system.h" #include -#include -#include 
+#include #include "common/compiler_util.h" // IWYU pragma: keep // IWYU pragma: no_include @@ -32,7 +31,6 @@ #include // IWYU pragma: keep #include #include -#include #include "common/config.h" #include "common/logging.h" @@ -46,7 +44,7 @@ #include "io/fs/s3_file_reader.h" #include "io/fs/s3_file_writer.h" #include "io/fs/s3_obj_storage_client.h" -#include "util/bvar_helper.h" +#include "runtime/exec_env.h" #include "util/s3_uri.h" #include "util/s3_util.h" @@ -69,13 +67,6 @@ Result get_key(const Path& full_path) { return uri.get_key(); } -// TODO(plat1ko): AwsTransferManager will be deprecated -std::shared_ptr& default_executor() { - static auto executor = Aws::MakeShared( - "default", config::s3_transfer_executor_pool_size); - return executor; -} - } // namespace ObjClientHolder::ObjClientHolder(S3ClientConf conf) : _conf(std::move(conf)) {} @@ -329,7 +320,8 @@ Status S3FileSystem::upload_impl(const Path& local_file, const Path& remote_file FileReaderSPtr local_reader; RETURN_IF_ERROR(io::global_local_filesystem()->open_file(local_file, &local_reader)); size_t local_buffer_size = config::s3_file_system_local_upload_buffer_size; - std::unique_ptr write_buffer = std::make_unique(local_buffer_size); + std::unique_ptr write_buffer = + std::make_unique_for_overwrite(local_buffer_size); size_t cur_read = 0; while (cur_read < local_reader->size()) { size_t bytes_read = 0; @@ -370,7 +362,8 @@ Status S3FileSystem::batch_upload_impl(const std::vector& local_files, FileReaderSPtr local_reader; RETURN_IF_ERROR(io::global_local_filesystem()->open_file(local_file, &local_reader)); size_t local_buffer_size = config::s3_file_system_local_upload_buffer_size; - std::unique_ptr write_buffer = std::make_unique(local_buffer_size); + std::unique_ptr write_buffer = + std::make_unique_for_overwrite(local_buffer_size); size_t cur_read = 0; while (cur_read < local_reader->size()) { size_t bytes_read = 0; @@ -383,13 +376,19 @@ Status S3FileSystem::batch_upload_impl(const std::vector& local_files, return Status::OK(); }; + Status s = Status::OK(); std::vector> futures; for (int i = 0; i < local_files.size(); ++i) { auto task = std::make_shared>(upload_task); futures.emplace_back(task->get_future()); - default_executor()->Submit([t = std::move(task), idx = i]() mutable { (*t)(idx); }); + auto st = ExecEnv::GetInstance()->s3_file_system_thread_pool()->submit_func( + [t = std::move(task), idx = i]() mutable { (*t)(idx); }); + // We shouldn't return immediately since the previously submitted tasks might still be running in the thread pool + if (!st.ok()) { + s = st; + break; + } } - Status s = Status::OK(); for (auto&& f : futures) { auto cur_s = f.get(); if (!cur_s.ok()) { @@ -405,7 +404,7 @@ Status S3FileSystem::download_impl(const Path& remote_file, const Path& local_fi auto key = DORIS_TRY(get_key(remote_file)); int64_t size; RETURN_IF_ERROR(file_size(remote_file, &size)); - std::unique_ptr buf = std::make_unique(size); + std::unique_ptr buf = std::make_unique_for_overwrite(size); size_t bytes_read = 0; // clang-format off auto resp = client->get_object( {.bucket = _bucket, .key = key,},
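The std::make_unique to std::make_unique_for_overwrite switch above matters for large buffers: make_unique value-initializes the char array, zero-filling bytes that upload_impl()/download_impl() immediately overwrite in full, while the C++20 _for_overwrite variant default-initializes and skips that memset. A minimal standalone sketch of the difference (illustrative only, not part of the patch):

#include <cstddef>
#include <memory>

int main() {
    constexpr std::size_t n = 64 * 1024 * 1024; // stand-in for the upload buffer size
    auto zeroed = std::make_unique<char[]>(n);            // value-initialized: n bytes memset to 0
    auto raw = std::make_unique_for_overwrite<char[]>(n); // default-initialized: no memset
    // Skipping the zero-fill is safe only because the caller fully overwrites
    // the buffer (read_at()/get_object()) before reading any byte from it.
    return 0;
}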
"file_created"); -bvar::Adder s3_file_being_written("s3_file_writer", "file_being_written"); +bvar::Adder s3_file_writer_total("s3_file_writer_total_num"); +bvar::Adder s3_bytes_written_total("s3_file_writer_bytes_written"); +bvar::Adder s3_file_created_total("s3_file_writer_file_created"); +bvar::Adder s3_file_being_written("s3_file_writer_file_being_written"); +bvar::Adder s3_file_writer_async_close_queuing("s3_file_writer_async_close_queuing"); +bvar::Adder s3_file_writer_async_close_processing( + "s3_file_writer_async_close_processing"); S3FileWriter::S3FileWriter(std::shared_ptr client, std::string bucket, std::string key, const FileWriterOptions* opts) @@ -141,14 +144,63 @@ Status S3FileWriter::close(bool non_block) { _state = State::ASYNC_CLOSING; _async_close_pack = std::make_unique(); _async_close_pack->future = _async_close_pack->promise.get_future(); - return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func( - [&]() { _async_close_pack->promise.set_value(_close_impl()); }); + s3_file_writer_async_close_queuing << 1; + return ExecEnv::GetInstance()->non_block_close_thread_pool()->submit_func([&]() { + s3_file_writer_async_close_queuing << -1; + s3_file_writer_async_close_processing << 1; + _async_close_pack->promise.set_value(_close_impl()); + s3_file_writer_async_close_processing << -1; + }); } _st = _close_impl(); _state = State::CLOSED; return _st; } +bool S3FileWriter::_complete_part_task_callback(Status s) { + bool ret = false; + if (!s.ok()) [[unlikely]] { + VLOG_NOTICE << "failed at key: " << _obj_storage_path_opts.key + << ", status: " << s.to_string(); + std::unique_lock _lck {_completed_lock}; + _failed = true; + ret = true; + _st = std::move(s); + } + // After the signal, there is a scenario where the previous invocation of _wait_until_finish + // returns to the caller, and subsequently, the S3 file writer is destructed. + // This means that accessing _failed afterwards would result in a heap use after free vulnerability. 
+ _countdown_event.signal(); + return ret; +} + +Status S3FileWriter::_build_upload_buffer() { + auto builder = FileBufferBuilder(); + builder.set_type(BufferType::UPLOAD) + .set_upload_callback([part_num = _cur_part_num, this](UploadFileBuffer& buf) { + _upload_one_part(part_num, buf); + }) + .set_file_offset(_bytes_appended) + .set_sync_after_complete_task([this](auto&& PH1) { + return _complete_part_task_callback(std::forward(PH1)); + }) + .set_is_cancelled([this]() { return _failed.load(); }); + if (_cache_builder != nullptr) { + // We would load the data into file cache asynchronously which indicates + // that this instance of S3FileWriter might have been destructed when we + // try to do writing into file cache, so we make the lambda capture the variable + // we need by value to extend their lifetime + builder.set_allocate_file_blocks_holder( + [builder = *_cache_builder, offset = _bytes_appended]() -> FileBlocksHolderPtr { + return builder.allocate_cache_holder(offset, config::s3_write_buffer_size); + }); + } + RETURN_IF_ERROR(builder.build(&_pending_buf)); + auto* buf = dynamic_cast(_pending_buf.get()); + DCHECK(buf != nullptr); + return Status::OK(); +} + Status S3FileWriter::_close_impl() { VLOG_DEBUG << "S3FileWriter::close, path: " << _obj_storage_path_opts.path.native(); @@ -157,35 +209,13 @@ Status S3FileWriter::_close_impl() { } if (_bytes_appended == 0) { + DCHECK(_cur_part_num == 1); // No data written, but need to create an empty file - auto builder = FileBufferBuilder(); - builder.set_type(BufferType::UPLOAD) - .set_upload_callback([this](UploadFileBuffer& buf) { _put_object(buf); }) - .set_sync_after_complete_task([this](Status s) { - bool ret = false; - if (!s.ok()) [[unlikely]] { - VLOG_NOTICE << "failed at key: " << _obj_storage_path_opts.key - << ", status: " << s.to_string(); - std::unique_lock _lck {_completed_lock}; - _failed = true; - ret = true; - this->_st = std::move(s); - } - // After the signal, there is a scenario where the previous invocation of _wait_until_finish - // returns to the caller, and subsequently, the S3 file writer is destructed. - // This means that accessing _failed afterwards would result in a heap use after free vulnerability. 
- _countdown_event.signal(); - return ret; - }) - .set_is_cancelled([this]() { return _failed.load(); }); - RETURN_IF_ERROR(builder.build(&_pending_buf)); - auto* buf = dynamic_cast(_pending_buf.get()); - DCHECK(buf != nullptr); - if (_used_by_s3_committer) { - buf->set_upload_to_remote([part_num = _cur_part_num, this](UploadFileBuffer& buf) { - _upload_one_part(part_num, buf); - }); - DCHECK(_cur_part_num == 1); + RETURN_IF_ERROR(_build_upload_buffer()); + if (!_used_by_s3_committer) { + auto* pending_buf = dynamic_cast(_pending_buf.get()); + pending_buf->set_upload_to_remote([this](UploadFileBuffer& buf) { _put_object(buf); }); + } else { RETURN_IF_ERROR(_create_multi_upload_request()); } } @@ -217,43 +247,7 @@ Status S3FileWriter::appendv(const Slice* data, size_t data_cnt) { return _st; } if (!_pending_buf) { - auto builder = FileBufferBuilder(); - builder.set_type(BufferType::UPLOAD) - .set_upload_callback( - [part_num = _cur_part_num, this](UploadFileBuffer& buf) { - _upload_one_part(part_num, buf); - }) - .set_file_offset(_bytes_appended) - .set_sync_after_complete_task([this, part_num = _cur_part_num](Status s) { - bool ret = false; - if (!s.ok()) [[unlikely]] { - VLOG_NOTICE << "failed at key: " << _obj_storage_path_opts.key - << ", load part " << part_num << ", st " << s; - std::unique_lock _lck {_completed_lock}; - _failed = true; - ret = true; - this->_st = std::move(s); - } - // After the signal, there is a scenario where the previous invocation of _wait_until_finish - // returns to the caller, and subsequently, the S3 file writer is destructed. - // This means that accessing _failed afterwards would result in a heap use after free vulnerability. - _countdown_event.signal(); - return ret; - }) - .set_is_cancelled([this]() { return _failed.load(); }); - if (_cache_builder != nullptr) { - // We would load the data into file cache asynchronously which indicates - // that this instance of S3FileWriter might have been destructed when we - // try to do writing into file cache, so we make the lambda capture the variable - // we need by value to extend their lifetime - builder.set_allocate_file_blocks_holder( - [builder = *_cache_builder, - offset = _bytes_appended]() -> FileBlocksHolderPtr { - return builder.allocate_cache_holder(offset, - config::s3_write_buffer_size); - }); - } - RETURN_IF_ERROR(builder.build(&_pending_buf)); + RETURN_IF_ERROR(_build_upload_buffer()); } // we need to make sure all parts except the last one to be 5MB or more // and shouldn't be larger than buf diff --git a/be/src/io/fs/s3_file_writer.h b/be/src/io/fs/s3_file_writer.h index c67c79ce5366b03..95ad52ddb670812 100644 --- a/be/src/io/fs/s3_file_writer.h +++ b/be/src/io/fs/s3_file_writer.h @@ -84,6 +84,8 @@ class S3FileWriter final : public FileWriter { Status _set_upload_to_remote_less_than_buffer_size(); void _put_object(UploadFileBuffer& buf); void _upload_one_part(int64_t part_num, UploadFileBuffer& buf); + bool _complete_part_task_callback(Status s); + Status _build_upload_buffer(); ObjectStoragePathOptions _obj_storage_path_opts; diff --git a/be/src/io/fs/s3_obj_storage_client.cpp b/be/src/io/fs/s3_obj_storage_client.cpp index 2bed3241e302782..2c66e819833b941 100644 --- a/be/src/io/fs/s3_obj_storage_client.cpp +++ b/be/src/io/fs/s3_obj_storage_client.cpp @@ -71,6 +71,35 @@ #include "io/fs/s3_common.h" #include "util/bvar_helper.h" +namespace { +inline ::Aws::Client::AWSError<::Aws::S3::S3Errors> s3_error_factory() { + return {::Aws::S3::S3Errors::INTERNAL_FAILURE, "exceeds limit", "exceeds limit", 
false}; +} + +template +auto s3_rate_limit(doris::S3RateLimitType op, Func callback) -> decltype(callback()) { + using T = decltype(callback()); + if (!doris::config::enable_s3_rate_limiter) { + return callback(); + } + auto sleep_duration = doris::S3ClientFactory::instance().rate_limiter(op)->add(1); + if (sleep_duration < 0) { + return T(s3_error_factory()); + } + return callback(); +} + +template +auto s3_get_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::GET, std::move(callback)); +} + +template +auto s3_put_rate_limit(Func callback) -> decltype(callback()) { + return s3_rate_limit(doris::S3RateLimitType::PUT, std::move(callback)); +} +} // namespace + namespace Aws::S3::Model { class DeleteObjectRequest; } // namespace Aws::S3::Model @@ -92,9 +121,9 @@ ObjectStorageUploadResponse S3ObjStorageClient::create_multipart_upload( create_request.SetContentType("application/octet-stream"); SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); - auto outcome = SYNC_POINT_HOOK_RETURN_VALUE(_client->CreateMultipartUpload(create_request), - "s3_file_writer::create_multi_part_upload", - std::cref(create_request).get()); + auto outcome = SYNC_POINT_HOOK_RETURN_VALUE( + s3_put_rate_limit([&]() { return _client->CreateMultipartUpload(create_request); }), + "s3_file_writer::create_multi_part_upload", std::cref(create_request).get()); SYNC_POINT_CALLBACK("s3_file_writer::_open", &outcome); if (outcome.IsSuccess()) { @@ -122,9 +151,9 @@ ObjectStorageResponse S3ObjStorageClient::put_object(const ObjectStoragePathOpti request.SetContentLength(stream.size()); request.SetContentType("application/octet-stream"); SCOPED_BVAR_LATENCY(s3_bvar::s3_put_latency); - auto response = - SYNC_POINT_HOOK_RETURN_VALUE(_client->PutObject(request), "s3_file_writer::put_object", - std::cref(request).get(), &stream); + auto response = SYNC_POINT_HOOK_RETURN_VALUE( + s3_put_rate_limit([&]() { return _client->PutObject(request); }), + "s3_file_writer::put_object", std::cref(request).get(), &stream); if (!response.IsSuccess()) { auto st = s3fs_error(response.GetError(), fmt::format("failed to put object {}", opts.path.native())); @@ -157,8 +186,8 @@ ObjectStorageUploadResponse S3ObjStorageClient::upload_part(const ObjectStorageP { SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); upload_part_outcome = SYNC_POINT_HOOK_RETURN_VALUE( - _client->UploadPart(upload_request), "s3_file_writer::upload_part", - std::cref(upload_request).get(), &stream); + s3_put_rate_limit([&]() { return _client->UploadPart(upload_request); }), + "s3_file_writer::upload_part", std::cref(upload_request).get(), &stream); } TEST_SYNC_POINT_CALLBACK("S3FileWriter::_upload_one_part", &upload_part_outcome); if (!upload_part_outcome.IsSuccess()) { @@ -199,7 +228,7 @@ ObjectStorageResponse S3ObjStorageClient::complete_multipart_upload( TEST_SYNC_POINT_RETURN_WITH_VALUE("S3FileWriter::_complete:3", ObjectStorageResponse(), this); SCOPED_BVAR_LATENCY(s3_bvar::s3_multi_part_upload_latency); auto complete_outcome = SYNC_POINT_HOOK_RETURN_VALUE( - _client->CompleteMultipartUpload(complete_request), + s3_put_rate_limit([&]() { return _client->CompleteMultipartUpload(complete_request); }), "s3_file_writer::complete_multi_part", std::cref(complete_request).get()); if (!complete_outcome.IsSuccess()) { @@ -220,7 +249,8 @@ ObjectStorageHeadResponse S3ObjStorageClient::head_object(const ObjectStoragePat SCOPED_BVAR_LATENCY(s3_bvar::s3_head_latency); auto outcome = SYNC_POINT_HOOK_RETURN_VALUE( - 
_client->HeadObject(request), "s3_file_system::head_object", std::ref(request).get()); + s3_get_rate_limit([&]() { return _client->HeadObject(request); }), + "s3_file_system::head_object", std::ref(request).get()); if (outcome.IsSuccess()) { return {.resp = {convert_to_obj_response(Status::OK())}, .file_size = outcome.GetResult().GetContentLength()}; @@ -247,7 +277,7 @@ ObjectStorageResponse S3ObjStorageClient::get_object(const ObjectStoragePathOpti request.SetResponseStreamFactory(AwsWriteableStreamFactory(buffer, bytes_read)); SCOPED_BVAR_LATENCY(s3_bvar::s3_get_latency); - auto outcome = _client->GetObject(request); + auto outcome = s3_get_rate_limit([&]() { return _client->GetObject(request); }); if (!outcome.IsSuccess()) { return {convert_to_obj_response( s3fs_error(outcome.GetError(), @@ -273,7 +303,7 @@ ObjectStorageResponse S3ObjStorageClient::list_objects(const ObjectStoragePathOp Aws::S3::Model::ListObjectsV2Outcome outcome; { SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); - outcome = _client->ListObjectsV2(request); + outcome = s3_get_rate_limit([&]() { return _client->ListObjectsV2(request); }); } if (!outcome.IsSuccess()) { files->clear(); @@ -310,8 +340,9 @@ ObjectStorageResponse S3ObjStorageClient::delete_objects(const ObjectStoragePath }); del.WithObjects(std::move(objects)).SetQuiet(true); delete_request.SetDelete(std::move(del)); - SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_latency); - auto delete_outcome = _client->DeleteObjects(delete_request); + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_objects_latency); + auto delete_outcome = + s3_put_rate_limit([&]() { return _client->DeleteObjects(delete_request); }); if (!delete_outcome.IsSuccess()) { return {convert_to_obj_response( s3fs_error(delete_outcome.GetError(), @@ -331,8 +362,8 @@ ObjectStorageResponse S3ObjStorageClient::delete_object(const ObjectStoragePathO Aws::S3::Model::DeleteObjectRequest request; request.WithBucket(opts.bucket).WithKey(opts.key); - SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_latency); - auto outcome = _client->DeleteObject(request); + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_object_latency); + auto outcome = s3_put_rate_limit([&]() { return _client->DeleteObject(request); }); if (outcome.IsSuccess() || outcome.GetError().GetResponseCode() == Aws::Http::HttpResponseCode::NOT_FOUND) { return ObjectStorageResponse::OK(); @@ -354,7 +385,7 @@ ObjectStorageResponse S3ObjStorageClient::delete_objects_recursively( Aws::S3::Model::ListObjectsV2Outcome outcome; { SCOPED_BVAR_LATENCY(s3_bvar::s3_list_latency); - outcome = _client->ListObjectsV2(request); + outcome = s3_get_rate_limit([&]() { return _client->ListObjectsV2(request); }); } if (!outcome.IsSuccess()) { return {convert_to_obj_response(s3fs_error( @@ -373,8 +404,9 @@ Aws::S3::Model::Delete del; del.WithObjects(std::move(objects)).SetQuiet(true); delete_request.SetDelete(std::move(del)); - SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_latency); - auto delete_outcome = _client->DeleteObjects(delete_request); + SCOPED_BVAR_LATENCY(s3_bvar::s3_delete_objects_latency); + auto delete_outcome = + s3_put_rate_limit([&]() { return _client->DeleteObjects(delete_request); }); if (!delete_outcome.IsSuccess()) { return {convert_to_obj_response( s3fs_error(delete_outcome.GetError(),
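Both object storage clients now funnel every SDK call through the same rate-limit wrapper; the only difference is how rejection surfaces. The Azure flavor throws into do_azure_client_call(), while the S3 flavor returns an outcome built from s3_error_factory() so callers keep their !outcome.IsSuccess() handling. A condensed sketch of the shared shape, with a stub standing in for S3ClientFactory::instance().rate_limiter(op) (hypothetical names, illustration only):

#include <stdexcept>

struct StubLimiter {
    // Mirrors rate_limiter(op)->add(1): returns the imposed sleep in ms,
    // or a negative value when the request exceeds the configured limit.
    long add(int) { return 0; } // this stub never throttles
};

template <typename Func>
auto with_rate_limit(StubLimiter& limiter, Func callback) -> decltype(callback()) {
    if (limiter.add(1) < 0) {
        // Azure flavor: throw and let the surrounding helper translate it;
        // the S3 flavor would instead return T(s3_error_factory()).
        throw std::runtime_error("exceeds request limit");
    }
    return callback(); // the wrapped SDK call runs only after admission
}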
diff --git a/be/src/io/fs/stream_load_pipe.cpp b/be/src/io/fs/stream_load_pipe.cpp index 21c3856a8156906..ce91a2e839113ca 100644 --- a/be/src/io/fs/stream_load_pipe.cpp +++ b/be/src/io/fs/stream_load_pipe.cpp @@ -111,7 +111,9 @@ Status StreamLoadPipe::read_one_message(std::unique_ptr* data, size_t } Status StreamLoadPipe::append_and_flush(const char* data, size_t size, size_t proto_byte_size) { - ByteBufferPtr buf = ByteBuffer::allocate(BitUtil::RoundUpToPowerOfTwo(size + 1)); + SCOPED_ATTACH_TASK(ExecEnv::GetInstance()->stream_load_pipe_tracker()); + ByteBufferPtr buf; + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(ByteBuffer::create_and_allocate(buf, 128 * 1024)); buf->put_bytes(data, size); buf->flip(); return _append(buf, proto_byte_size); @@ -145,7 +147,8 @@ Status StreamLoadPipe::append(const char* data, size_t size) { // need to allocate a new chunk, min chunk is 64k size_t chunk_size = std::max(_min_chunk_size, size - pos); chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); - _write_buf = ByteBuffer::allocate(chunk_size); + SCOPED_ATTACH_TASK(ExecEnv::GetInstance()->stream_load_pipe_tracker()); + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(ByteBuffer::create_and_allocate(_write_buf, chunk_size)); _write_buf->put_bytes(data + pos, size - pos); return Status::OK(); } diff --git a/be/src/olap/accept_null_predicate.h b/be/src/olap/accept_null_predicate.h index 89d26e2684c1fbf..112101f46ddf057 100644 --- a/be/src/olap/accept_null_predicate.h +++ b/be/src/olap/accept_null_predicate.h @@ -34,6 +34,7 @@ namespace doris { * but pass (set/return true) for NULL value rows. * * At parent, it's used for topn runtime predicate. + * E.g.: the original input indexes are '1,2,3,7,8,9' and the value at index 9 is null; the nested predicate outputs indexes '1,2,3', but we finally output '1,2,3,9' */ class AcceptNullPredicate : public ColumnPredicate { ENABLE_FACTORY_CREATOR(AcceptNullPredicate); @@ -44,8 +45,6 @@ class AcceptNullPredicate : public ColumnPredicate { PredicateType type() const override { return _nested->type(); } - void set_nested(ColumnPredicate* nested) { _nested.reset(nested); } - Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* roaring) const override { return _nested->evaluate(iterator, num_rows, roaring); @@ -64,11 +63,14 @@ class AcceptNullPredicate : public ColumnPredicate { void evaluate_and(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const override { if (column.has_null()) { + std::vector original_flags(size); + memcpy(original_flags.data(), flags, size); + const auto& nullable_col = assert_cast(column); _nested->evaluate_and(nullable_col.get_nested_column(), sel, size, flags); const auto& nullmap = nullable_col.get_null_map_data(); for (uint16_t i = 0; i < size; ++i) { - flags[i] |= nullmap[sel[i]]; + flags[i] |= (original_flags[i] && nullmap[sel[i]]); } } else { _nested->evaluate_and(column, sel, size, flags); @@ -77,20 +79,7 @@ class AcceptNullPredicate : public ColumnPredicate { void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const override { - if (column.has_null()) { - const auto& nullable_col = assert_cast(column); - _nested->evaluate_or(nullable_col.get_nested_column(), sel, size, flags); - - // call evaluate_or and set true for NULL rows - for (uint16_t i = 0; i < size; ++i) { - uint16_t idx = sel[i]; - if (!flags[i] && nullable_col.is_null_at(idx)) { - flags[i] = true; - } - } - } else { - _nested->evaluate_or(column, sel, size, flags); - } + DCHECK(false) << "should not reach here"; } bool evaluate_and(const std::pair& statistic) const override { @@ -158,6 +147,8 @@ class AcceptNullPredicate : public ColumnPredicate { } // create selected_flags uint16_t max_idx = sel[size - 1]; + std::vector old_sel(size); + memcpy(old_sel.data(), sel, sizeof(uint16_t) * size); const auto& nullable_col = assert_cast(column); // call nested predicate evaluate @@ -165,14 +156,18 @@ class AcceptNullPredicate : public ColumnPredicate { // process NULL values if (new_size < size) { - std::vector selected(max_idx + 1); - memcpy(selected.data(), nullable_col.get_null_map_data().data(), - (max_idx + 1) * sizeof(bool)); + std::vector selected(max_idx + 1, 0); + const auto* nullmap = nullable_col.get_null_map_data().data(); // add rows selected by _nested->evaluate for (uint16_t i = 0; i < new_size; ++i) { uint16_t row_idx = sel[i]; selected[row_idx] = true; } + // re-accept NULL rows from the original selection + for (uint16_t i = 0; i < size; ++i) { + uint16_t row_idx = old_sel[i]; + selected[row_idx] |= nullmap[row_idx]; + } // recalculate new_size and sel array new_size = 0;
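The evaluate_and() change above fixes an AND-chain bug: the old flags[i] |= nullmap[sel[i]] could resurrect rows that an earlier predicate in the conjunction had already rejected, whereas the new code re-accepts a NULL row only if it was still passing on entry. A standalone illustration, with plain arrays standing in for the column and selection machinery:

#include <cassert>

int main() {
    bool flags[3] = {true, false, true}; // incoming AND state: row 1 already rejected upstream
    bool original_flags[3] = {flags[0], flags[1], flags[2]};
    bool nested_pass[3] = {false, false, false}; // the nested predicate fails every row
    bool is_null[3] = {true, true, false};       // rows 0 and 1 hold NULL

    for (int i = 0; i < 3; ++i) {
        flags[i] = flags[i] && nested_pass[i];         // effect of _nested->evaluate_and()
        // old code: flags[i] |= is_null[i];           // would wrongly revive row 1
        flags[i] |= (original_flags[i] && is_null[i]); // fixed merge
    }
    assert(flags[0]);  // NULL row that was still passing: accepted
    assert(!flags[1]); // NULL row already rejected upstream: stays rejected
    assert(!flags[2]); // non-NULL row failing the nested predicate: rejected
    return 0;
}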
diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index 22940b40206de42..934b00f56698b83 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -24,6 +24,7 @@ #include "olap/calc_delete_bitmap_executor.h" #include "olap/delete_bitmap_calculator.h" #include "olap/memtable.h" +#include "olap/partial_update_info.h" #include "olap/primary_key_index.h" #include "olap/rowid_conversion.h" #include "olap/rowset/beta_rowset.h" @@ -36,6 +37,7 @@ #include "util/crc32c.h" #include "util/debug_points.h" #include "util/doris_metrics.h" +#include "vec/common/assert_cast.h" #include "vec/common/schema_util.h" #include "vec/data_types/data_type_factory.hpp" #include "vec/jsonb/serialize.h" @@ -55,51 +57,6 @@ bvar::LatencyRecorder g_tablet_update_delete_bitmap_latency("doris_pk", "update_ static bvar::Adder g_total_tablet_num("doris_total_tablet_num"); -// read columns by read plan -// read_index: ori_pos-> block_idx -Status read_columns_by_plan(TabletSchemaSPtr tablet_schema, - const std::vector cids_to_read, - const PartialUpdateReadPlan& read_plan, - const std::map& rsid_to_rowset, - vectorized::Block& block, std::map* read_index) { - bool has_row_column = tablet_schema->has_row_store_for_all_columns(); - auto mutable_columns = block.mutate_columns(); - size_t read_idx = 0; - for (auto rs_it : read_plan) { - for (auto seg_it : rs_it.second) { - auto rowset_iter = rsid_to_rowset.find(rs_it.first); - CHECK(rowset_iter != rsid_to_rowset.end()); - std::vector rids; - for (auto id_and_pos : seg_it.second) { - rids.emplace_back(id_and_pos.rid); - (*read_index)[id_and_pos.pos] = read_idx++; - } - if (has_row_column) { - auto st = BaseTablet::fetch_value_through_row_column(rowset_iter->second, - *tablet_schema, seg_it.first, - rids, cids_to_read, block); - if (!st.ok()) { - LOG(WARNING) << "failed to fetch value through row column"; - return st; - } - continue; - } - for (size_t cid = 0; cid < mutable_columns.size(); ++cid) { - TabletColumn tablet_column = tablet_schema->column(cids_to_read[cid]); - auto st = BaseTablet::fetch_value_by_rowids(rowset_iter->second, seg_it.first, rids, - tablet_column, mutable_columns[cid]); - // set read value to output block - if (!st.ok()) { - LOG(WARNING) << "failed to fetch value"; - return st; - } - } - } - } - block.set_columns(std::move(mutable_columns)); - return Status::OK(); -} - Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t segid, const TabletColumn& target_column, SegmentCacheHandle* segment_cache_handle, @@ -554,27 +511,6 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, bool with_seq_col, return Status::Error("can't find 
key in all rowsets"); } -void BaseTablet::prepare_to_read(const RowLocation& row_location, size_t pos, - PartialUpdateReadPlan* read_plan) { - auto rs_it = read_plan->find(row_location.rowset_id); - if (rs_it == read_plan->end()) { - std::map> segid_to_rid; - std::vector rid_pos; - rid_pos.emplace_back(RidAndPos {row_location.row_id, pos}); - segid_to_rid.emplace(row_location.segment_id, rid_pos); - read_plan->emplace(row_location.rowset_id, segid_to_rid); - return; - } - auto seg_it = rs_it->second.find(row_location.segment_id); - if (seg_it == rs_it->second.end()) { - std::vector rid_pos; - rid_pos.emplace_back(RidAndPos {row_location.row_id, pos}); - rs_it->second.emplace(row_location.segment_id, rid_pos); - return; - } - seg_it->second.emplace_back(RidAndPos {row_location.row_id, pos}); -} - // if user pass a token, then all calculation works will submit to a threadpool, // user can get all delete bitmaps from that token. // if `token` is nullptr, the calculation will run in local, and user can get the result @@ -753,8 +689,8 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, // So here we should read version 5's columns and build a new row, which is // consists of version 6's update columns and version 5's origin columns // here we build 2 read plan for ori values and update values - prepare_to_read(loc, pos, &read_plan_ori); - prepare_to_read(RowLocation {rowset_id, seg->id(), row_id}, pos, &read_plan_update); + read_plan_ori.prepare_to_read(loc, pos); + read_plan_update.prepare_to_read(RowLocation {rowset_id, seg->id(), row_id}, pos); rsid_to_rowset[rowset_find->rowset_id()] = rowset_find; ++pos; // delete bitmap will be calculate when memtable flush and @@ -925,6 +861,40 @@ Status BaseTablet::fetch_value_by_rowids(RowsetSharedPtr input_rowset, uint32_t return Status::OK(); } +const signed char* BaseTablet::get_delete_sign_column_data(vectorized::Block& block, + size_t rows_at_least) { + if (const vectorized::ColumnWithTypeAndName* delete_sign_column = + block.try_get_by_name(DELETE_SIGN); + delete_sign_column != nullptr) { + const auto& delete_sign_col = + reinterpret_cast(*(delete_sign_column->column)); + if (delete_sign_col.size() >= rows_at_least) { + return delete_sign_col.get_data().data(); + } + } + return nullptr; +}; + +Status BaseTablet::generate_default_value_block(const TabletSchema& schema, + const std::vector& cids, + const std::vector& default_values, + const vectorized::Block& ref_block, + vectorized::Block& default_value_block) { + auto mutable_default_value_columns = default_value_block.mutate_columns(); + for (auto i = 0; i < cids.size(); ++i) { + const auto& column = schema.column(cids[i]); + if (column.has_default_value()) { + const auto& default_value = default_values[i]; + vectorized::ReadBuffer rb(const_cast(default_value.c_str()), + default_value.size()); + RETURN_IF_ERROR(ref_block.get_by_position(i).type->from_string( + rb, mutable_default_value_columns[i].get())); + } + } + default_value_block.set_columns(std::move(mutable_default_value_columns)); + return Status::OK(); +} + Status BaseTablet::generate_new_block_for_partial_update( TabletSchemaSPtr rowset_schema, const PartialUpdateInfo* partial_update_info, const PartialUpdateReadPlan& read_plan_ori, const PartialUpdateReadPlan& read_plan_update, @@ -942,75 +912,81 @@ Status BaseTablet::generate_new_block_for_partial_update( auto old_block = rowset_schema->create_block_by_cids(missing_cids); auto update_block = rowset_schema->create_block_by_cids(update_cids); - std::map 
read_index_old; - RETURN_IF_ERROR(read_columns_by_plan(rowset_schema, missing_cids, read_plan_ori, rsid_to_rowset, - old_block, &read_index_old)); - + // rowid in the final block (starts from 0, increases continuously) -> rowid to read in update_block std::map read_index_update; - RETURN_IF_ERROR(read_columns_by_plan(rowset_schema, update_cids, read_plan_update, - rsid_to_rowset, update_block, &read_index_update)); - const vectorized::Int8* delete_sign_column_data = nullptr; - if (const vectorized::ColumnWithTypeAndName* delete_sign_column = - old_block.try_get_by_name(DELETE_SIGN); - delete_sign_column != nullptr) { - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column->column)); - delete_sign_column_data = delete_sign_col.get_data().data(); + // read the current rowset first; if a row in the current rowset has its delete sign marked, + // we don't need to read values from the old block + RETURN_IF_ERROR(read_plan_update.read_columns_by_plan( + *rowset_schema, update_cids, rsid_to_rowset, update_block, &read_index_update)); + size_t update_rows = read_index_update.size(); + for (auto i = 0; i < update_cids.size(); ++i) { + for (auto idx = 0; idx < update_rows; ++idx) { + full_mutable_columns[update_cids[i]]->insert_from( + *update_block.get_columns_with_type_and_name()[i].column.get(), + read_index_update[idx]); + } } + // if there is a sequence column in the table, we need to read the sequence column, + // otherwise it may cause the merge-on-read based compaction policy to produce incorrect results + const auto* __restrict new_block_delete_signs = + rowset_schema->has_sequence_col() + ? nullptr + : get_delete_sign_column_data(update_block, update_rows); + + // rowid in the final block (starts from 0, increases, may not be continuous because we skip reading some rows) -> rowid to read in old_block + std::map read_index_old; + RETURN_IF_ERROR(read_plan_ori.read_columns_by_plan(*rowset_schema, missing_cids, rsid_to_rowset, + old_block, &read_index_old, + new_block_delete_signs)); + size_t old_rows = read_index_old.size(); + const auto* __restrict old_block_delete_signs = + get_delete_sign_column_data(old_block, old_rows); + // build default value block auto default_value_block = old_block.clone_empty(); - auto mutable_default_value_columns = default_value_block.mutate_columns(); - if (delete_sign_column_data != nullptr) { - for (auto i = 0; i < missing_cids.size(); ++i) { - const auto& column = rowset_schema->column(missing_cids[i]); - if (column.has_default_value()) { - const auto& default_value = partial_update_info->default_values[i]; - vectorized::ReadBuffer rb(const_cast(default_value.c_str()), - default_value.size()); - RETURN_IF_ERROR(old_block.get_by_position(i).type->from_string( - rb, mutable_default_value_columns[i].get())); - } - } + if (old_block_delete_signs != nullptr || new_block_delete_signs != nullptr) { + RETURN_IF_ERROR(BaseTablet::generate_default_value_block( + *rowset_schema, missing_cids, partial_update_info->default_values, old_block, + default_value_block)); } + auto mutable_default_value_columns = default_value_block.mutate_columns();
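The default-value handling above delegates to the new BaseTablet::generate_default_value_block() helper; its per-column core is the ReadBuffer/from_string parse already visible in the removed block. Restated as a small function with placeholder names (`type` is the column's data type, `dst` its mutable column; a sketch, not compilable on its own):

// Parse one textual default into row 0 of `dst`; callers then replicate that
// row with insert_from(*dst, 0) wherever a conflicting delete sign forces the
// column back to its default.
Status parse_default_into_column(const std::string& default_value,
                                 const vectorized::DataTypePtr& type,
                                 vectorized::IColumn* dst) {
    vectorized::ReadBuffer rb(const_cast<char*>(default_value.c_str()), default_value.size());
    return type->from_string(rb, dst);
}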
- // build full block - CHECK(read_index_old.size() == read_index_update.size()); + CHECK(update_rows >= old_rows); + // build full block for (auto i = 0; i < missing_cids.size(); ++i) { const auto& rs_column = rowset_schema->column(missing_cids[i]); - for (auto idx = 0; idx < read_index_old.size(); ++idx) { - // if the conflict update is a delete sign, which means that the key is - // not exist now, we should not read old values from the deleted data, - // and should use default value instead. - // NOTE: since now we are in the publishing phase, all data is commited - // before, even the `strict_mode` is true (which requires partial update - // load job can't insert new keys), this "new" key MUST be written into - // the new generated segment file. - if (delete_sign_column_data != nullptr && - delete_sign_column_data[read_index_old[idx]] != 0) { - auto& mutable_column = full_mutable_columns[missing_cids[i]]; + auto& mutable_column = full_mutable_columns[missing_cids[i]]; + for (auto idx = 0; idx < update_rows; ++idx) { + // There are two cases where we don't need to read values from the old data: + // 1. if the conflicting new row's delete sign is marked, which means the value columns + // of the row will not be read. So we don't need to read the missing values from the previous rows. + // 2. if the conflicting old row's delete sign is marked, which means that the key does not exist now, + // we should not read old values from the deleted data, and should use default values instead. + // NOTE: since now we are in the publishing phase, all data is committed + // before, even if `strict_mode` is true (which requires that a partial update + // load job can't insert new keys), this "new" key MUST be written into + // the newly generated segment file. + if (new_block_delete_signs != nullptr && new_block_delete_signs[idx]) { + mutable_column->insert_default(); + } else if (old_block_delete_signs != nullptr && + old_block_delete_signs[read_index_old[idx]] != 0) { if (rs_column.has_default_value()) { mutable_column->insert_from(*mutable_default_value_columns[i].get(), 0); } else if (rs_column.is_nullable()) { - assert_cast(mutable_column.get()) + assert_cast( + mutable_column.get()) ->insert_null_elements(1); } else { mutable_column->insert_default(); } - continue; + } else { + mutable_column->insert_from( + *old_block.get_columns_with_type_and_name()[i].column.get(), + read_index_old[idx]); } - full_mutable_columns[missing_cids[i]]->insert_from( - *old_block.get_columns_with_type_and_name()[i].column.get(), - read_index_old[idx]); - } - } - for (auto i = 0; i < update_cids.size(); ++i) { - for (auto idx = 0; idx < read_index_update.size(); ++idx) { - full_mutable_columns[update_cids[i]]->insert_from( - *update_block.get_columns_with_type_and_name()[i].column.get(), - read_index_update[idx]); - } - } } output_block->set_columns(std::move(full_mutable_columns)); @@ -1179,17 +1155,6 @@ Status BaseTablet::check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap return Status::OK(); } -void BaseTablet::_remove_sentinel_mark_from_delete_bitmap(DeleteBitmapPtr delete_bitmap) { - for (auto it = delete_bitmap->delete_bitmap.begin(), end = delete_bitmap->delete_bitmap.end(); - it != end;) { - if (std::get<1>(it->first) == DeleteBitmap::INVALID_SEGMENT_ID) { - it = delete_bitmap->delete_bitmap.erase(it); - } else { - ++it; - } - } -} - Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInfo* txn_info, int64_t txn_id, int64_t txn_expiration) { SCOPED_BVAR_LATENCY(g_tablet_update_delete_bitmap_latency); @@ -1201,7 +1166,9 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf std::unique_ptr transient_rs_writer; DeleteBitmapPtr delete_bitmap = txn_info->delete_bitmap; - if (txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update) { + bool is_partial_update = + txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update; + if (is_partial_update) { transient_rs_writer = DORIS_TRY(self->create_transient_rowset_writer( *rowset, txn_info->partial_update_info, txn_expiration)); // Partial update might generate new segments when there is conflicts while publish, and mark @@ -1242,6 +1209,52 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf } auto t3 = watch.get_elapse_time_us(); + // If a rowset is produced by compaction before the commit phase of the partial update load + // and is not included in txn_info->rowset_ids, we can skip the alignment process of that rowset + // because data remains the same before and after compaction. But we still need to calculate + // the delete bitmap for that rowset. + std::vector rowsets_skip_alignment; + if (is_partial_update) { + int64_t max_version_in_flush_phase = + txn_info->partial_update_info->max_version_in_flush_phase; + DCHECK(max_version_in_flush_phase != -1); + std::vector remained_rowsets; + for (const auto& rowset : specified_rowsets) { + if (rowset->end_version() <= max_version_in_flush_phase && + rowset->produced_by_compaction()) { + rowsets_skip_alignment.emplace_back(rowset); + } else { + remained_rowsets.emplace_back(rowset); + } + } + if (!rowsets_skip_alignment.empty()) { + specified_rowsets = std::move(remained_rowsets); + } + } +
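The partitioning above reduces to one test per rowset. Written out as a standalone predicate (names follow the hunk; illustrative only, not part of the patch):

// A compaction output whose versions were all visible when the load flushed
// (end_version() <= max_version_in_flush_phase) contains exactly the rows the
// load already aligned against, so the alignment pass can be skipped; its
// delete bitmap is still computed separately below.
bool can_skip_alignment(const RowsetSharedPtr& rs, int64_t max_version_in_flush_phase) {
    return rs->end_version() <= max_version_in_flush_phase && rs->produced_by_compaction();
}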
DORIS_TRY(self->create_transient_rowset_writer( *rowset, txn_info->partial_update_info, txn_expiration)); // Partial update might generate new segments when there are conflicts during publish, and mark @@ -1242,6 +1209,52 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf } auto t3 = watch.get_elapse_time_us(); + // If a rowset is produced by compaction before the commit phase of the partial update load + // and is not included in txn_info->rowset_ids, we can skip the alignment process of that rowset + // because data remains the same before and after compaction. But we still need to calculate + // the delete bitmap for that rowset. + std::vector rowsets_skip_alignment; + if (is_partial_update) { + int64_t max_version_in_flush_phase = + txn_info->partial_update_info->max_version_in_flush_phase; + DCHECK(max_version_in_flush_phase != -1); + std::vector remained_rowsets; + for (const auto& rowset : specified_rowsets) { + if (rowset->end_version() <= max_version_in_flush_phase && + rowset->produced_by_compaction()) { + rowsets_skip_alignment.emplace_back(rowset); + } else { + remained_rowsets.emplace_back(rowset); + } + } + if (!rowsets_skip_alignment.empty()) { + specified_rowsets = std::move(remained_rowsets); + } + } + + DBUG_EXECUTE_IF("BaseTablet::update_delete_bitmap.enable_spin_wait", { + auto token = dp->param("token", "invalid_token"); + while (DebugPoints::instance()->is_enable("BaseTablet::update_delete_bitmap.block")) { + auto block_dp = DebugPoints::instance()->get_debug_point( + "BaseTablet::update_delete_bitmap.block"); + if (block_dp) { + auto wait_token = block_dp->param("wait_token", ""); + if (wait_token != token) { + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + }); + + if (!rowsets_skip_alignment.empty()) { + auto token = self->calc_delete_bitmap_executor()->create_token(); + // set rowset_writer to nullptr to skip the alignment process + RETURN_IF_ERROR(calc_delete_bitmap(self, rowset, segments, rowsets_skip_alignment, + delete_bitmap, cur_version - 1, token.get(), nullptr)); + RETURN_IF_ERROR(token->wait()); + } + // When there is only one segment, it will be calculated in the current thread. // Otherwise, it will be submitted to the thread pool for calculation.
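The rowsets_skip_alignment partitioning above is the heart of this optimization. A reduced sketch of the same split, with a stand-in struct instead of RowsetSharedPtr (all names here are assumptions):

#include <cstdint>
#include <utility>
#include <vector>

struct RowsetStub {  // stand-in for RowsetSharedPtr
    int64_t end_version;
    bool produced_by_compaction;
};

// Returns {rowsets that can skip alignment, rowsets that still need it}.
std::pair<std::vector<RowsetStub>, std::vector<RowsetStub>> split_for_alignment(
        const std::vector<RowsetStub>& specified_rowsets, int64_t max_version_in_flush_phase) {
    std::vector<RowsetStub> skip_alignment;
    std::vector<RowsetStub> remained;
    for (const auto& rs : specified_rowsets) {
        // compaction rewrites rows without changing them, so a compaction output
        // already fully visible at flush time only needs its delete bitmap computed
        if (rs.end_version <= max_version_in_flush_phase && rs.produced_by_compaction) {
            skip_alignment.push_back(rs);
        } else {
            remained.push_back(rs);
        }
    }
    return {std::move(skip_alignment), std::move(remained)};
}

A rowset qualifies only if it is both a compaction output and fully covered by versions the load had already seen at flush time; anything newer may contain genuinely new rows and still needs the full alignment pass.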
if (segments.size() <= 1) { @@ -1433,7 +1446,8 @@ Status BaseTablet::update_delete_bitmap_without_lock( return Status::InternalError( "debug tablet update delete bitmap without lock random failed"); } else { - LOG(INFO) << "BaseTablet.update_delete_bitmap_without_lock.random_failed not triggered" + LOG(INFO) << "BaseTablet.update_delete_bitmap_without_lock.random_failed not " + "triggered" << ", rnd:" << rnd << ", percent: " << percent; } }); @@ -1481,7 +1495,7 @@ Status BaseTablet::update_delete_bitmap_without_lock( if (!st.ok()) { LOG(WARNING) << fmt::format("delete bitmap correctness check failed in publish phase!"); } - self->_remove_sentinel_mark_from_delete_bitmap(delete_bitmap); + delete_bitmap->remove_sentinel_marks(); } for (auto& iter : delete_bitmap->delete_bitmap) { self->_tablet_meta->delete_bitmap().merge( diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index f958d398fd5d00f..ab289822df891f1 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -24,7 +24,6 @@ #include "common/status.h" #include "olap/iterators.h" #include "olap/olap_common.h" -#include "olap/partial_update_info.h" #include "olap/rowset/segment_v2/segment.h" #include "olap/tablet_fwd.h" #include "olap/tablet_meta.h" @@ -39,6 +38,8 @@ class RowsetWriter; class CalcDeleteBitmapToken; class SegmentCacheHandle; class RowIdConversion; +struct PartialUpdateInfo; +class PartialUpdateReadPlan; struct TabletWithVersion { BaseTabletSPtr tablet; @@ -150,9 +151,6 @@ class BaseTablet { std::vector>& segment_caches, RowsetSharedPtr* rowset = nullptr, bool with_rowid = true); - static void prepare_to_read(const RowLocation& row_location, size_t pos, - PartialUpdateReadPlan* read_plan); - // calc delete bitmap when flush memtable, use a fake version to calc // For example, cur max version is 5, and we use version 6 to calc but // finally this rowset publish version with 8, we should make up data @@ -189,6 +187,15 @@ class BaseTablet { int64_t txn_id, const RowsetIdUnorderedSet& rowset_ids, std::vector* rowsets = nullptr); + static const signed char* get_delete_sign_column_data(vectorized::Block& block, + size_t rows_at_least = 0); + + static Status generate_default_value_block(const TabletSchema& schema, + const std::vector& cids, + const std::vector& default_values, + const vectorized::Block& ref_block, + vectorized::Block& default_value_block); + static Status generate_new_block_for_partial_update( TabletSchemaSPtr rowset_schema, const PartialUpdateInfo* partial_update_info, const PartialUpdateReadPlan& read_plan_ori, @@ -289,7 +296,6 @@ class BaseTablet { static void _rowset_ids_difference(const RowsetIdUnorderedSet& cur, const RowsetIdUnorderedSet& pre, RowsetIdUnorderedSet* to_add, RowsetIdUnorderedSet* to_del); - static void _remove_sentinel_mark_from_delete_bitmap(DeleteBitmapPtr delete_bitmap); Status _capture_consistent_rowsets_unlocked(const std::vector& version_path, std::vector* rowsets) const; diff --git a/be/src/olap/bitmap_filter_predicate.h b/be/src/olap/bitmap_filter_predicate.h index 7075595e66e4cf7..431182c4ce821ea 100644 --- a/be/src/olap/bitmap_filter_predicate.h +++ b/be/src/olap/bitmap_filter_predicate.h @@ -75,6 +75,8 @@ class BitmapFilterColumnPredicate : public ColumnPredicate { } private: + bool _can_ignore() const override { return false; } + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, uint16_t size) const override; diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 
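The removed BaseTablet::_remove_sentinel_mark_from_delete_bitmap helper now sits behind the DeleteBitmap::remove_sentinel_marks() call above. The underlying idiom is the standard erase-while-iterating sweep over an ordered map; sketched here with an assumed key layout and INVALID marker, not the real DeleteBitmap types:

#include <cstdint>
#include <map>
#include <tuple>

constexpr uint32_t kInvalidSegmentId = UINT32_MAX;  // assumed sentinel marker
using BitmapKey = std::tuple<int64_t /*rowset*/, uint32_t /*segment*/, int64_t /*version*/>;

void remove_sentinel_marks(std::map<BitmapKey, int>& bitmap) {
    for (auto it = bitmap.begin(); it != bitmap.end();) {
        if (std::get<1>(it->first) == kInvalidSegmentId) {
            it = bitmap.erase(it);  // erase returns the iterator past the removed entry
        } else {
            ++it;
        }
    }
}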
9cc95d7152aa321..0e2ae500ac6a583 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -25,6 +25,7 @@ #include "vec/columns/column_nullable.h" #include "vec/columns/column_vector.h" #include "vec/columns/predicate_column.h" +#include "vec/common/assert_cast.h" #include "vec/exprs/vruntimefilter_wrapper.h" namespace doris { @@ -54,6 +55,8 @@ class BloomFilterColumnPredicate : public ColumnPredicate { return input_type == T || (is_string_type(input_type) && is_string_type(T)); } + double get_ignore_threshold() const override { return get_bloom_filter_ignore_thredhold(); } + private: bool _can_ignore() const override { return _filter->is_runtime_filter(); } diff --git a/be/src/olap/column_mapping.h b/be/src/olap/column_mapping.h index 047af1e9d1190bb..bf3a6118d76bac0 100644 --- a/be/src/olap/column_mapping.h +++ b/be/src/olap/column_mapping.h @@ -30,11 +30,11 @@ struct ColumnMapping { ColumnMapping() = default; virtual ~ColumnMapping() = default; - bool has_reference() const { return expr != nullptr || ref_column >= 0; } + bool has_reference() const { return expr != nullptr || ref_column_idx >= 0; } // <0: use default value // >=0: use origin column - int32_t ref_column = -1; + int32_t ref_column_idx = -1; // normally for default value. stores values for filters WrapperField* default_value = nullptr; std::shared_ptr expr; diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index d5b5abe1501cbde..a4a8e637bc63a53 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -183,27 +183,21 @@ class ColumnPredicate { "Not Implemented evaluate with inverted index, please check the predicate"); } - virtual double get_ignore_threshold() const { - return vectorized::VRuntimeFilterWrapper::EXPECTED_FILTER_RATE; - } + virtual double get_ignore_threshold() const { return 0; } // evaluate predicate on IColumn // a short circuit eval way uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, uint16_t size) const { - if (_always_true) { + if (always_true(true)) { return size; } uint16_t new_size = _evaluate_inner(column, sel, size); _evaluated_rows += size; _passed_rows += new_size; - if (_can_ignore()) { - // If the pass rate is very high, for example > 50%, then the filter is useless. - // Some filter is useless, for example ssb 4.3, it consumes a lot of cpu but it is - // useless. 
- vectorized::VRuntimeFilterWrapper::calculate_filter( - get_ignore_threshold(), _evaluated_rows - _passed_rows, _evaluated_rows, - _has_calculate_filter, _always_true); + if (_can_ignore() && !_judge_counter) { + vectorized::VRuntimeFilterWrapper::judge_selectivity( + get_ignore_threshold(), size - new_size, size, _always_true, _judge_counter); } return new_size; } @@ -308,7 +302,15 @@ class ColumnPredicate { } } - bool always_true() const { return _always_true; } + bool always_true(bool update) const { + if (update) { + _judge_counter--; + if (!_judge_counter) { + _always_true = false; + } + } + return _always_true; + } protected: virtual std::string _debug_string() const = 0; @@ -330,8 +332,8 @@ class ColumnPredicate { std::shared_ptr _predicate_params; mutable uint64_t _evaluated_rows = 1; mutable uint64_t _passed_rows = 0; + mutable int _judge_counter = 0; mutable bool _always_true = false; - mutable bool _has_calculate_filter = false; }; } //namespace doris diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 5e25b39fe66a93a..9109c59e8c235e2 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -197,9 +197,6 @@ Status Compaction::merge_input_rowsets() { _tablet->last_compaction_status = res; if (!res.ok()) { - LOG(WARNING) << "fail to do " << compaction_name() << ". res=" << res - << ", tablet=" << _tablet->tablet_id() - << ", output_version=" << _output_version; return res; } @@ -352,7 +349,9 @@ bool CompactionMixin::handle_ordered_data_compaction() { // check delete version: if compaction type is base compaction and // has a delete version, use original compaction - if (compaction_type() == ReaderType::READER_BASE_COMPACTION) { + if (compaction_type() == ReaderType::READER_BASE_COMPACTION || + (_allow_delete_in_cumu_compaction && + compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION)) { for (auto& rowset : _input_rowsets) { if (rowset->rowset_meta()->has_delete_predicate()) { return false; @@ -400,15 +399,14 @@ Status CompactionMixin::execute_compact() { data_dir->disks_compaction_score_increment(permits); data_dir->disks_compaction_num_increment(1); - Status st = execute_compact_impl(permits); - _tablet->compaction_count.fetch_add(1, std::memory_order_relaxed); + auto record_compaction_stats = [&](const doris::Exception& ex) { + _tablet->compaction_count.fetch_add(1, std::memory_order_relaxed); + data_dir->disks_compaction_score_increment(-permits); + data_dir->disks_compaction_num_increment(-1); + }; - data_dir->disks_compaction_score_increment(-permits); - data_dir->disks_compaction_num_increment(-1); - - if (!st.ok()) { - return st; - } + HANDLE_EXCEPTION_IF_CATCH_EXCEPTION(execute_compact_impl(permits), record_compaction_stats); + record_compaction_stats(doris::Exception()); if (enable_compaction_checksum) { EngineChecksumTask checksum_task(_engine, _tablet->tablet_id(), _tablet->schema_hash(), @@ -511,8 +509,8 @@ Status Compaction::do_inverted_index_compaction() { } else { DCHECK(false) << err_msg; } + // log here just for debugging, do not return error LOG(WARNING) << err_msg; - return Status::InternalError(err_msg); } } @@ -688,6 +686,9 @@ Status Compaction::do_inverted_index_compaction() { << st; return st; } + for (const auto& writer : inverted_index_file_writers) { + writer->set_file_writer_opts(ctx.get_file_writer_options()); + } } // use tmp file dir to store index files @@ -762,15 +763,17 @@ Status Compaction::do_inverted_index_compaction() { } } + std::vector all_inverted_index_file_info(dest_segment_num); 
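The ColumnPredicate change above swaps the one-shot calculate_filter for judge_selectivity plus a _judge_counter, so a predicate judged useless is re-measured periodically instead of being disabled forever. Below is a compact model of that mechanism; the period, the threshold, and the exact judging rule are assumptions inferred from the call sites in this patch (the real values live in VRuntimeFilterWrapper and the get_*_ignore_thredhold helpers):

#include <cstdint>

constexpr int kJudgePeriod = 1024;  // assumed re-judge period

struct JudgedPredicate {
    double ignore_threshold = 0.1;  // assumed threshold
    bool always_true = false;
    int judge_counter = 0;

    // Called with a batch's (filtered, total) counts once the counter drains.
    void judge(uint64_t filtered, uint64_t total) {
        // a low filter ratio means the predicate barely filters anything:
        // mark it always-true and defer the next measurement
        always_true = total > 0 && static_cast<double>(filtered) / total < ignore_threshold;
        judge_counter = kJudgePeriod;
    }

    // Mirrors always_true(bool update): each batch ticks the counter down and,
    // once it reaches zero, forces a fresh measurement on the next batch.
    bool check(bool update) {
        if (update && --judge_counter == 0) {
            always_true = false;
        }
        return always_true;
    }
};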
uint64_t inverted_index_file_size = 0; for (int seg_id = 0; seg_id < dest_segment_num; ++seg_id) { auto inverted_index_file_writer = inverted_index_file_writers[seg_id].get(); if (Status st = inverted_index_file_writer->close(); !st.ok()) { status = Status::Error(st.msg()); } else { - inverted_index_file_size += inverted_index_file_writer->get_index_file_size(); + inverted_index_file_size += inverted_index_file_writer->get_index_file_total_size(); inverted_index_file_size -= compacted_idx_file_size[seg_id]; } + all_inverted_index_file_info[seg_id] = inverted_index_file_writer->get_index_file_info(); } // check index compaction status. If status is not ok, we should return error and end this compaction round. if (!status.ok()) { @@ -785,6 +788,7 @@ Status Compaction::do_inverted_index_compaction() { _output_rowset->rowset_meta()->set_index_disk_size(_output_rowset->index_disk_size() + inverted_index_file_size); + _output_rowset->rowset_meta()->update_inverted_index_files_info(all_inverted_index_file_info); COUNTER_UPDATE(_output_rowset_data_size_counter, _output_rowset->data_disk_size()); LOG(INFO) << "succeed to do index compaction" @@ -1183,13 +1187,10 @@ Status CloudCompactionMixin::execute_compact_impl(int64_t permits) { Status CloudCompactionMixin::execute_compact() { TEST_INJECTION_POINT("Compaction::do_compaction"); int64_t permits = get_compaction_permits(); - Status st = execute_compact_impl(permits); - if (!st.ok()) { - garbage_collection(); - return st; - } + HANDLE_EXCEPTION_IF_CATCH_EXCEPTION(execute_compact_impl(permits), + [&](const doris::Exception& ex) { garbage_collection(); }); _load_segment_to_cache(); - return st; + return Status::OK(); } Status CloudCompactionMixin::modify_rowsets() { @@ -1223,8 +1224,10 @@ Status CloudCompactionMixin::construct_output_rowset_writer(RowsetWriterContext& ctx.write_type = DataWriteType::TYPE_COMPACTION; auto compaction_policy = _tablet->tablet_meta()->compaction_policy(); - ctx.compaction_level = - _engine.cumu_compaction_policy(compaction_policy)->new_compaction_level(_input_rowsets); + if (_tablet->tablet_meta()->time_series_compaction_level_threshold() >= 2) { + ctx.compaction_level = _engine.cumu_compaction_policy(compaction_policy) + ->new_compaction_level(_input_rowsets); + } ctx.write_file_cache = compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION; ctx.file_cache_ttl_sec = _tablet->ttl_seconds(); diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 685d70f1e0b9aab..ece960f0250459d 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -246,14 +246,17 @@ class ComparisonPredicateBase : public ColumnPredicate { template __attribute__((flatten)) void _evaluate_vec_internal(const vectorized::IColumn& column, uint16_t size, bool* flags) const { - if (_can_ignore() && !_has_calculate_filter) { + uint16_t current_evaluated_rows = 0; + uint16_t current_passed_rows = 0; + if (_can_ignore()) { if (is_and) { for (uint16_t i = 0; i < size; i++) { - _evaluated_rows += flags[i]; + current_evaluated_rows += flags[i]; } } else { - _evaluated_rows += size; + current_evaluated_rows += size; } + _evaluated_rows += current_evaluated_rows; } if (column.is_nullable()) { @@ -336,13 +339,14 @@ class ComparisonPredicateBase : public ColumnPredicate { } } - if (_can_ignore() && !_has_calculate_filter) { + if (_can_ignore() && !_judge_counter) { for (uint16_t i = 0; i < size; i++) { - _passed_rows += flags[i]; + current_passed_rows += flags[i]; } - 
vectorized::VRuntimeFilterWrapper::calculate_filter( - get_ignore_threshold(), _evaluated_rows - _passed_rows, _evaluated_rows, - _has_calculate_filter, _always_true); + _passed_rows += current_passed_rows; + vectorized::VRuntimeFilterWrapper::judge_selectivity( + get_ignore_threshold(), current_evaluated_rows - current_passed_rows, + current_evaluated_rows, _always_true, _judge_counter); } } @@ -356,8 +360,7 @@ class ComparisonPredicateBase : public ColumnPredicate { _evaluate_vec_internal(column, size, flags); } - // todo: It may be necessary to set a more reasonable threshold - double get_ignore_threshold() const override { return 0.1; } + double get_ignore_threshold() const override { return get_comparison_ignore_thredhold(); } private: uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, @@ -449,12 +452,12 @@ class ComparisonPredicateBase : public ColumnPredicate { void _evaluate_bit(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const { if (column.is_nullable()) { - auto* nullable_column_ptr = + const auto* nullable_column_ptr = vectorized::check_and_get_column(column); - auto& nested_column = nullable_column_ptr->get_nested_column(); - auto& null_map = reinterpret_cast( - nullable_column_ptr->get_null_map_column()) - .get_data(); + const auto& nested_column = nullable_column_ptr->get_nested_column(); + const auto& null_map = reinterpret_cast( + nullable_column_ptr->get_null_map_column()) + .get_data(); _base_evaluate_bit(&nested_column, null_map.data(), sel, size, flags); } else { @@ -468,7 +471,7 @@ class ComparisonPredicateBase : public ColumnPredicate { const TArray* __restrict data_array, const TValue& value) const { //uint8_t helps compiler to generate vectorized code - uint8_t* flags = reinterpret_cast(bflags); + auto* flags = reinterpret_cast(bflags); if constexpr (is_and) { for (uint16_t i = 0; i < size; i++) { if constexpr (is_nullable) { @@ -514,9 +517,9 @@ class ComparisonPredicateBase : public ColumnPredicate { const uint16_t* sel, uint16_t size, bool* flags) const { if (column->is_column_dictionary()) { if constexpr (std::is_same_v) { - auto* dict_column_ptr = + const auto* dict_column_ptr = vectorized::check_and_get_column(column); - auto* data_array = dict_column_ptr->get_data().data(); + const auto* data_array = dict_column_ptr->get_data().data(); auto dict_code = _find_code_from_dictionary_column(*dict_column_ptr); _base_loop_bit(sel, size, flags, null_map, data_array, dict_code); @@ -540,10 +543,10 @@ class ComparisonPredicateBase : public ColumnPredicate { uint16_t* sel, uint16_t size) const { if (column->is_column_dictionary()) { if constexpr (std::is_same_v) { - auto* dict_column_ptr = + const auto* dict_column_ptr = vectorized::check_and_get_column(column); - auto& pred_col = dict_column_ptr->get_data(); - auto pred_col_data = pred_col.data(); + const auto& pred_col = dict_column_ptr->get_data(); + const auto* pred_col_data = pred_col.data(); auto dict_code = _find_code_from_dictionary_column(*dict_column_ptr); if constexpr (PT == PredicateType::EQ) { diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index 2c7e654787a650a..b762468b3455a47 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -24,6 +24,7 @@ #include "common/config.h" #include "common/logging.h" #include "olap/cumulative_compaction_policy.h" +#include "olap/cumulative_compaction_time_series_policy.h" #include "olap/olap_define.h" #include 
"olap/rowset/rowset_meta.h" #include "olap/tablet.h" @@ -35,16 +36,19 @@ namespace doris { using namespace ErrorCode; -namespace { - -void find_longest_consecutive_version(std::vector* rowsets, - std::vector* missing_version) { +void CumulativeCompaction::find_longest_consecutive_version(std::vector* rowsets, + std::vector* missing_version) { if (rowsets->empty()) { return; } RowsetSharedPtr prev_rowset = rowsets->front(); size_t i = 1; + int max_start = 0; + int max_length = 1; + + int start = 0; + int length = 1; for (; i < rowsets->size(); ++i) { RowsetSharedPtr rowset = (*rowsets)[i]; if (rowset->start_version() != prev_rowset->end_version() + 1) { @@ -52,16 +56,22 @@ void find_longest_consecutive_version(std::vector* rowsets, missing_version->push_back(prev_rowset->version()); missing_version->push_back(rowset->version()); } - break; + start = i; + length = 1; + } else { + length++; } + + if (length > max_length) { + max_start = start; + max_length = length; + } + prev_rowset = rowset; } - - rowsets->resize(i); + *rowsets = {rowsets->begin() + max_start, rowsets->begin() + max_start + max_length}; } -} // namespace - CumulativeCompaction::CumulativeCompaction(StorageEngine& engine, const TabletSharedPtr& tablet) : CompactionMixin(engine, tablet, "CumulativeCompaction:" + std::to_string(tablet->tablet_id())) {} @@ -100,16 +110,20 @@ Status CumulativeCompaction::execute_compact() { RETURN_IF_ERROR(CompactionMixin::execute_compact()); DCHECK_EQ(_state, CompactionState::SUCCESS); - - tablet()->cumulative_compaction_policy()->update_compaction_level(tablet(), _input_rowsets, - _output_rowset); + if (tablet()->tablet_meta()->time_series_compaction_level_threshold() >= 2) { + tablet()->cumulative_compaction_policy()->update_compaction_level(tablet(), _input_rowsets, + _output_rowset); + } tablet()->cumulative_compaction_policy()->update_cumulative_point( tablet(), _input_rowsets, _output_rowset, _last_delete_version); VLOG_CRITICAL << "after cumulative compaction, current cumulative point is " << tablet()->cumulative_layer_point() << ", tablet=" << _tablet->tablet_id(); - - tablet()->set_last_cumu_compaction_success_time(UnixMillis()); + // TIME_SERIES_POLICY, generating an empty rowset doesn't need to update the timestamp. + if (!(tablet()->tablet_meta()->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY && + _output_rowset->num_segments() == 0)) { + tablet()->set_last_cumu_compaction_success_time(UnixMillis()); + } DorisMetrics::instance()->cumulative_compaction_deltas_total->increment(_input_rowsets.size()); DorisMetrics::instance()->cumulative_compaction_bytes_total->increment(_input_rowsets_size); @@ -127,10 +141,11 @@ Status CumulativeCompaction::pick_rowsets_to_compact() { std::vector missing_versions; find_longest_consecutive_version(&candidate_rowsets, &missing_versions); if (!missing_versions.empty()) { - DCHECK(missing_versions.size() == 2); + DCHECK(missing_versions.size() % 2 == 0); LOG(WARNING) << "There are missed versions among rowsets. 
" - << "prev rowset verison=" << missing_versions[0] - << ", next rowset version=" << missing_versions[1] + << "total missed version size: " << missing_versions.size() / 2 + << " first missed version prev rowset verison=" << missing_versions[0] + << ", first missed version next rowset version=" << missing_versions[1] << ", tablet=" << _tablet->tablet_id(); } diff --git a/be/src/olap/cumulative_compaction.h b/be/src/olap/cumulative_compaction.h index 14527bf2faba608..276e3b3490311c6 100644 --- a/be/src/olap/cumulative_compaction.h +++ b/be/src/olap/cumulative_compaction.h @@ -44,6 +44,9 @@ class CumulativeCompaction final : public CompactionMixin { Status pick_rowsets_to_compact(); + void find_longest_consecutive_version(std::vector* rowsets, + std::vector* missing_version); + Version _last_delete_version {-1, -1}; }; diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp index 640c1aa4f81d392..4070bd1dd4340e3 100644 --- a/be/src/olap/data_dir.cpp +++ b/be/src/olap/data_dir.cpp @@ -1006,6 +1006,7 @@ void DataDir::perform_remote_rowset_gc() { auto st = fs->batch_delete(seg_paths); if (st.ok()) { deleted_keys.push_back(std::move(key)); + unused_remote_rowset_num << -1; } else { LOG(WARNING) << "failed to delete remote rowset. err=" << st; } diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 6819d7d90f3ef73..10c6f50b30098d8 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -21,8 +21,6 @@ #include #include -#include -#include #include #include @@ -40,12 +38,10 @@ using apache::thrift::ThriftDebugString; using std::vector; using std::string; -using std::stringstream; using ::google::protobuf::RepeatedPtrField; namespace doris { -using namespace ErrorCode; // construct sub condition from TCondition std::string construct_sub_predicate(const TCondition& condition) { @@ -124,18 +120,20 @@ Status DeleteHandler::generate_delete_predicate(const TabletSchema& schema, } else { // write sub predicate v1 for compactbility std::string condition_str = construct_sub_predicate(condition); - if (TCondition tmp; !DeleteHandler::parse_condition(condition_str, &tmp)) { - LOG(WARNING) << "failed to parse condition_str, condtion=" - << ThriftDebugString(condition); - return Status::Error( - "failed to parse condition_str, condtion={}", ThriftDebugString(condition)); - } VLOG_NOTICE << __PRETTY_FUNCTION__ << " condition_str: " << condition_str; del_pred->add_sub_predicates(condition_str); DeleteSubPredicatePB* sub_predicate = del_pred->add_sub_predicates_v2(); if (condition.__isset.column_unique_id) { + // only light schema change capable table set this field sub_predicate->set_column_unique_id(condition.column_unique_id); + } else if (TCondition tmp; !DeleteHandler::parse_condition(condition_str, &tmp)) { + // for non light shema change tables, check regex match for condition str + LOG(WARNING) << "failed to parse condition_str, condtion=" + << ThriftDebugString(condition); + return Status::Error( + "failed to parse condition_str, condtion={}", ThriftDebugString(condition)); } + sub_predicate->set_column_name(condition.column_name); sub_predicate->set_op(trans_op(condition.condition_op)); sub_predicate->set_cond_value(condition.condition_values[0]); @@ -312,38 +310,35 @@ Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCon // value: matches "1597751948193618247 and length(source)<1;\n;\n" // // For more info, see DeleteHandler::construct_sub_predicates -// FIXME(gavin): support unicode. 
And this is a tricky implementation, it should -not be the final resolution, refactor it. +// FIXME(gavin): This is a tricky implementation, it should not be the final resolution, refactor it. const char* const CONDITION_STR_PATTERN = - // .----------------- column-name ----------------. .----------------------- operator ------------------------. .------------ value ----------. - R"(([_a-zA-Z@0-9\s/][.a-zA-Z0-9_+-/?@#$%^&*"\s,:]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; - // '----------------- group 1 --------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | - // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' - // match **ANY THING** without(4) - // or with(3) single quote -boost::regex DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); + // .----------------- column-name --------------------------. .----------------------- operator ------------------------. .------------ value ----------. + R"(([_a-zA-Z@0-9\s/\p{L}][.a-zA-Z0-9_+-/?@#$%^&*"\s,:\p{L}]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; + // '----------------- group 1 ------------------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | + // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' + // match **ANY THING** without(4) + // or with(3) single quote // clang-format on +RE2 DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) { - bool matched = false; - boost::smatch what; - try { - VLOG_NOTICE << "condition_str: " << condition_str; - matched = boost::regex_match(condition_str, what, DELETE_HANDLER_REGEX) && - condition_str.size() == what[0].str().size(); // exact match - } catch (boost::regex_error& e) { - VLOG_NOTICE << "fail to parse expr. [expr=" << condition_str << "; error=" << e.what() - << "]"; - } + std::string col_name, op, value, g4; + + bool matched = RE2::FullMatch(condition_str, DELETE_HANDLER_REGEX, &col_name, &op, &value, + &g4); // exact match + if (!matched) { - return Status::Error("fail to sub condition. condition={}", - condition_str); + return Status::InvalidArgument("failed to parse sub condition. condition={}", condition_str); } - condition->column_name = what[1].str(); - condition->condition_op = what[2].str() == " IS " ? "IS" : what[2].str(); + condition->column_name = col_name; + condition->condition_op = op == " IS " ?
"IS" : op; // match string with single quotes, a = b or a = 'b' - condition->condition_values.push_back(what[3 + !!what[4].matched].str()); + if (!g4.empty()) { + condition->condition_values.push_back(g4); + } else { + condition->condition_values.push_back(value); + } VLOG_NOTICE << "parsed condition_str: col_name={" << condition->column_name << "} op={" << condition->condition_op << "} val={" << condition->condition_values.back() << "}"; diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index 7cffd0238c8c47c..bd91fe147fbb438 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -340,7 +340,9 @@ class InListPredicateBase : public ColumnPredicate { return PT == PredicateType::IN_LIST && !ngram; } - double get_ignore_threshold() const override { return std::log2(_values->size() + 1) / 64; } + double get_ignore_threshold() const override { + return get_in_list_ignore_thredhold(_values->size()); + } private: bool _can_ignore() const override { return _values->is_runtime_filter(); } diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 0332e3f2e319d78..683e38775f34c2b 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -51,9 +51,9 @@ Status MatchPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& nam if (iterator == nullptr) { return Status::OK(); } - if (_skip_evaluate(iterator)) { - return Status::Error( - "match predicate evaluate skipped."); + if (_check_evaluate(iterator)) { + return Status::Error( + "phrase queries require setting support_phrase = true"); } auto type = name_with_type.second; const std::string& name = name_with_type.first; @@ -122,13 +122,14 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m return ret; } -bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { - if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || - _match_type == MatchType::MATCH_PHRASE_EDGE) && - iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && - get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == - INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { - return true; +bool MatchPredicate::_check_evaluate(InvertedIndexIterator* iterator) const { + if (_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || + _match_type == MatchType::MATCH_PHRASE_EDGE) { + if (iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && + get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { + return true; + } } return false; } diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index 17d8e76ac88e11c..ad202b7b2427cf4 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -79,7 +79,7 @@ class MatchPredicate : public ColumnPredicate { std::string info = "MatchPredicate"; return info; } - bool _skip_evaluate(InvertedIndexIterator* iterator) const; + bool _check_evaluate(InvertedIndexIterator* iterator) const; private: std::string _value; diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index 6d7143a293c1f25..cfcac0bcb8f8b37 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -140,6 +140,7 @@ Status FlushToken::_do_flush_memtable(MemTable* memtable, int32_t 
segment_id, in SCOPED_RAW_TIMER(&duration_ns); SCOPED_ATTACH_TASK(memtable->query_thread_context()); signal::set_signal_task_id(_rowset_writer->load_id()); + signal::tablet_id = memtable->tablet_id(); { SCOPED_CONSUME_MEM_TRACKER(memtable->flush_mem_tracker()); std::unique_ptr block; diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index ad70241ad87cea9..87792db93a6645e 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -336,16 +336,12 @@ Status Merger::vertical_compact_one_group( } // for segcompaction -Status Merger::vertical_compact_one_group(int64_t tablet_id, ReaderType reader_type, - const TabletSchema& tablet_schema, bool is_key, - const std::vector& column_group, - vectorized::RowSourcesBuffer* row_source_buf, - vectorized::VerticalBlockReader& src_block_reader, - segment_v2::SegmentWriter& dst_segment_writer, - int64_t max_rows_per_segment, Statistics* stats_output, - uint64_t* index_size, KeyBoundsPB& key_bounds) { - // build tablet reader - VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment; +Status Merger::vertical_compact_one_group( + int64_t tablet_id, ReaderType reader_type, const TabletSchema& tablet_schema, bool is_key, + const std::vector& column_group, vectorized::RowSourcesBuffer* row_source_buf, + vectorized::VerticalBlockReader& src_block_reader, + segment_v2::SegmentWriter& dst_segment_writer, Statistics* stats_output, + uint64_t* index_size, KeyBoundsPB& key_bounds, SimpleRowIdConversion* rowid_conversion) { // TODO: record_rowids vectorized::Block block = tablet_schema.create_block(column_group); size_t output_rows = 0; @@ -362,6 +358,9 @@ Status Merger::vertical_compact_one_group(int64_t tablet_id, ReaderType reader_t "failed to write block when merging rowsets of tablet " + std::to_string(tablet_id)); + if (is_key && rowid_conversion != nullptr) { + rowid_conversion->add(src_block_reader.current_block_row_locations()); + } output_rows += block.rows(); block.clear_column_data(); } diff --git a/be/src/olap/merger.h b/be/src/olap/merger.h index 7513c90fbd1217d..cb05162b3bc9a1e 100644 --- a/be/src/olap/merger.h +++ b/be/src/olap/merger.h @@ -23,6 +23,7 @@ #include "io/io_common.h" #include "olap/iterators.h" #include "olap/rowset/rowset_fwd.h" +#include "olap/simple_rowid_conversion.h" #include "olap/tablet_fwd.h" namespace doris { @@ -82,8 +83,9 @@ class Merger { vectorized::RowSourcesBuffer* row_source_buf, vectorized::VerticalBlockReader& src_block_reader, segment_v2::SegmentWriter& dst_segment_writer, - int64_t max_rows_per_segment, Statistics* stats_output, - uint64_t* index_size, KeyBoundsPB& key_bounds); + Statistics* stats_output, uint64_t* index_size, + KeyBoundsPB& key_bounds, + SimpleRowIdConversion* rowid_conversion); }; } // namespace doris diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index 5d188f40caf67ac..dac1750c24b54f2 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "io/io_common.h" #include "olap/olap_define.h" @@ -57,6 +58,7 @@ enum CompactionType { BASE_COMPACTION = 1, CUMULATIVE_COMPACTION = 2, FULL_COMPA enum DataDirType { SPILL_DISK_DIR, OLAP_DATA_DIR, + DATA_CACHE_DIR, }; struct DataDirInfo { @@ -70,6 +72,7 @@ struct DataDirInfo { bool is_used = false; // whether available mark TStorageMedium::type storage_medium = TStorageMedium::HDD; // Storage medium type: SSD|HDD DataDirType data_dir_type = DataDirType::OLAP_DATA_DIR; + std::string bvar_name; }; struct 
PredicateFilterInfo { int type = 0; @@ -367,10 +370,13 @@ struct OlapReaderStatistics { int64_t inverted_index_query_timer = 0; int64_t inverted_index_query_cache_hit = 0; int64_t inverted_index_query_cache_miss = 0; + int64_t inverted_index_query_null_bitmap_timer = 0; int64_t inverted_index_query_bitmap_copy_timer = 0; int64_t inverted_index_query_bitmap_op_timer = 0; int64_t inverted_index_searcher_open_timer = 0; int64_t inverted_index_searcher_search_timer = 0; + int64_t inverted_index_searcher_cache_hit = 0; + int64_t inverted_index_searcher_cache_miss = 0; int64_t output_index_result_column_timer = 0; // number of segment filtered by column stat when creating seg iterator @@ -503,12 +509,12 @@ class DeleteBitmap; // merge on write context struct MowContext { MowContext(int64_t version, int64_t txnid, const RowsetIdUnorderedSet& ids, - const std::vector& rowset_ptrs, std::shared_ptr db) + std::vector rowset_ptrs, std::shared_ptr db) : max_version(version), txn_id(txnid), rowset_ids(ids), - rowset_ptrs(rowset_ptrs), - delete_bitmap(db) {} + rowset_ptrs(std::move(rowset_ptrs)), + delete_bitmap(std::move(db)) {} int64_t max_version; int64_t txn_id; const RowsetIdUnorderedSet& rowset_ids; @@ -516,15 +522,6 @@ struct MowContext { std::shared_ptr delete_bitmap; }; -// used in mow partial update -struct RidAndPos { - uint32_t rid; - // pos in block - size_t pos; -}; - -using PartialUpdateReadPlan = std::map>>; - // used for controll compaction struct VersionWithTime { std::atomic version; diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp index 345c1bed4ff21f9..7c88156f74cef5b 100644 --- a/be/src/olap/olap_server.cpp +++ b/be/src/olap/olap_server.cpp @@ -301,12 +301,6 @@ Status StorageEngine::start_bg_threads() { [this]() { this->_tablet_path_check_callback(); }, &_tablet_path_check_thread)); LOG(INFO) << "tablet path check thread started"; - // cache clean thread - RETURN_IF_ERROR(Thread::create( - "StorageEngine", "cache_clean_thread", [this]() { this->_cache_clean_callback(); }, - &_cache_clean_thread)); - LOG(INFO) << "cache clean thread started"; - // path scan and gc thread if (config::path_gc_check) { for (auto data_dir : get_stores()) { @@ -359,42 +353,6 @@ Status StorageEngine::start_bg_threads() { return Status::OK(); } -void StorageEngine::_cache_clean_callback() { - int32_t interval = config::cache_periodic_prune_stale_sweep_sec; - while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { - if (interval <= 0) { - LOG(WARNING) << "config of cache clean interval is illegal: [" << interval - << "], force set to 3600 "; - interval = 3600; - } - if (config::disable_memory_gc) { - continue; - } - - CacheManager::instance()->for_each_cache_prune_stale(); - - // Dynamically modify the config to clear the cache, each time the disable cache will only be cleared once. 
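The MowContext constructor above now takes its vector and shared_ptr parameters by value and moves them into the members. This is the usual sink-argument idiom: callers passing rvalues pay a move instead of a copy, and lvalue callers pay exactly one copy. A minimal illustration with stand-in types:

#include <memory>
#include <utility>
#include <vector>

struct Ctx {
    // by-value parameters act as "sinks": cheap to move into the members
    Ctx(std::vector<int> rowsets, std::shared_ptr<int> bitmap)
            : rowsets(std::move(rowsets)), bitmap(std::move(bitmap)) {}
    std::vector<int> rowsets;
    std::shared_ptr<int> bitmap;
};

// usage: Ctx ctx(std::move(my_rowsets), my_bitmap);  // one move, one ref-count bump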
- if (config::disable_segment_cache) { - if (!_clear_segment_cache) { - CacheManager::instance()->clear_once(CachePolicy::CacheType::SEGMENT_CACHE); - _clear_segment_cache = true; - } - } else { - _clear_segment_cache = false; - } - if (config::disable_storage_page_cache) { - if (!_clear_page_cache) { - CacheManager::instance()->clear_once(CachePolicy::CacheType::DATA_PAGE_CACHE); - CacheManager::instance()->clear_once(CachePolicy::CacheType::INDEXPAGE_CACHE); - CacheManager::instance()->clear_once(CachePolicy::CacheType::PK_INDEX_PAGE_CACHE); - _clear_page_cache = true; - } - } else { - _clear_page_cache = false; - } - } -} - void StorageEngine::_garbage_sweeper_thread_callback() { uint32_t max_interval = config::max_garbage_sweep_interval; uint32_t min_interval = config::min_garbage_sweep_interval; diff --git a/be/src/olap/parallel_scanner_builder.cpp b/be/src/olap/parallel_scanner_builder.cpp index ac57448ade7e6db..10bd61cd8d5d4b6 100644 --- a/be/src/olap/parallel_scanner_builder.cpp +++ b/be/src/olap/parallel_scanner_builder.cpp @@ -179,13 +179,17 @@ Status ParallelScannerBuilder::_load() { RETURN_IF_ERROR(tablet->capture_consistent_rowsets_unlocked({0, version}, &rowsets)); } + bool enable_segment_cache = _state->query_options().__isset.enable_segment_cache + ? _state->query_options().enable_segment_cache + : true; for (auto& rowset : rowsets) { RETURN_IF_ERROR(rowset->load()); const auto rowset_id = rowset->rowset_id(); auto& segment_cache_handle = _segment_cache_handles[rowset_id]; RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( - std::dynamic_pointer_cast(rowset), &segment_cache_handle, true)); + std::dynamic_pointer_cast(rowset), &segment_cache_handle, + enable_segment_cache, false)); _total_rows += rowset->num_rows(); } } @@ -204,4 +208,4 @@ std::shared_ptr ParallelScannerBuilder::_build_scanner( return NewOlapScanner::create_shared(_parent, std::move(params)); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/olap/partial_update_info.cpp b/be/src/olap/partial_update_info.cpp new file mode 100644 index 000000000000000..bff3f4196369db2 --- /dev/null +++ b/be/src/olap/partial_update_info.cpp @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/partial_update_info.h" + +#include + +#include "olap/base_tablet.h" +#include "olap/olap_common.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_writer_context.h" +#include "olap/tablet_schema.h" +#include "olap/utils.h" +#include "vec/common/assert_cast.h" +#include "vec/core/block.h" + +namespace doris { + +void PartialUpdateInfo::init(const TabletSchema& tablet_schema, bool partial_update, + const std::set& partial_update_cols, bool is_strict_mode, + int64_t timestamp_ms, const std::string& timezone, + const std::string& auto_increment_column, int64_t cur_max_version) { + is_partial_update = partial_update; + partial_update_input_columns = partial_update_cols; + max_version_in_flush_phase = cur_max_version; + this->timestamp_ms = timestamp_ms; + this->timezone = timezone; + missing_cids.clear(); + update_cids.clear(); + for (auto i = 0; i < tablet_schema.num_columns(); ++i) { + auto tablet_column = tablet_schema.column(i); + if (!partial_update_input_columns.contains(tablet_column.name())) { + missing_cids.emplace_back(i); + if (!tablet_column.has_default_value() && !tablet_column.is_nullable() && + tablet_schema.auto_increment_column() != tablet_column.name()) { + can_insert_new_rows_in_partial_update = false; + } + } else { + update_cids.emplace_back(i); + } + if (auto_increment_column == tablet_column.name()) { + is_schema_contains_auto_inc_column = true; + } + } + this->is_strict_mode = is_strict_mode; + is_input_columns_contains_auto_inc_column = + is_partial_update && partial_update_input_columns.contains(auto_increment_column); + _generate_default_values_for_missing_cids(tablet_schema); +} + +void PartialUpdateInfo::to_pb(PartialUpdateInfoPB* partial_update_info_pb) const { + partial_update_info_pb->set_is_partial_update(is_partial_update); + partial_update_info_pb->set_max_version_in_flush_phase(max_version_in_flush_phase); + for (const auto& col : partial_update_input_columns) { + partial_update_info_pb->add_partial_update_input_columns(col); + } + for (auto cid : missing_cids) { + partial_update_info_pb->add_missing_cids(cid); + } + for (auto cid : update_cids) { + partial_update_info_pb->add_update_cids(cid); + } + partial_update_info_pb->set_can_insert_new_rows_in_partial_update( + can_insert_new_rows_in_partial_update); + partial_update_info_pb->set_is_strict_mode(is_strict_mode); + partial_update_info_pb->set_timestamp_ms(timestamp_ms); + partial_update_info_pb->set_timezone(timezone); + partial_update_info_pb->set_is_input_columns_contains_auto_inc_column( + is_input_columns_contains_auto_inc_column); + partial_update_info_pb->set_is_schema_contains_auto_inc_column( + is_schema_contains_auto_inc_column); + for (const auto& value : default_values) { + partial_update_info_pb->add_default_values(value); + } +} + +void PartialUpdateInfo::from_pb(PartialUpdateInfoPB* partial_update_info_pb) { + is_partial_update = partial_update_info_pb->is_partial_update(); + max_version_in_flush_phase = partial_update_info_pb->has_max_version_in_flush_phase() + ? 
partial_update_info_pb->max_version_in_flush_phase() + : -1; + partial_update_input_columns.clear(); + for (const auto& col : partial_update_info_pb->partial_update_input_columns()) { + partial_update_input_columns.insert(col); + } + missing_cids.clear(); + for (auto cid : partial_update_info_pb->missing_cids()) { + missing_cids.push_back(cid); + } + update_cids.clear(); + for (auto cid : partial_update_info_pb->update_cids()) { + update_cids.push_back(cid); + } + can_insert_new_rows_in_partial_update = + partial_update_info_pb->can_insert_new_rows_in_partial_update(); + is_strict_mode = partial_update_info_pb->is_strict_mode(); + timestamp_ms = partial_update_info_pb->timestamp_ms(); + timezone = partial_update_info_pb->timezone(); + is_input_columns_contains_auto_inc_column = + partial_update_info_pb->is_input_columns_contains_auto_inc_column(); + is_schema_contains_auto_inc_column = + partial_update_info_pb->is_schema_contains_auto_inc_column(); + default_values.clear(); + for (const auto& value : partial_update_info_pb->default_values()) { + default_values.push_back(value); + } +} + +std::string PartialUpdateInfo::summary() const { + return fmt::format( + "update_cids={}, missing_cids={}, is_strict_mode={}, max_version_in_flush_phase={}", + update_cids.size(), missing_cids.size(), is_strict_mode, max_version_in_flush_phase); +} + +void PartialUpdateInfo::_generate_default_values_for_missing_cids( + const TabletSchema& tablet_schema) { + for (unsigned int cur_cid : missing_cids) { + const auto& column = tablet_schema.column(cur_cid); + if (column.has_default_value()) { + std::string default_value; + if (UNLIKELY(column.type() == FieldType::OLAP_FIELD_TYPE_DATETIMEV2 && + to_lower(column.default_value()).find(to_lower("CURRENT_TIMESTAMP")) != + std::string::npos)) { + DateV2Value dtv; + dtv.from_unixtime(timestamp_ms / 1000, timezone); + default_value = dtv.debug_string(); + } else if (UNLIKELY(column.type() == FieldType::OLAP_FIELD_TYPE_DATEV2 && + to_lower(column.default_value()).find(to_lower("CURRENT_DATE")) != + std::string::npos)) { + DateV2Value dv; + dv.from_unixtime(timestamp_ms / 1000, timezone); + default_value = dv.debug_string(); + } else { + default_value = column.default_value(); + } + default_values.emplace_back(default_value); + } else { + // place an empty string here + default_values.emplace_back(); + } + } + CHECK_EQ(missing_cids.size(), default_values.size()); +} + +void PartialUpdateReadPlan::prepare_to_read(const RowLocation& row_location, size_t pos) { + plan[row_location.rowset_id][row_location.segment_id].emplace_back(row_location.row_id, pos); +} + +// read columns by read plan +// read_index: ori_pos-> block_idx +Status PartialUpdateReadPlan::read_columns_by_plan( + const TabletSchema& tablet_schema, const std::vector cids_to_read, + const std::map& rsid_to_rowset, vectorized::Block& block, + std::map* read_index, const signed char* __restrict skip_map) const { + bool has_row_column = tablet_schema.has_row_store_for_all_columns(); + auto mutable_columns = block.mutate_columns(); + size_t read_idx = 0; + for (const auto& [rowset_id, segment_row_mappings] : plan) { + for (const auto& [segment_id, mappings] : segment_row_mappings) { + auto rowset_iter = rsid_to_rowset.find(rowset_id); + CHECK(rowset_iter != rsid_to_rowset.end()); + std::vector rids; + for (auto [rid, pos] : mappings) { + if (skip_map && skip_map[pos]) { + continue; + } + rids.emplace_back(rid); + (*read_index)[pos] = read_idx++; + } + if (has_row_column) { + auto st = 
doris::BaseTablet::fetch_value_through_row_column( + rowset_iter->second, tablet_schema, segment_id, rids, cids_to_read, block); + if (!st.ok()) { + LOG(WARNING) << "failed to fetch value through row column"; + return st; + } + continue; + } + for (size_t cid = 0; cid < mutable_columns.size(); ++cid) { + TabletColumn tablet_column = tablet_schema.column(cids_to_read[cid]); + auto st = doris::BaseTablet::fetch_value_by_rowids( + rowset_iter->second, segment_id, rids, tablet_column, mutable_columns[cid]); + // set read value to output block + if (!st.ok()) { + LOG(WARNING) << "failed to fetch value"; + return st; + } + } + } + } + block.set_columns(std::move(mutable_columns)); + return Status::OK(); +} + +Status PartialUpdateReadPlan::fill_missing_columns( + RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, vectorized::Block& full_block, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const size_t& segment_start_pos, const vectorized::Block* block) const { + auto mutable_full_columns = full_block.mutate_columns(); + // create old value columns + const auto& missing_cids = rowset_ctx->partial_update_info->missing_cids; + auto old_value_block = tablet_schema.create_block_by_cids(missing_cids); + CHECK_EQ(missing_cids.size(), old_value_block.columns()); + + // record real pos, key is input line num, value is old_block line num + std::map read_index; + RETURN_IF_ERROR(read_columns_by_plan(tablet_schema, missing_cids, rsid_to_rowset, + old_value_block, &read_index, nullptr)); + + const auto* delete_sign_column_data = BaseTablet::get_delete_sign_column_data(old_value_block); + + // build default value columns + auto default_value_block = old_value_block.clone_empty(); + if (has_default_or_nullable || delete_sign_column_data != nullptr) { + RETURN_IF_ERROR(BaseTablet::generate_default_value_block( + tablet_schema, missing_cids, rowset_ctx->partial_update_info->default_values, + old_value_block, default_value_block)); + } + auto mutable_default_value_columns = default_value_block.mutate_columns(); + + // fill all missing value from mutable_old_columns, need to consider default value and null value + for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { + // `use_default_or_null_flag[idx] == false` doesn't mean that we should read values from the old row + // for the missing columns. For example, if a table has sequence column, the rows with DELETE_SIGN column + // marked will not be marked in delete bitmap(see https://github.com/apache/doris/pull/24011), so it will + // be found in Tablet::lookup_row_key() and `use_default_or_null_flag[idx]` will be false. But we should not + // read values from old rows for missing values in this occasion. So we should read the DELETE_SIGN column + // to check if a row REALLY exists in the table. 
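PartialUpdateReadPlan::read_columns_by_plan above visits rows grouped first by rowset and then by segment, so each segment is opened only once, while read_index records where each fetched row lands in the output block. A simplified model of that bookkeeping with plain STL types (the real plan maps RowsetId to segment id to RidAndPos entries):

#include <cstdint>
#include <map>
#include <vector>

struct RidAndPos {
    uint32_t rid;  // row id inside the segment
    size_t pos;    // destination row in the output block
};

using ReadPlan = std::map<int64_t /*rowset*/, std::map<uint32_t /*segment*/, std::vector<RidAndPos>>>;

std::map<uint32_t, uint32_t> build_read_index(const ReadPlan& plan,
                                              const signed char* skip_map /*nullable*/) {
    std::map<uint32_t, uint32_t> read_index;  // output pos -> index in fetch order
    uint32_t read_idx = 0;
    for (const auto& [rowset_id, segments] : plan) {
        for (const auto& [segment_id, rows] : segments) {
            for (const auto& [rid, pos] : rows) {
                if (skip_map && skip_map[pos]) continue;  // delete-signed new row: skip the read
                read_index[pos] = read_idx++;
            }
        }
    }
    return read_index;
}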
+ auto pos_in_old_block = read_index[idx + segment_start_pos]; + if (use_default_or_null_flag[idx] || (delete_sign_column_data != nullptr && + delete_sign_column_data[pos_in_old_block] != 0)) { + for (auto i = 0; i < missing_cids.size(); ++i) { + // if the column has a default value, fill it with the default value; + // otherwise, if the column is nullable, fill it with null + const auto& tablet_column = tablet_schema.column(missing_cids[i]); + auto& missing_col = mutable_full_columns[missing_cids[i]]; + // clang-format off + if (tablet_column.has_default_value()) { + missing_col->insert_from(*mutable_default_value_columns[i].get(), 0); + } else if (tablet_column.is_nullable()) { + auto* nullable_column = + assert_cast(missing_col.get()); + nullable_column->insert_null_elements(1); + } else if (tablet_schema.auto_increment_column() == tablet_column.name()) { + const auto& column = + *DORIS_TRY(rowset_ctx->tablet_schema->column(tablet_column.name())); + DCHECK(column.type() == FieldType::OLAP_FIELD_TYPE_BIGINT); + auto* auto_inc_column = + assert_cast(missing_col.get()); + auto_inc_column->insert( + (assert_cast( + block->get_by_name("__PARTIAL_UPDATE_AUTO_INC_COLUMN__").column.get()))->get_element(idx)); + } else { + // If the control flow reaches this branch, the column neither has a default value + // nor is nullable. It means that the row's delete sign is marked, and the value + // columns are useless and won't be read. So we can just put arbitrary values in the cells + missing_col->insert_default(); + } + // clang-format on + } + continue; + } + for (auto i = 0; i < missing_cids.size(); ++i) { + mutable_full_columns[missing_cids[i]]->insert_from( + *old_value_block.get_columns_with_type_and_name()[i].column.get(), + pos_in_old_block); + } + } + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/olap/partial_update_info.h b/be/src/olap/partial_update_info.h index f20f9680b0b57a5..a99bf7181184f40 100644 --- a/be/src/olap/partial_update_info.h +++ b/be/src/olap/partial_update_info.h @@ -16,81 +16,41 @@ // under the License.
#pragma once +#include +#include +#include +#include +#include -#include "olap/tablet_schema.h" +#include "common/status.h" +#include "olap/rowset/rowset_fwd.h" +#include "olap/tablet_fwd.h" namespace doris { +class TabletSchema; +class PartialUpdateInfoPB; +struct RowLocation; +namespace vectorized { +class Block; +} +struct RowsetWriterContext; +struct RowsetId; struct PartialUpdateInfo { void init(const TabletSchema& tablet_schema, bool partial_update, - const std::set& partial_update_cols, bool is_strict_mode, + const std::set& partial_update_cols, bool is_strict_mode, int64_t timestamp_ms, const std::string& timezone, - const std::string& auto_increment_column) { - is_partial_update = partial_update; - partial_update_input_columns = partial_update_cols; - - this->timestamp_ms = timestamp_ms; - this->timezone = timezone; - missing_cids.clear(); - update_cids.clear(); - for (auto i = 0; i < tablet_schema.num_columns(); ++i) { - auto tablet_column = tablet_schema.column(i); - if (!partial_update_input_columns.contains(tablet_column.name())) { - missing_cids.emplace_back(i); - if (!tablet_column.has_default_value() && !tablet_column.is_nullable() && - tablet_schema.auto_increment_column() != tablet_column.name()) { - can_insert_new_rows_in_partial_update = false; - } - } else { - update_cids.emplace_back(i); - } - if (auto_increment_column == tablet_column.name()) { - is_schema_contains_auto_inc_column = true; - } - } - this->is_strict_mode = is_strict_mode; - is_input_columns_contains_auto_inc_column = - is_partial_update && partial_update_input_columns.contains(auto_increment_column); - _generate_default_values_for_missing_cids(tablet_schema); - } + const std::string& auto_increment_column, int64_t cur_max_version = -1); + void to_pb(PartialUpdateInfoPB* partial_update_info) const; + void from_pb(PartialUpdateInfoPB* partial_update_info); + std::string summary() const; private: - void _generate_default_values_for_missing_cids(const TabletSchema& tablet_schema) { - for (auto i = 0; i < missing_cids.size(); ++i) { - auto cur_cid = missing_cids[i]; - const auto& column = tablet_schema.column(cur_cid); - if (column.has_default_value()) { - std::string default_value; - if (UNLIKELY(tablet_schema.column(cur_cid).type() == - FieldType::OLAP_FIELD_TYPE_DATETIMEV2 && - to_lower(tablet_schema.column(cur_cid).default_value()) - .find(to_lower("CURRENT_TIMESTAMP")) != - std::string::npos)) { - DateV2Value dtv; - dtv.from_unixtime(timestamp_ms / 1000, timezone); - default_value = dtv.debug_string(); - } else if (UNLIKELY(tablet_schema.column(cur_cid).type() == - FieldType::OLAP_FIELD_TYPE_DATEV2 && - to_lower(tablet_schema.column(cur_cid).default_value()) - .find(to_lower("CURRENT_DATE")) != - std::string::npos)) { - DateV2Value dv; - dv.from_unixtime(timestamp_ms / 1000, timezone); - default_value = dv.debug_string(); - } else { - default_value = tablet_schema.column(cur_cid).default_value(); - } - default_values.emplace_back(default_value); - } else { - // place an empty string here - default_values.emplace_back(); - } - } - CHECK_EQ(missing_cids.size(), default_values.size()); - } + void _generate_default_values_for_missing_cids(const TabletSchema& tablet_schema); public: bool is_partial_update {false}; + int64_t max_version_in_flush_phase {-1}; std::set partial_update_input_columns; std::vector missing_cids; std::vector update_cids; @@ -106,4 +66,31 @@ struct PartialUpdateInfo { // default values for missing cids std::vector default_values; }; + +// used in mow partial update +struct RidAndPos { 
+ uint32_t rid; + // pos in block + size_t pos; +}; + +class PartialUpdateReadPlan { +public: + void prepare_to_read(const RowLocation& row_location, size_t pos); + Status read_columns_by_plan(const TabletSchema& tablet_schema, + const std::vector cids_to_read, + const std::map& rsid_to_rowset, + vectorized::Block& block, std::map* read_index, + const signed char* __restrict skip_map = nullptr) const; + Status fill_missing_columns(RowsetWriterContext* rowset_ctx, + const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, vectorized::Block& full_block, + const std::vector& use_default_or_null_flag, + bool has_default_or_nullable, const size_t& segment_start_pos, + const vectorized::Block* block) const; + +private: + std::map>> plan; +}; + } // namespace doris diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index 6d917c78d956ccc..b269051e43f4558 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -178,8 +178,9 @@ Status BetaRowset::load_segment(int64_t seg_id, segment_v2::SegmentSharedPtr* se .cache_base_path = "", .file_size = _rowset_meta->segment_file_size(seg_id), }; + auto s = segment_v2::Segment::open(fs, seg_path, seg_id, rowset_id(), _schema, reader_options, - segment); + segment, _rowset_meta->inverted_index_file_info(seg_id)); if (!s.ok()) { LOG(WARNING) << "failed to open segment. " << seg_path << " under rowset " << rowset_id() << " : " << s.to_string(); @@ -537,8 +538,10 @@ Status BetaRowset::check_current_rowset_segment() { .cache_base_path {}, .file_size = _rowset_meta->segment_file_size(seg_id), }; + auto s = segment_v2::Segment::open(fs, seg_path, seg_id, rowset_id(), _schema, - reader_options, &segment); + reader_options, &segment, + _rowset_meta->inverted_index_file_info(seg_id)); if (!s.ok()) { LOG(WARNING) << "segment can not be opened. file=" << seg_path; return s; diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 458b3d29547062f..42456bb862502d6 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -249,10 +249,20 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context } // load segments - bool should_use_cache = use_cache || _read_context->reader_type == ReaderType::READER_QUERY; + bool enable_segment_cache = true; + auto* state = read_context->runtime_state; + if (state != nullptr) { + enable_segment_cache = state->query_options().__isset.enable_segment_cache + ? state->query_options().enable_segment_cache + : true; + } + // When reader type is for query, session variable `enable_segment_cache` should be respected. 
+ bool should_use_cache = use_cache || (_read_context->reader_type == ReaderType::READER_QUERY && + enable_segment_cache); SegmentCacheHandle segment_cache_handle; RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(_rowset, &segment_cache_handle, - should_use_cache)); + should_use_cache, + /*need_load_pk_index_and_bf*/ false)); // create iterator for each segment auto& segments = segment_cache_handle.get_segments(); diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp index 1f07ce098fa25a9..45f260bdfa15ef7 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -195,7 +195,7 @@ BaseBetaRowsetWriter::BaseBetaRowsetWriter() _num_rows_written(0), _total_data_size(0), _total_index_size(0), - _segment_creator(_context, _seg_files) {} + _segment_creator(_context, _seg_files, _idx_files_info) {} BetaRowsetWriter::BetaRowsetWriter(StorageEngine& engine) : _engine(engine), _segcompaction_worker(std::make_shared<SegcompactionWorker>(this)) {} @@ -341,7 +341,12 @@ Status BetaRowsetWriter::_find_longest_consecutive_small_segment( if (is_large_segment) { if (segid == _segcompacted_point) { // skip large segments at the front + auto dst_seg_id = _num_segcompacted.load(); RETURN_IF_ERROR(_rename_compacted_segment_plain(_segcompacted_point++)); + if (_segcompaction_worker->need_convert_delete_bitmap()) { + _segcompaction_worker->convert_segment_delete_bitmap( + _context.mow_context->delete_bitmap, segid, dst_seg_id); + } continue; } else { // stop because we need consecutive segments @@ -366,7 +371,13 @@ Status BetaRowsetWriter::_find_longest_consecutive_small_segment( } if (s == 1) { // poor bachelor, let it go VLOG_DEBUG << "only one candidate segment"; + auto src_seg_id = _segcompacted_point.load(); + auto dst_seg_id = _num_segcompacted.load(); RETURN_IF_ERROR(_rename_compacted_segment_plain(_segcompacted_point++)); + if (_segcompaction_worker->need_convert_delete_bitmap()) { + _segcompaction_worker->convert_segment_delete_bitmap( + _context.mow_context->delete_bitmap, src_seg_id, dst_seg_id); + } segments->clear(); return Status::OK(); } @@ -554,7 +565,7 @@ Status BetaRowsetWriter::_segcompaction_rename_last_segments() { "code: {}", _segcompaction_status.load()); } - if (!_is_segcompacted() || _segcompacted_point == _num_segment) { + if (!is_segcompacted() || _segcompacted_point == _num_segment) { // no need if never segcompact before or all segcompacted return Status::OK(); } @@ -562,7 +573,12 @@ Status BetaRowsetWriter::_segcompaction_rename_last_segments() { // so that transaction can be committed ASAP VLOG_DEBUG << "segcompaction last few segments"; for (int32_t segid = _segcompacted_point; segid < _num_segment; segid++) { + auto dst_segid = _num_segcompacted.load(); RETURN_IF_ERROR(_rename_compacted_segment_plain(_segcompacted_point++)); + if (_segcompaction_worker->need_convert_delete_bitmap()) { + _segcompaction_worker->convert_segment_delete_bitmap( + _context.mow_context->delete_bitmap, segid, dst_segid); + } } return Status::OK(); } @@ -682,6 +698,20 @@ Status BetaRowsetWriter::_close_file_writers() { RETURN_NOT_OK_STATUS_WITH_WARN(seg_comp_file_writer->close(), "close segment compaction worker failed"); } + // process delete bitmap for mow table + if (is_segcompacted() && _segcompaction_worker->need_convert_delete_bitmap()) { + auto converted_delete_bitmap = _segcompaction_worker->get_converted_delete_bitmap(); + // a non-null bitmap means segment compaction was actually triggered + if (converted_delete_bitmap != nullptr) {
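+ // drop the marks recorded against the pre-compaction segment ids and merge in the converted ones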
+ RowsetIdUnorderedSet rowsetids; + rowsetids.insert(rowset_id()); + context().tablet->add_sentinel_mark_to_delete_bitmap(converted_delete_bitmap.get(), + rowsetids); + context().mow_context->delete_bitmap->remove({rowset_id(), 0, 0}, + {rowset_id(), UINT32_MAX, INT64_MAX}); + context().mow_context->delete_bitmap->merge(*converted_delete_bitmap); + } + } } return Status::OK(); } @@ -707,6 +737,14 @@ Status BetaRowsetWriter::build(RowsetSharedPtr& rowset) { : _context.tablet_schema; _rowset_meta->set_tablet_schema(rowset_schema); + if (auto idx_files_info = _idx_files_info.get_inverted_files_info(_segment_start_id); + !idx_files_info.has_value()) [[unlikely]] { + LOG(ERROR) << "expected inverted index files info, but none present: " + << idx_files_info.error(); + } else { + _rowset_meta->add_inverted_index_files_info(idx_files_info.value()); + } + RETURN_NOT_OK_STATUS_WITH_WARN(RowsetFactory::create_rowset(rowset_schema, _context.tablet_path, _rowset_meta, &rowset), "rowset init failed when build new rowset"); @@ -719,7 +757,7 @@ int64_t BaseBetaRowsetWriter::_num_seg() const { } int64_t BetaRowsetWriter::_num_seg() const { - return _is_segcompacted() ? _num_segcompacted : _num_segment; + return is_segcompacted() ? _num_segcompacted : _num_segment; } // update tablet schema when meet variant columns, before commit_txn @@ -816,13 +854,7 @@ Status BaseBetaRowsetWriter::_build_tmp(RowsetSharedPtr& rowset_ptr) { Status BaseBetaRowsetWriter::_create_file_writer(const std::string& path, io::FileWriterPtr& file_writer) { - io::FileWriterOptions opts { - .write_file_cache = _context.write_file_cache, - .is_cold_data = _context.is_hot_data, - .file_cache_expiration = - _context.file_cache_ttl_sec > 0 && _context.newest_write_timestamp > 0 - ? _context.newest_write_timestamp + _context.file_cache_ttl_sec - : 0}; + io::FileWriterOptions opts = _context.get_file_writer_options(); Status st = _context.fs()->create_file(path, &file_writer, &opts); if (!st.ok()) { LOG(WARNING) << "failed to create writable file. path=" << path << ", err: " << st; @@ -861,10 +893,12 @@ Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( writer_options.rowset_ctx = &_context; writer_options.write_type = _context.write_type; writer_options.write_type = DataWriteType::TYPE_COMPACTION; + writer_options.max_rows_per_segment = _context.max_rows_per_segment; + writer_options.mow_ctx = _context.mow_context; - *writer = std::make_unique<segment_v2::SegmentWriter>( - file_writer.get(), _num_segcompacted, _context.tablet_schema, _context.tablet, - _context.data_dir, _context.max_rows_per_segment, writer_options, _context.mow_context); + *writer = std::make_unique<segment_v2::SegmentWriter>(file_writer.get(), _num_segcompacted, + _context.tablet_schema, _context.tablet, + _context.data_dir, writer_options); if (auto& seg_writer = _segcompaction_worker->get_file_writer(); seg_writer != nullptr && seg_writer->state() != io::FileWriter::State::CLOSED) { RETURN_IF_ERROR(_segcompaction_worker->get_file_writer()->close()); @@ -881,9 +915,9 @@ Status BaseBetaRowsetWriter::_check_segment_number_limit() { if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) { return Status::Error<TOO_MANY_SEGMENTS>( "too many segments in rowset.
tablet_id:{}, rowset_id:{}, max:{}, " - "_num_segment:{}, ", + "_num_segment:{}, rowset_num_rows:{}", _context.tablet_id, _context.rowset_id.to_string(), - config::max_segment_num_per_rowset, _num_segment); + config::max_segment_num_per_rowset, _num_segment, get_rowset_num_rows()); } return Status::OK(); } @@ -895,10 +929,10 @@ Status BetaRowsetWriter::_check_segment_number_limit() { if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) { return Status::Error<TOO_MANY_SEGMENTS>( "too many segments in rowset. tablet_id:{}, rowset_id:{}, max:{}, _num_segment:{}, " - "_segcompacted_point:{}, _num_segcompacted:{}", + "_segcompacted_point:{}, _num_segcompacted:{}, rowset_num_rows:{}", _context.tablet_id, _context.rowset_id.to_string(), config::max_segment_num_per_rowset, _num_segment, _segcompacted_point, - _num_segcompacted); + _num_segcompacted, get_rowset_num_rows()); } return Status::OK(); } @@ -963,8 +997,8 @@ Status BetaRowsetWriter::flush_segment_writer_for_segcompaction( SegmentStatistics segstat; segstat.row_num = row_num; - segstat.data_size = segment_size + (*writer)->get_inverted_index_file_size(); - segstat.index_size = index_size + (*writer)->get_inverted_index_file_size(); + segstat.data_size = segment_size + (*writer)->get_inverted_index_total_size(); + segstat.index_size = index_size + (*writer)->get_inverted_index_total_size(); segstat.key_bounds = key_bounds; { std::lock_guard lock(_segid_statistics_map_mutex); diff --git a/be/src/olap/rowset/beta_rowset_writer.h index 98bb43c60926209..a7ec8fe87e90174 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -18,6 +18,7 @@ #pragma once #include +#include <numeric> #include #include @@ -83,6 +84,60 @@ class SegmentFileCollection { bool _closed {false}; }; +// Collect the size of the inverted index files +class InvertedIndexFilesInfo { +public: + // Get inverted index file info in segment id order. + // Return the info of inverted index files from seg_id_offset to the last one.
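+ // seg_id_offset is the rowset's first segment id; map keys are absolute segment ids and are turned into vector indexes by subtracting it.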
+ Result<std::vector<InvertedIndexFileInfo>> get_inverted_files_info(int seg_id_offset) { + std::lock_guard lock(_lock); + + Status st; + std::vector<InvertedIndexFileInfo> inverted_files_info(_inverted_index_files_info.size()); + bool succ = std::all_of( + _inverted_index_files_info.begin(), _inverted_index_files_info.end(), + [&](auto&& it) { + auto&& [seg_id, info] = it; + + int idx = seg_id - seg_id_offset; + if (idx >= inverted_files_info.size()) [[unlikely]] { + auto err_msg = fmt::format( + "invalid seg_id={} num_inverted_files_info={} seg_id_offset={}", + seg_id, inverted_files_info.size(), seg_id_offset); + DCHECK(false) << err_msg; + st = Status::InternalError(err_msg); + return false; + } + + auto& finfo = inverted_files_info[idx]; + if (finfo.has_index_size() || finfo.index_info_size() > 0) [[unlikely]] { + // File size should not have been set yet + auto err_msg = fmt::format("duplicate seg_id={}", seg_id); + DCHECK(false) << err_msg; + st = Status::InternalError(err_msg); + return false; + } + finfo = info; + return true; + }); + + if (succ) { + return inverted_files_info; + } + + return ResultError(st); + } + + void add_file_info(int seg_id, InvertedIndexFileInfo file_info) { + std::lock_guard lock(_lock); + _inverted_index_files_info.emplace(seg_id, file_info); + } + +private: + std::unordered_map<int, InvertedIndexFileInfo> _inverted_index_files_info; + mutable SpinLock _lock; +}; + class BaseBetaRowsetWriter : public RowsetWriter { public: BaseBetaRowsetWriter(); @@ -160,6 +215,8 @@ class BaseBetaRowsetWriter : public RowsetWriter { return _seg_files.get_file_writers(); } + InvertedIndexFilesInfo& get_inverted_index_files_info() { return _idx_files_info; } + private: void update_rowset_schema(TabletSchemaSPtr flush_schema); // build a tmp rowset for load segment to calc delete_bitmap @@ -174,6 +231,11 @@ // build a tmp rowset for load segment to calc delete_bitmap for this segment Status _build_tmp(RowsetSharedPtr& rowset_ptr); + uint64_t get_rowset_num_rows() { + std::lock_guard l(_segid_statistics_map_mutex); + return std::accumulate(_segment_num_rows.begin(), _segment_num_rows.end(), uint64_t(0)); + } + std::atomic<int32_t> _num_segment; // number of consecutive flushed segments roaring::Roaring _segment_set; // bitmap set to record flushed segment id std::mutex _segment_set_mutex; // mutex for _segment_set @@ -207,6 +269,9 @@ int64_t _delete_bitmap_ns = 0; int64_t _segment_writer_ns = 0; + + // map<seg_id, inverted index file info> + InvertedIndexFilesInfo _idx_files_info; }; class SegcompactionWorker; @@ -227,6 +292,8 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { std::unique_ptr<segment_v2::SegmentWriter>* writer, uint64_t index_size, KeyBoundsPB& key_bounds); + bool is_segcompacted() const { return _num_segcompacted > 0; } + private: // segment compaction friend class SegcompactionWorker; @@ -240,7 +307,6 @@ Status _segcompaction_rename_last_segments(); Status _load_noncompacted_segment(segment_v2::SegmentSharedPtr& segment, int32_t segment_id); Status _find_longest_consecutive_small_segment(SegCompactionCandidatesSharedPtr& segments); - bool _is_segcompacted() const { return _num_segcompacted > 0; } bool _check_and_set_is_doing_segcompaction(); Status _rename_compacted_segments(int64_t begin, int64_t end); Status _rename_compacted_segment_plain(uint64_t seg_id); diff --git a/be/src/olap/rowset/beta_rowset_writer_v2.cpp index 3ebe331cfc12f89..0d0ad435b9efd1b 100644 --- a/be/src/olap/rowset/beta_rowset_writer_v2.cpp
+++ b/be/src/olap/rowset/beta_rowset_writer_v2.cpp @@ -58,7 +58,7 @@ namespace doris { using namespace ErrorCode; BetaRowsetWriterV2::BetaRowsetWriterV2(const std::vector<std::shared_ptr<LoadStreamStub>>& streams) - : _segment_creator(_context, _seg_files), _streams(streams) {} + : _segment_creator(_context, _seg_files, _idx_files_info), _streams(streams) {} BetaRowsetWriterV2::~BetaRowsetWriterV2() = default; @@ -83,9 +83,19 @@ Status BetaRowsetWriterV2::create_file_writer(uint32_t segment_id, io::FileWrite Status BetaRowsetWriterV2::add_segment(uint32_t segment_id, const SegmentStatistics& segstat, TabletSchemaSPtr flush_schema) { + bool ok = false; for (const auto& stream : _streams) { - RETURN_IF_ERROR(stream->add_segment(_context.partition_id, _context.index_id, - _context.tablet_id, segment_id, segstat, flush_schema)); + auto st = stream->add_segment(_context.partition_id, _context.index_id, _context.tablet_id, + segment_id, segstat, flush_schema); + if (!st.ok()) { + LOG(WARNING) << "failed to add segment " << segment_id << " to stream " + << stream->stream_id(); + } + ok = ok || st.ok(); + } + if (!ok) { + return Status::InternalError("failed to add segment {} of tablet {} to any replica", + segment_id, _context.tablet_id); } return Status::OK(); } diff --git a/be/src/olap/rowset/beta_rowset_writer_v2.h index d2267a3dbd17342..174b70a072bc173 100644 --- a/be/src/olap/rowset/beta_rowset_writer_v2.h +++ b/be/src/olap/rowset/beta_rowset_writer_v2.h @@ -157,6 +157,8 @@ class BetaRowsetWriterV2 : public RowsetWriter { SegmentCreator _segment_creator; + InvertedIndexFilesInfo _idx_files_info; + fmt::memory_buffer vlog_buffer; std::vector<std::shared_ptr<LoadStreamStub>> _streams; diff --git a/be/src/olap/rowset/rowset.cpp index 8a4a25c21587954..256f4d35313d131 100644 --- a/be/src/olap/rowset/rowset.cpp +++ b/be/src/olap/rowset/rowset.cpp @@ -23,6 +23,7 @@ #include "olap/segment_loader.h" #include "olap/tablet_schema.h" #include "util/time.h" +#include "util/trace.h" namespace doris { @@ -118,8 +119,14 @@ std::string Rowset::get_rowset_info_str() { } void Rowset::clear_cache() { - SegmentLoader::instance()->erase_segments(rowset_id(), num_segments()); - clear_inverted_index_cache(); + { + SCOPED_SIMPLE_TRACE_IF_TIMEOUT(std::chrono::seconds(1)); + SegmentLoader::instance()->erase_segments(rowset_id(), num_segments()); + } + { + SCOPED_SIMPLE_TRACE_IF_TIMEOUT(std::chrono::seconds(1)); + clear_inverted_index_cache(); + } } Result<std::string> Rowset::segment_path(int64_t seg_id) { diff --git a/be/src/olap/rowset/rowset.h index 310d0901b2a7514..6050a33bfc2f5da 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -169,6 +169,7 @@ class Rowset : public std::enable_shared_from_this<Rowset> { bool is_segments_overlapping() const { return rowset_meta()->is_segments_overlapping(); } KeysType keys_type() { return _schema->keys_type(); } RowsetStatePB rowset_meta_state() const { return rowset_meta()->rowset_state(); } + bool produced_by_compaction() const { return rowset_meta()->produced_by_compaction(); } // remove all files in this rowset // TODO should we rename the method to remove_files() to be more specific?
diff --git a/be/src/olap/rowset/rowset_meta.cpp index b969db7a2a229e5..2bc5a6cef858f20 100644 --- a/be/src/olap/rowset/rowset_meta.cpp +++ b/be/src/olap/rowset/rowset_meta.cpp @@ -233,6 +233,13 @@ void RowsetMeta::merge_rowset_meta(const RowsetMeta& other) { _rowset_meta_pb.add_segments_file_size(fsize); } } + if (_rowset_meta_pb.enable_inverted_index_file_info() && + other._rowset_meta_pb.enable_inverted_index_file_info()) { + for (auto finfo : other.inverted_index_file_info()) { + InvertedIndexFileInfo* new_file_info = _rowset_meta_pb.add_inverted_index_file_info(); + *new_file_info = finfo; + } + } // In partial update the rowset schema may be updated when the table contains variant columns, so we need the newest schema here // Otherwise the schema is stale and leads to wrong data read if (tablet_schema()->num_variant_columns() > 0) { @@ -249,6 +256,29 @@ void RowsetMeta::merge_rowset_meta(const RowsetMeta& other) { } } +InvertedIndexFileInfo RowsetMeta::inverted_index_file_info(int seg_id) { + return _rowset_meta_pb.enable_inverted_index_file_info() + ? (_rowset_meta_pb.inverted_index_file_info_size() > seg_id + ? _rowset_meta_pb.inverted_index_file_info(seg_id) + : InvertedIndexFileInfo()) + : InvertedIndexFileInfo(); +} + +void RowsetMeta::add_inverted_index_files_info( + const std::vector<InvertedIndexFileInfo>& idx_file_info) { + _rowset_meta_pb.set_enable_inverted_index_file_info(true); + for (auto finfo : idx_file_info) { + auto* new_file_info = _rowset_meta_pb.add_inverted_index_file_info(); + *new_file_info = finfo; + } +} + +void RowsetMeta::update_inverted_index_files_info( + const std::vector<InvertedIndexFileInfo>& idx_file_info) { + _rowset_meta_pb.clear_inverted_index_file_info(); + add_inverted_index_files_info(idx_file_info); +} + bool operator==(const RowsetMeta& a, const RowsetMeta& b) { if (a._rowset_id != b._rowset_id) return false; if (a._is_removed_from_rowset_meta != b._is_removed_from_rowset_meta) return false; diff --git a/be/src/olap/rowset/rowset_meta.h index aa20b5b1ef13ace..4f25c676f6bd7ff 100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -255,6 +255,12 @@ class RowsetMeta { return num_segments() > 1 && is_singleton_delta() && segments_overlap() != NONOVERLAPPING; }
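+ // heuristic: compaction output either spans multiple versions, or covers a single version whose segments are known to be non-overlapping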
+ bool produced_by_compaction() const { + return has_version() && + (start_version() < end_version() || + (start_version() == end_version() && segments_overlap() == NONOVERLAPPING)); + } + // get the compaction score of this rowset. // if segments are overlapping, the score equals to the number of segments, // otherwise, score is 1. @@ -351,6 +357,16 @@ class RowsetMeta { // Used for partial update, when publish, partial update may add a new rowset and we should update rowset meta void merge_rowset_meta(const RowsetMeta& other); + InvertedIndexFileInfo inverted_index_file_info(int seg_id); + + const auto& inverted_index_file_info() const { + return _rowset_meta_pb.inverted_index_file_info(); + } + + void add_inverted_index_files_info(const std::vector<InvertedIndexFileInfo>& idx_file_info); + + void update_inverted_index_files_info(const std::vector<InvertedIndexFileInfo>& idx_file_info); + // Because the member field '_handle' is a raw pointer, use member func 'init' to replace copy ctor RowsetMeta(const RowsetMeta&) = delete; RowsetMeta operator=(const RowsetMeta&) = delete; diff --git a/be/src/olap/rowset/rowset_meta_manager.cpp index 1cebe28ffbbc784..9d1cbd8858983b2 100644 --- a/be/src/olap/rowset/rowset_meta_manager.cpp +++ b/be/src/olap/rowset/rowset_meta_manager.cpp @@ -533,4 +533,98 @@ Status RowsetMetaManager::load_json_rowset_meta(OlapMeta* meta, return status; } +Status RowsetMetaManager::save_partial_update_info( + OlapMeta* meta, int64_t tablet_id, int64_t partition_id, int64_t txn_id, + const PartialUpdateInfoPB& partial_update_info_pb) { + std::string key = + fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, partition_id, txn_id); + std::string value; + if (!partial_update_info_pb.SerializeToString(&value)) { + return Status::Error<ErrorCode::SERIALIZE_PROTOBUF_ERROR>( + "serialize partial update info failed. key={}", key); + } + VLOG_NOTICE << "save partial update info, key=" << key << ", value_size=" << value.size(); + return meta->put(META_COLUMN_FAMILY_INDEX, key, value); +} + +Status RowsetMetaManager::try_get_partial_update_info(OlapMeta* meta, int64_t tablet_id, int64_t partition_id, int64_t txn_id, PartialUpdateInfoPB* partial_update_info_pb) { + std::string key = + fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, partition_id, txn_id); + std::string value; + Status status = meta->get(META_COLUMN_FAMILY_INDEX, key, &value); + if (status.is<ErrorCode::META_KEY_NOT_FOUND>()) { + return status; + } + if (!status.ok()) { + LOG_WARNING("failed to get partial update info. tablet_id={}, partition_id={}, txn_id={}", + tablet_id, partition_id, txn_id); + return status; + } + if (!partial_update_info_pb->ParseFromString(value)) { + return Status::Error<ErrorCode::PARSE_PROTOBUF_ERROR>( + "fail to parse partial update info content to protobuf object.
tablet_id={}, " + "partition_id={}, txn_id={}", + tablet_id, partition_id, txn_id); + } + return Status::OK(); +} + +Status RowsetMetaManager::traverse_partial_update_info( + OlapMeta* meta, + std::function<bool(int64_t, int64_t, int64_t, std::string_view)> const& func) { + auto traverse_partial_update_info_func = [&func](std::string_view key, + std::string_view value) -> bool { + std::vector<std::string> parts; + // key format: pui_{tablet_id}_{partition_id}_{txn_id} + RETURN_IF_ERROR(split_string(key, '_', &parts)); + if (parts.size() != 4) { + LOG_WARNING("invalid partial update info key={}, split size={}", key, parts.size()); + return true; + } + int64_t tablet_id = std::stoll(parts[1]); + int64_t partition_id = std::stoll(parts[2]); + int64_t txn_id = std::stoll(parts[3]); + return func(tablet_id, partition_id, txn_id, value); + }; + return meta->iterate(META_COLUMN_FAMILY_INDEX, PARTIAL_UPDATE_INFO_PREFIX, + traverse_partial_update_info_func); +} + +Status RowsetMetaManager::remove_partial_update_info(OlapMeta* meta, int64_t tablet_id, + int64_t partition_id, int64_t txn_id) { + std::string key = + fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, partition_id, txn_id); + Status res = meta->remove(META_COLUMN_FAMILY_INDEX, key); + VLOG_NOTICE << "remove partial update info, key=" << key; + return res; +} + +Status RowsetMetaManager::remove_partial_update_infos( + OlapMeta* meta, const std::vector<std::tuple<int64_t, int64_t, int64_t>>& keys) { + std::vector<std::string> remove_keys; + for (auto [tablet_id, partition_id, txn_id] : keys) { + remove_keys.push_back(fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, + partition_id, txn_id)); + } + Status res = meta->remove(META_COLUMN_FAMILY_INDEX, remove_keys); + VLOG_NOTICE << "remove partial update info, remove_keys.size()=" << remove_keys.size(); + return res; +} + +Status RowsetMetaManager::remove_tablet_related_partial_update_info(OlapMeta* meta, + int64_t tablet_id) { + std::string prefix = fmt::format("{}{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id); + std::vector<std::string> remove_keys; + auto get_remove_keys_func = [&](std::string_view key, std::string_view val) -> bool { + remove_keys.emplace_back(key); + return true; + }; + RETURN_IF_ERROR(meta->iterate(META_COLUMN_FAMILY_INDEX, prefix, get_remove_keys_func)); + VLOG_NOTICE << "remove tablet related partial update info, tablet_id: " << tablet_id + << " removed keys size: " << remove_keys.size(); + return meta->remove(META_COLUMN_FAMILY_INDEX, remove_keys); +} + } // namespace doris diff --git a/be/src/olap/rowset/rowset_meta_manager.h index 6618faa38d7d528..b61e8c0276949f9 100644 --- a/be/src/olap/rowset/rowset_meta_manager.h +++ b/be/src/olap/rowset/rowset_meta_manager.h @@ -18,6 +18,8 @@ #ifndef DORIS_BE_SRC_OLAP_ROWSET_ROWSET_META_MANAGER_H #define DORIS_BE_SRC_OLAP_ROWSET_ROWSET_META_MANAGER_H +#include <string_view> + #include #include #include @@ -32,11 +34,15 @@ namespace doris { class OlapMeta; class RowsetMetaPB; +class PartialUpdateInfoPB; } // namespace doris namespace doris { namespace { const std::string ROWSET_PREFIX = "rst_"; + +constexpr std::string_view PARTIAL_UPDATE_INFO_PREFIX = "pui_"; + } // namespace // Helper class for managing rowset meta of one root path.
@@ -80,6 +86,21 @@ class RowsetMetaManager { static Status load_json_rowset_meta(OlapMeta* meta, const std::string& rowset_meta_path); + static Status save_partial_update_info(OlapMeta* meta, int64_t tablet_id, int64_t partition_id, + int64_t txn_id, + const PartialUpdateInfoPB& partial_update_info_pb); + static Status try_get_partial_update_info(OlapMeta* meta, int64_t tablet_id, + int64_t partition_id, int64_t txn_id, + PartialUpdateInfoPB* partial_update_info_pb); + static Status traverse_partial_update_info( + OlapMeta* meta, + std::function<bool(int64_t, int64_t, int64_t, std::string_view)> const& func); + static Status remove_partial_update_info(OlapMeta* meta, int64_t tablet_id, + int64_t partition_id, int64_t txn_id); + static Status remove_partial_update_infos( + OlapMeta* meta, const std::vector<std::tuple<int64_t, int64_t, int64_t>>& keys); + static Status remove_tablet_related_partial_update_info(OlapMeta* meta, int64_t tablet_id); + private: static Status _save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, const RowsetMetaPB& rowset_meta_pb); diff --git a/be/src/olap/rowset/rowset_writer_context.h index 0130916bfb48118..e13f7efe6e94fa4 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -140,6 +140,16 @@ struct RowsetWriterContext { return *storage_resource->fs; } } + + io::FileWriterOptions get_file_writer_options() const { + io::FileWriterOptions opts { + .write_file_cache = write_file_cache, + .is_cold_data = is_hot_data, + .file_cache_expiration = file_cache_ttl_sec > 0 && newest_write_timestamp > 0 + ? newest_write_timestamp + file_cache_ttl_sec + : 0}; + return opts; + } }; } // namespace doris diff --git a/be/src/olap/rowset/segcompaction.cpp index f3e8d9f085c9409..374056f7b9dd964 100644 --- a/be/src/olap/rowset/segcompaction.cpp +++ b/be/src/olap/rowset/segcompaction.cpp @@ -76,11 +76,14 @@ Status SegcompactionWorker::_get_segcompaction_reader( std::vector<uint32_t>& return_columns, std::unique_ptr<vectorized::VerticalBlockReader>* reader) { const auto& ctx = _writer->_context; + bool record_rowids = need_convert_delete_bitmap() && is_key; StorageReadOptions read_options; read_options.stats = stat; read_options.use_page_cache = false; read_options.tablet_schema = ctx.tablet_schema; + read_options.record_rowids = record_rowids; std::vector<std::unique_ptr<RowwiseIterator>> seg_iterators; + std::map<uint32_t, uint32_t> segment_rows; for (auto& seg_ptr : *segments) { std::unique_ptr<RowwiseIterator> iter; auto s = seg_ptr->new_iterator(schema, read_options, &iter); s.to_string()); } seg_iterators.push_back(std::move(iter)); + segment_rows.emplace(seg_ptr->id(), seg_ptr->num_rows()); + } + if (record_rowids && _rowid_conversion != nullptr) { + _rowid_conversion->reset_segment_map(segment_rows); } *reader = std::make_unique<vectorized::VerticalBlockReader>(&row_sources_buf); @@ -102,6 +109,7 @@ reader_params.return_columns = return_columns; reader_params.is_key_column_group = is_key; reader_params.use_page_cache = false; + reader_params.record_rowids = record_rowids; return (*reader)->init(reader_params, nullptr); } @@ -235,6 +243,9 @@ Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPt DCHECK(ctx.tablet); auto tablet = std::static_pointer_cast<Tablet>(ctx.tablet); + if (need_convert_delete_bitmap() && _rowid_conversion == nullptr) { + _rowid_conversion = std::make_unique<SimpleRowIdConversion>(_writer->rowset_id()); + } std::vector<std::vector<uint32_t>> column_groups; Merger::vertical_split_columns(*ctx.tablet_schema, &column_groups); @@ -265,8 +276,8 @@ Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPt Merger::Statistics merger_stats; RETURN_IF_ERROR(Merger::vertical_compact_one_group( tablet->tablet_id(), ReaderType::READER_SEGMENT_COMPACTION, *ctx.tablet_schema, - is_key, column_ids, &row_sources_buf, *reader, *writer, INT_MAX, &merger_stats, - &index_size, key_bounds)); + is_key, column_ids, &row_sources_buf, *reader, *writer, &merger_stats, &index_size, + key_bounds, _rowid_conversion.get())); total_index_size += index_size; if (is_key) { RETURN_IF_ERROR(row_sources_buf.flush()); @@ -292,6 +303,10 @@ Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPt } RETURN_IF_ERROR(_delete_original_segments(begin, end)); + if (_rowid_conversion != nullptr) { + convert_segment_delete_bitmap(ctx.mow_context->delete_bitmap, begin, end, + _writer->_num_segcompacted); + } RETURN_IF_ERROR(_writer->_rename_compacted_segments(begin, end)); if (VLOG_DEBUG_IS_ON) { @@ -352,6 +367,59 @@ void SegcompactionWorker::compact_segments(SegCompactionCandidatesSharedPtr segm _is_compacting_state_mutable = true; }
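+// Delete bitmap marks recorded against the pre-compaction segment ids must be remapped once segcompaction renames segments; only mow tables with a sequence column can carry such marks while the rowset is still being written.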
+bool SegcompactionWorker::need_convert_delete_bitmap() { + if (_writer == nullptr) { + return false; + } + auto tablet = _writer->context().tablet; + return tablet != nullptr && tablet->keys_type() == KeysType::UNIQUE_KEYS && + tablet->enable_unique_key_merge_on_write() && + tablet->tablet_schema()->has_sequence_col(); +} + +void SegcompactionWorker::convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, + uint32_t src_seg_id, uint32_t dest_seg_id) { + // lazy init + if (nullptr == _converted_delete_bitmap) { + _converted_delete_bitmap = std::make_shared<DeleteBitmap>(_writer->context().tablet_id); + } + auto rowset_id = _writer->context().rowset_id; + const auto* seg_map = + src_delete_bitmap->get({rowset_id, src_seg_id, DeleteBitmap::TEMP_VERSION_COMMON}); + if (seg_map != nullptr) { + _converted_delete_bitmap->set({rowset_id, dest_seg_id, DeleteBitmap::TEMP_VERSION_COMMON}, + *seg_map); + } +} + +void SegcompactionWorker::convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, + uint32_t src_begin, uint32_t src_end, + uint32_t dst_seg_id) { + // lazy init + if (nullptr == _converted_delete_bitmap) { + _converted_delete_bitmap = std::make_shared<DeleteBitmap>(_writer->context().tablet_id); + } + auto rowset_id = _writer->context().rowset_id; + RowLocation src(rowset_id, 0, 0); + for (uint32_t seg_id = src_begin; seg_id <= src_end; seg_id++) { + const auto* seg_map = + src_delete_bitmap->get({rowset_id, seg_id, DeleteBitmap::TEMP_VERSION_COMMON}); + if (!seg_map) { + continue; + } + src.segment_id = seg_id; + for (unsigned int row_id : *seg_map) { + src.row_id = row_id; + auto dst_row_id = _rowid_conversion->get(src); + if (dst_row_id < 0) { + continue; + } + _converted_delete_bitmap->add( + {rowset_id, dst_seg_id, DeleteBitmap::TEMP_VERSION_COMMON}, dst_row_id); + } + } +} + bool SegcompactionWorker::cancel() { // return true if the task is cancellable (actual compaction is not started) // return false when the task is not cancellable (it is in the middle of segcompaction) diff --git a/be/src/olap/rowset/segcompaction.h index 5aef89992d30b82..67dd6889aadd72a 100644 --- a/be/src/olap/rowset/segcompaction.h +++ b/be/src/olap/rowset/segcompaction.h @@ -23,6 +23,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" #include "olap/merger.h" +#include "olap/simple_rowid_conversion.h" #include "olap/tablet.h" #include
"segment_v2/segment.h" @@ -51,6 +52,14 @@ class SegcompactionWorker { void compact_segments(SegCompactionCandidatesSharedPtr segments); + bool need_convert_delete_bitmap(); + + void convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, uint32_t src_seg_id, + uint32_t dest_seg_id); + void convert_segment_delete_bitmap(DeleteBitmapPtr src_delete_bitmap, uint32_t src_begin, + uint32_t src_end, uint32_t dest_seg_id); + DeleteBitmapPtr get_converted_delete_bitmap() { return _converted_delete_bitmap; } + io::FileWriterPtr& get_file_writer() { return _file_writer; } // set the cancel flag, tasks already started will not be cancelled. @@ -78,6 +87,10 @@ class SegcompactionWorker { BetaRowsetWriter* _writer = nullptr; io::FileWriterPtr _file_writer; + // for unique key mow table + std::unique_ptr _rowid_conversion; + DeleteBitmapPtr _converted_delete_bitmap; + // the state is not mutable when 1)actual compaction operation started or 2) cancelled std::atomic _is_compacting_state_mutable = true; }; diff --git a/be/src/olap/rowset/segment_creator.cpp b/be/src/olap/rowset/segment_creator.cpp index 82313f988cbb2b4..b7cbd8b6022616a 100644 --- a/be/src/olap/rowset/segment_creator.cpp +++ b/be/src/olap/rowset/segment_creator.cpp @@ -27,6 +27,7 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" +#include "common/exception.h" #include "common/logging.h" #include "common/status.h" #include "io/fs/file_writer.h" @@ -50,8 +51,9 @@ namespace doris { using namespace ErrorCode; -SegmentFlusher::SegmentFlusher(RowsetWriterContext& context, SegmentFileCollection& seg_files) - : _context(context), _seg_files(seg_files) {} +SegmentFlusher::SegmentFlusher(RowsetWriterContext& context, SegmentFileCollection& seg_files, + InvertedIndexFilesInfo& idx_files_info) + : _context(context), _seg_files(seg_files), _idx_files_info(idx_files_info) {} SegmentFlusher::~SegmentFlusher() = default; @@ -66,16 +68,15 @@ Status SegmentFlusher::flush_single_block(const vectorized::Block* block, int32_ RETURN_IF_ERROR(_parse_variant_columns(flush_block)); } bool no_compression = flush_block.bytes() <= config::segment_compression_threshold_kb * 1024; - if (config::enable_vertical_segment_writer && - _context.tablet_schema->cluster_key_idxes().empty()) { + if (config::enable_vertical_segment_writer) { std::unique_ptr writer; RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, no_compression)); - RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, flush_block.rows())); + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(_add_rows(writer, &flush_block, 0, flush_block.rows())); RETURN_IF_ERROR(_flush_segment_writer(writer, writer->flush_schema(), flush_size)); } else { std::unique_ptr writer; RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, no_compression)); - RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, flush_block.rows())); + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(_add_rows(writer, &flush_block, 0, flush_block.rows())); RETURN_IF_ERROR(_flush_segment_writer(writer, writer->flush_schema(), flush_size)); } return Status::OK(); @@ -150,14 +151,15 @@ Status SegmentFlusher::_create_segment_writer(std::unique_ptr( segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, - _context.data_dir, _context.max_rows_per_segment, writer_options, _context.mow_context, - std::move(inverted_file_writer)); + _context.data_dir, writer_options, std::move(inverted_file_writer)); RETURN_IF_ERROR(_seg_files.add(segment_id, std::move(segment_file_writer))); auto s = writer->init(); if (!s.ok()) { 
@@ -187,14 +189,14 @@ Status SegmentFlusher::_create_segment_writer( writer_options.enable_unique_key_merge_on_write = _context.enable_unique_key_merge_on_write; writer_options.rowset_ctx = &_context; writer_options.write_type = _context.write_type; + writer_options.mow_ctx = _context.mow_context; if (no_compression) { writer_options.compression_type = NO_COMPRESSION; } writer = std::make_unique<segment_v2::VerticalSegmentWriter>( segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, - _context.data_dir, _context.max_rows_per_segment, writer_options, _context.mow_context, - std::move(inverted_file_writer)); + _context.data_dir, writer_options, std::move(inverted_file_writer)); RETURN_IF_ERROR(_seg_files.add(segment_id, std::move(segment_file_writer))); auto s = writer->init(); if (!s.ok()) { @@ -241,10 +243,11 @@ Status SegmentFlusher::_flush_segment_writer( uint32_t segment_id = writer->segment_id(); SegmentStatistics segstat; segstat.row_num = row_num; - segstat.data_size = segment_size + writer->inverted_index_file_size(); - segstat.index_size = index_size + writer->inverted_index_file_size(); + segstat.data_size = segment_size + writer->get_inverted_index_total_size(); + segstat.index_size = index_size + writer->get_inverted_index_total_size(); segstat.key_bounds = key_bounds; + _idx_files_info.add_file_info(segment_id, writer->get_inverted_index_file_info()); writer.reset(); RETURN_IF_ERROR(_context.segment_collector->add(segment_id, segstat, flush_schema)); @@ -286,10 +289,11 @@ Status SegmentFlusher::_flush_segment_writer(std::unique_ptr<segment_v2::VerticalSegmentWriter>& writer, uint32_t segment_id = writer->get_segment_id(); SegmentStatistics segstat; segstat.row_num = row_num; - segstat.data_size = segment_size + writer->get_inverted_index_file_size(); - segstat.index_size = index_size + writer->get_inverted_index_file_size(); + segstat.data_size = segment_size + writer->get_inverted_index_total_size(); + segstat.index_size = index_size + writer->get_inverted_index_total_size(); segstat.key_bounds = key_bounds; + _idx_files_info.add_file_info(segment_id, writer->get_inverted_index_file_info()); writer.reset(); RETURN_IF_ERROR(_context.segment_collector->add(segment_id, segstat, flush_schema)); @@ -323,8 +327,9 @@ int64_t SegmentFlusher::Writer::max_row_to_add(size_t row_avg_size_in_bytes) { return _writer->max_row_to_add(row_avg_size_in_bytes); } -SegmentCreator::SegmentCreator(RowsetWriterContext& context, SegmentFileCollection& seg_files) - : _segment_flusher(context, seg_files) {} +SegmentCreator::SegmentCreator(RowsetWriterContext& context, SegmentFileCollection& seg_files, + InvertedIndexFilesInfo& idx_files_info) + : _segment_flusher(context, seg_files, idx_files_info) {} Status SegmentCreator::add_block(const vectorized::Block* block) { if (block->rows() == 0) { diff --git a/be/src/olap/rowset/segment_creator.h index 97a8f177ad91167..961e161853c1b72 100644 --- a/be/src/olap/rowset/segment_creator.h +++ b/be/src/olap/rowset/segment_creator.h @@ -46,6 +46,7 @@ class VerticalSegmentWriter; struct SegmentStatistics; class BetaRowsetWriter; class SegmentFileCollection; +class InvertedIndexFilesInfo; class FileWriterCreator { public: @@ -93,7 +94,8 @@ class SegmentCollectorT : public SegmentCollector { class SegmentFlusher { public: - SegmentFlusher(RowsetWriterContext& context, SegmentFileCollection& seg_files); + SegmentFlusher(RowsetWriterContext& context, SegmentFileCollection& seg_files, + InvertedIndexFilesInfo& idx_files_info); ~SegmentFlusher(); @@ -158,6 +160,7 @@ class SegmentFlusher { private:
RowsetWriterContext& _context; SegmentFileCollection& _seg_files; + InvertedIndexFilesInfo& _idx_files_info; // written rows by add_block/add_row std::atomic<int64_t> _num_rows_written = 0; @@ -169,7 +172,8 @@ class SegmentCreator { public: - SegmentCreator(RowsetWriterContext& context, SegmentFileCollection& seg_files); + SegmentCreator(RowsetWriterContext& context, SegmentFileCollection& seg_files, + InvertedIndexFilesInfo& idx_files_info); ~SegmentCreator() = default; diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp index 7ad20f210c2c867..d4b0c5fff1b78c2 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp @@ -34,6 +34,15 @@ #undef bshuf_compress_lz4 #undef bshuf_decompress_lz4 +#undef BITSHUFFLE_H +#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_neon +#define bshuf_compress_lz4 bshuf_compress_lz4_neon +#define bshuf_decompress_lz4 bshuf_decompress_lz4_neon +#include // NOLINT(*) +#undef bshuf_compress_lz4_bound +#undef bshuf_compress_lz4 +#undef bshuf_decompress_lz4 + using base::CPU; namespace doris { @@ -63,6 +72,10 @@ __attribute__((constructor)) void SelectBitshuffleFunctions() { g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; g_bshuf_compress_lz4 = bshuf_compress_lz4; g_bshuf_decompress_lz4 = bshuf_decompress_lz4; } +#elif defined(__ARM_NEON) && defined(__aarch64__) && !defined(__APPLE__) + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_neon; + g_bshuf_compress_lz4 = bshuf_compress_lz4_neon; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4_neon; #else g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; g_bshuf_compress_lz4 = bshuf_compress_lz4; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 3e80d2c6e618c20..98669ccb141ae72 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -320,9 +320,11 @@ Status BloomFilterIndexWriter::create(const BloomFilterOptions& bf_options, M(FieldType::OLAP_FIELD_TYPE_DECIMAL64) M(FieldType::OLAP_FIELD_TYPE_DECIMAL128I) M(FieldType::OLAP_FIELD_TYPE_DECIMAL256) + M(FieldType::OLAP_FIELD_TYPE_IPV4) + M(FieldType::OLAP_FIELD_TYPE_IPV6) #undef M default: - return Status::NotSupported("unsupported type for bitmap index: {}", + return Status::NotSupported("unsupported type for bloom filter index: {}", std::to_string(int(type))); } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp index 798cbe082613ca2..9e748a01db2fe45 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -468,6 +468,7 @@ Status ScalarColumnWriter::init() { return Status::OK(); } Status add_nulls(uint32_t count) override { return Status::OK(); } + Status add_array_nulls(uint32_t row_id) override { return Status::OK(); } Status finish() override { return Status::OK(); } int64_t size() const override { return 0; } void close_on_error() override {} @@ -946,10 +947,18 @@ Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { RETURN_IF_ERROR(append_data(ptr, num_rows)); if (is_nullable()) {
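+ // rows whose entire array value is NULL are recorded in the inverted index builder so NULL-aware predicates can be answered from the index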
+ if (_opts.need_inverted_index) { + for (int row_id = 0; row_id < num_rows; row_id++) { + if (null_map[row_id] == 1) { + RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(row_id)); + } + } + } RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); } return Status::OK(); } + Status ArrayColumnWriter::finish() { RETURN_IF_ERROR(_offset_writer->finish()); if (is_nullable()) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp index 428dc05e6f6aa5f..ec1b5bdd9e4d35d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -31,7 +31,9 @@ namespace doris::segment_v2 { PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher, const TQueryOptions& query_options) - : _searcher(searcher), _query(std::make_unique<CL_NS(search)::BooleanQuery>()) {} + : _searcher(searcher), + _query(std::make_unique<CL_NS(search)::BooleanQuery>()), + _max_expansions(query_options.inverted_index_max_expansions) {} void PhraseEdgeQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) { if (terms.empty()) { @@ -50,9 +52,9 @@ void PhraseEdgeQuery::search(roaring::Roaring& roaring) { } void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { - size_t count = 0; + bool first = true; std::wstring sub_term = StringUtil::string_to_wstring(_terms[0]); - find_words([this, &count, &sub_term, &roaring](Term* term) { + find_words([this, &first, &sub_term, &roaring](Term* term) { std::wstring_view ws_term(term->text(), term->textLength()); if (ws_term.find(sub_term) == std::wstring::npos) { return; @@ -70,12 +72,12 @@ void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { } _CLDELETE(term_doc); - if (count) { + if (first) { roaring.swap(result); + first = false; } else { roaring |= result; } - count++; }); } @@ -86,15 +88,19 @@ void PhraseEdgeQuery::search_multi_term(roaring::Roaring& roaring) { std::vector<Term*> suffix_terms; std::vector<Term*> prefix_terms; - find_words([&suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { + find_words([this, &suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { std::wstring_view ws_term(term->text(), term->textLength()); - if (ws_term.ends_with(suffix_term)) { - suffix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || suffix_terms.size() < _max_expansions) { + if (ws_term.ends_with(suffix_term)) { + suffix_terms.push_back(_CL_POINTER(term)); + } } - if (ws_term.starts_with(prefix_term)) { - prefix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || prefix_terms.size() < _max_expansions) { + if (ws_term.starts_with(prefix_term)) { + prefix_terms.push_back(_CL_POINTER(term)); + } } }); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h index 823f46285b1d007..5daf382e0d08fa7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -52,6 +52,7 @@ class PhraseEdgeQuery : public Query { std::wstring _field_name; std::vector<std::string> _terms; std::unique_ptr<CL_NS(search)::BooleanQuery> _query; + int32_t _max_expansions = 50; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index 33fcd10ef363030..e47189f9137adaf 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -40,7 +40,6 @@ Status compact_column(int64_t index_id,
std::vector& "debug point: index compaction error"); } }) - lucene::store::Directory* dir = DorisFSDirectoryFactory::getDirectory(io::global_local_filesystem(), tmp_path.data()); lucene::analysis::SimpleAnalyzer analyzer; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp index 67c3ac5253f75a8..7613df112ed9aad 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp @@ -32,6 +32,7 @@ #include "CLucene/SharedHeader.h" #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" #include "olap/tablet_schema.h" +#include "util/debug_points.h" namespace doris { namespace io { @@ -65,7 +66,7 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { public: CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset, const int64_t length, - const int32_t readBufferSize = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE); + const int32_t read_buffer_size = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE); CSIndexInput(const CSIndexInput& clone); ~CSIndexInput() override; void close() override; @@ -77,8 +78,8 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { }; CSIndexInput::CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset, - const int64_t length, const int32_t _readBufferSize) - : BufferedIndexInput(_readBufferSize) { + const int64_t length, const int32_t read_buffer_size) + : BufferedIndexInput(read_buffer_size) { this->base = base; this->fileOffset = fileOffset; this->_length = length; @@ -110,25 +111,13 @@ CSIndexInput::CSIndexInput(const CSIndexInput& clone) : BufferedIndexInput(clone void CSIndexInput::close() {} -DorisCompoundReader::DorisCompoundReader(lucene::store::Directory* d, const char* name, - int32_t read_buffer_size, bool open_idx_file_cache) - : readBufferSize(read_buffer_size), - dir(d), - ram_dir(new lucene::store::RAMDirectory()), - file_name(name), - entries(_CLNEW EntriesType(true, true)) { - bool success = false; +DorisCompoundReader::DorisCompoundReader(CL_NS(store)::IndexInput* stream, int32_t read_buffer_size) + : _ram_dir(new lucene::store::RAMDirectory()), + _stream(stream), + _entries(_CLNEW EntriesType(true, true)), + _read_buffer_size(read_buffer_size) { try { - if (dir->fileLength(name) == 0) { - LOG(WARNING) << "CompoundReader open failed, index file " << name << " is empty."; - _CLTHROWA(CL_ERR_IO, - fmt::format("CompoundReader open failed, index file {} is empty", name) - .c_str()); - } - stream = dir->openInput(name, readBufferSize); - stream->setIdxFileCache(open_idx_file_cache); - - int32_t count = stream->readVInt(); + int32_t count = _stream->readVInt(); ReaderFileEntry* entry = nullptr; TCHAR tid[CL_MAX_PATH]; uint8_t buffer[BUFFER_LENGTH]; @@ -139,37 +128,50 @@ DorisCompoundReader::DorisCompoundReader(lucene::store::Directory* d, const char entry->file_name = aid; entry->offset = stream->readLong(); entry->length = stream->readLong(); - entries->put(aid, entry); + DBUG_EXECUTE_IF("construct_DorisCompoundReader_failed", { + CLuceneError err; + err.set(CL_ERR_IO, "construct_DorisCompoundReader_failed"); + throw err; + }) + _entries->put(aid, entry); // read header file data if (entry->offset < 0) { copyFile(entry->file_name.c_str(), entry->length, buffer, BUFFER_LENGTH); } } - - success = true; - } - _CLFINALLY(if (!success && (stream != nullptr)) { + } catch (...) 
{ try { - stream->close(); - _CLDELETE(stream) + if (_stream != nullptr) { + _stream->close(); + _CLDELETE(_stream) + } + if (_entries != nullptr) { + _entries->clear(); + _CLDELETE(_entries); + } + if (_ram_dir) { + _ram_dir->close(); + _CLDELETE(_ram_dir) + } } catch (CLuceneError& err) { if (err.number() != CL_ERR_IO) { throw err; } } - }) + throw; + } } void DorisCompoundReader::copyFile(const char* file, int64_t file_length, uint8_t* buffer, int64_t buffer_length) { - std::unique_ptr<lucene::store::IndexOutput> output(ram_dir->createOutput(file)); + std::unique_ptr<lucene::store::IndexOutput> output(_ram_dir->createOutput(file)); int64_t start_ptr = output->getFilePointer(); int64_t remainder = file_length; int64_t chunk = buffer_length; while (remainder > 0) { int64_t len = std::min(std::min(chunk, file_length), remainder); - stream->readBytes(buffer, len); + _stream->readBytes(buffer, len); output->writeBytes(buffer, len); remainder -= len; } @@ -178,7 +180,7 @@ void DorisCompoundReader::copyFile(const char* file, int64_t file_length, uint8_ swprintf(buf, CL_MAX_PATH + 100, _T("Non-zero remainder length after copying") _T(": %d (id: %s, length: %d, buffer size: %d)"), - (int)remainder, file_name.c_str(), (int)file_length, (int)chunk); + (int)remainder, file, (int)file_length, (int)chunk); _CLTHROWT(CL_ERR_IO, buf); } @@ -203,7 +205,7 @@ DorisCompoundReader::~DorisCompoundReader() { LOG(ERROR) << "DorisCompoundReader finalize error:" << err.what(); } } - _CLDELETE(entries) + _CLDELETE(_entries) } const char* DorisCompoundReader::getClassName() { @@ -214,26 +216,22 @@ const char* DorisCompoundReader::getObjectName() const { } bool DorisCompoundReader::list(std::vector<std::string>* names) const { - for (EntriesType::const_iterator i = entries->begin(); i != entries->end(); i++) { + for (EntriesType::const_iterator i = _entries->begin(); i != _entries->end(); i++) { names->push_back(i->first); } return true; } bool DorisCompoundReader::fileExists(const char* name) const { - return entries->exists((char*)name); -} - -lucene::store::Directory* DorisCompoundReader::getDirectory() { - return dir; + return _entries->exists((char*)name); } int64_t DorisCompoundReader::fileModified(const char* name) const { - return dir->fileModified(name); + return 0; } int64_t DorisCompoundReader::fileLength(const char* name) const { - ReaderFileEntry* e = entries->get((char*)name); + ReaderFileEntry* e = _entries->get((char*)name); if (e == nullptr) { char buf[CL_MAX_PATH + 30]; strcpy(buf, "File "); @@ -257,12 +255,12 @@ bool DorisCompoundReader::openInput(const char* name, bool DorisCompoundReader::openInput(const char* name, lucene::store::IndexInput*& ret, CLuceneError& error, int32_t bufferSize) { - if (stream == nullptr) { + if (_stream == nullptr) { error.set(CL_ERR_IO, "Stream closed"); return false; } - const ReaderFileEntry* entry = entries->get((char*)name); + const ReaderFileEntry* entry = _entries->get((char*)name); if (entry == nullptr) { char buf[CL_MAX_PATH + 26]; snprintf(buf, CL_MAX_PATH + 26, "No sub-file with id %s found", name); @@ -271,34 +269,30 @@ bool DorisCompoundReader::openInput(const char* name, lucene::store::IndexInput* } // If file is in RAM, just return.
- if (ram_dir && ram_dir->fileExists(name)) { - return ram_dir->openInput(name, ret, error, bufferSize); + if (_ram_dir && _ram_dir->fileExists(name)) { + return _ram_dir->openInput(name, ret, error, bufferSize); } if (bufferSize < 1) { - bufferSize = readBufferSize; + bufferSize = _read_buffer_size; } - ret = _CLNEW CSIndexInput(stream, entry->offset, entry->length, bufferSize); + ret = _CLNEW CSIndexInput(_stream, entry->offset, entry->length, bufferSize); return true; } void DorisCompoundReader::close() { std::lock_guard wlock(_this_lock); - if (stream != nullptr) { - stream->close(); - _CLDELETE(stream) - } - if (entries != nullptr) { - entries->clear(); + if (_stream != nullptr) { + _stream->close(); + _CLDELETE(_stream) } - if (ram_dir) { - ram_dir->close(); - _CLDELETE(ram_dir) + if (_entries != nullptr) { + _entries->clear(); } - if (dir) { - dir->close(); - _CLDECDELETE(dir) + if (_ram_dir) { + _ram_dir->close(); + _CLDELETE(_ram_dir) } _closed = true; } @@ -324,12 +318,11 @@ lucene::store::IndexOutput* DorisCompoundReader::createOutput(const char* /*name } std::string DorisCompoundReader::toString() const { - return std::string("DorisCompoundReader@") + this->directory + std::string("; file_name: ") + - std::string(file_name); + return std::string("DorisCompoundReader@"); } CL_NS(store)::IndexInput* DorisCompoundReader::getDorisIndexInput() { - return stream; + return _stream; } } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h index 1ca2d6ad3718c0f..a30c39f8a2ffdde 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h @@ -65,16 +65,12 @@ using EntriesType = lucene::util::Deletor::Object>; class CLUCENE_EXPORT DorisCompoundReader : public lucene::store::Directory { private: - int32_t readBufferSize; - // base info - lucene::store::Directory* dir = nullptr; - lucene::store::RAMDirectory* ram_dir = nullptr; - std::string directory; - std::string file_name; - CL_NS(store)::IndexInput* stream = nullptr; - EntriesType* entries = nullptr; + lucene::store::RAMDirectory* _ram_dir = nullptr; + CL_NS(store)::IndexInput* _stream = nullptr; + EntriesType* _entries = nullptr; std::mutex _this_lock; bool _closed = false; + int32_t _read_buffer_size = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE; protected: /** Removes an existing file in the directory-> */ @@ -83,10 +79,10 @@ class CLUCENE_EXPORT DorisCompoundReader : public lucene::store::Directory { public: explicit DorisCompoundReader( CL_NS(store)::IndexInput* stream, EntriesType* entries_clone, - int32_t _readBufferSize = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE) - : readBufferSize(_readBufferSize), - stream(stream), - entries(_CLNEW EntriesType(true, true)) { + int32_t read_buffer_size = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE) + : _stream(stream), + _entries(_CLNEW EntriesType(true, true)), + _read_buffer_size(read_buffer_size) { for (auto& e : *entries_clone) { auto* origin_entry = e.second; auto* entry = _CLNEW ReaderFileEntry(); @@ -94,17 +90,15 @@ class CLUCENE_EXPORT DorisCompoundReader : public lucene::store::Directory { entry->file_name = origin_entry->file_name; entry->offset = origin_entry->offset; entry->length = origin_entry->length; - entries->put(aid, entry); + _entries->put(aid, entry); } }; - DorisCompoundReader(lucene::store::Directory* dir, const char* name, - int32_t _readBufferSize = 
CL_NS(store)::BufferedIndexInput::BUFFER_SIZE, - bool open_idx_file_cache = false); + DorisCompoundReader(CL_NS(store)::IndexInput* stream, + int32_t read_buffer_size = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE); ~DorisCompoundReader() override; void copyFile(const char* file, int64_t file_length, uint8_t* buffer, int64_t buffer_length); bool list(std::vector<std::string>* names) const override; bool fileExists(const char* name) const override; - lucene::store::Directory* getDirectory(); int64_t fileModified(const char* name) const override; int64_t fileLength(const char* name) const override; bool openInput(const char* name, lucene::store::IndexInput*& ret, CLuceneError& err, @@ -116,7 +110,6 @@ class CLUCENE_EXPORT DorisCompoundReader : public lucene::store::Directory { lucene::store::IndexOutput* createOutput(const char* name) override; void close() override; std::string toString() const override; - std::string getFileName() { return file_name; } std::string getPath() const; static const char* getClassName(); const char* getObjectName() const override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp index dbd86bb93a511ec..e0c75922c98bb20 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp @@ -28,13 +28,18 @@ namespace doris::segment_v2 { Status InvertedIndexFileReader::init(int32_t read_buffer_size, bool open_idx_file_cache) { - _read_buffer_size = read_buffer_size; - _open_idx_file_cache = open_idx_file_cache; - if (_storage_format == InvertedIndexStorageFormatPB::V2) { - return _init_from_v2(read_buffer_size); - } else { - return Status::OK(); + if (!_inited) { + _read_buffer_size = read_buffer_size; + _open_idx_file_cache = open_idx_file_cache; + if (_storage_format == InvertedIndexStorageFormatPB::V2) { + auto st = _init_from_v2(read_buffer_size); + if (!st.ok()) { + return st; + } + } + _inited = true; } + return Status::OK(); } Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { @@ -42,34 +47,39 @@ Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { std::unique_lock lock(_mutex); // Lock for writing try { - int64_t file_size = 0; - Status st = _fs->file_size(index_file_full_path, &file_size); - DBUG_EXECUTE_IF("inverted file read error: index file not found", { - st = Status::Error<ErrorCode::NOT_FOUND>("index file not found"); - }) - if (st.code() == ErrorCode::NOT_FOUND) { - return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( - "inverted index file {} is not found", index_file_full_path); - } else if (!st.ok()) { - return st; - } - if (file_size == 0) { - LOG(WARNING) << "inverted index file " << index_file_full_path << " is empty."; - return Status::Error( - "inverted index file {} is empty", index_file_full_path); - } - CLuceneError err; CL_NS(store)::IndexInput* index_input = nullptr; - auto ok = DorisFSDirectory::FSIndexInput::open(_fs, index_file_full_path.c_str(), - index_input, err, read_buffer_size); + + // 1. get file size from meta + int64_t file_size = -1; + if (_idx_file_info.has_index_size()) { + file_size = _idx_file_info.index_size(); + } + file_size = file_size == 0 ? -1 : file_size;
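+ // -1 lets FSIndexInput::open stat the file itself; a size recorded in rowset meta saves that metadata request on every open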
open file + auto ok = DorisFSDirectory::FSIndexInput::open( + _fs, index_file_full_path.c_str(), index_input, err, read_buffer_size, file_size); if (!ok) { + if (err.number() == CL_ERR_FileNotFound) { + return Status::Error( + "inverted index file {} is not found.", index_file_full_path); + } return Status::Error( "CLuceneError occur when open idx file {}, error msg: {}", index_file_full_path, err.what()); } index_input->setIdxFileCache(_open_idx_file_cache); _stream = std::unique_ptr(index_input); + + // 3. read file int32_t version = _stream->readInt(); // Read version number if (version == InvertedIndexStorageFormatPB::V2) { DCHECK(version == _storage_format); @@ -148,23 +158,49 @@ Result> InvertedIndexFileReader::_open( std::unique_ptr compound_reader; if (_storage_format == InvertedIndexStorageFormatPB::V1) { - DorisFSDirectory* dir = nullptr; auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_id, index_suffix); try { - std::filesystem::path path(index_file_path); - dir = DorisFSDirectoryFactory::getDirectory(_fs, path.parent_path().c_str()); - compound_reader = std::make_unique( - dir, path.filename().c_str(), _read_buffer_size, _open_idx_file_cache); - } catch (CLuceneError& err) { - if (dir != nullptr) { - dir->close(); - _CLDELETE(dir) + CLuceneError err; + CL_NS(store)::IndexInput* index_input = nullptr; + + // 1. get file size from meta + int64_t file_size = -1; + if (_idx_file_info.index_info_size() > 0) { + for (const auto& idx_info : _idx_file_info.index_info()) { + if (index_id == idx_info.index_id() && + index_suffix == idx_info.index_suffix()) { + file_size = idx_info.index_file_size(); + break; + } + } } - if (err.number() == CL_ERR_FileNotFound) { - return ResultError(Status::Error( - "inverted index path: {} not exist.", index_file_path)); + file_size = file_size == 0 ? -1 : file_size; + DBUG_EXECUTE_IF("file_size_not_in_rowset_meta ", { + if (file_size == -1) { + return ResultError(Status::Error( + "CLuceneError occur file size = -1, file is {}", index_file_path)); + } + }) + + // 2. open file + auto ok = DorisFSDirectory::FSIndexInput::open( + _fs, index_file_path.c_str(), index_input, err, _read_buffer_size, file_size); + if (!ok) { + // now index_input = nullptr + if (err.number() == CL_ERR_FileNotFound) { + return ResultError(Status::Error( + "inverted index file {} is not found.", index_file_path)); + } + return ResultError(Status::Error( + "CLuceneError occur when open idx file {}, error msg: {}", index_file_path, + err.what())); } + + // 3. 
read file in DorisCompoundReader + index_input->setIdxFileCache(_open_idx_file_cache); + compound_reader = std::make_unique(index_input, _read_buffer_size); + } catch (CLuceneError& err) { return ResultError(Status::Error( "CLuceneError occur when open idx file {}, error msg: {}", index_file_path, err.what())); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h index fc0dd6e43d1c702..8bc28b1882f9d85 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h @@ -51,10 +51,12 @@ class InvertedIndexFileReader { std::map, std::unique_ptr>; InvertedIndexFileReader(io::FileSystemSPtr fs, std::string index_path_prefix, - InvertedIndexStorageFormatPB storage_format) + InvertedIndexStorageFormatPB storage_format, + InvertedIndexFileInfo idx_file_info = InvertedIndexFileInfo()) : _fs(std::move(fs)), _index_path_prefix(std::move(index_path_prefix)), - _storage_format(storage_format) {} + _storage_format(storage_format), + _idx_file_info(idx_file_info) {} Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size, bool open_idx_file_cache = false); @@ -65,6 +67,8 @@ class InvertedIndexFileReader { Status index_file_exist(const TabletIndex* index_meta, bool* res) const; Status has_null(const TabletIndex* index_meta, bool* res) const; Result get_all_directories(); + // open file v2, init _stream + int64_t get_inverted_file_size() const { return _stream == nullptr ? 0 : _stream->length(); } private: Status _init_from_v2(int32_t read_buffer_size); @@ -72,13 +76,15 @@ class InvertedIndexFileReader { const std::string& index_suffix) const; IndicesEntriesMap _indices_entries; - std::unique_ptr _stream; + std::unique_ptr _stream = nullptr; const io::FileSystemSPtr _fs; std::string _index_path_prefix; int32_t _read_buffer_size = -1; bool _open_idx_file_cache = false; InvertedIndexStorageFormatPB _storage_format; mutable std::shared_mutex _mutex; // Use mutable for const read operations + bool _inited = false; + InvertedIndexFileInfo _idx_file_info; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp index 3dd842b16d8808c..d11b9fa54d04219 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp @@ -84,8 +84,8 @@ Status InvertedIndexFileWriter::delete_index(const TabletIndex* index_meta) { return Status::OK(); } -size_t InvertedIndexFileWriter::headerLength() { - size_t header_size = 0; +int64_t InvertedIndexFileWriter::headerLength() { + int64_t header_size = 0; header_size += sizeof(int32_t) * 2; // Account for the size of the version number and number of indices @@ -115,12 +115,14 @@ Status InvertedIndexFileWriter::close() { } DBUG_EXECUTE_IF("inverted_index_storage_format_must_be_v2", { if (_storage_format != InvertedIndexStorageFormatPB::V2) { - _CLTHROWA(CL_ERR_IO, "inverted index storage format must be v2"); + return Status::Error( + "InvertedIndexFileWriter::close fault injection:inverted index storage format " + "must be v2"); } }) if (_storage_format == InvertedIndexStorageFormatPB::V1) { try { - _file_size = write_v1(); + _total_file_size = write_v1(); for (const auto& entry : _indices_dirs) { const auto& dir = entry.second; // delete index path, which contains separated inverted index files @@ -135,7 +137,7 @@ Status 
InvertedIndexFileWriter::close() { } } else { try { - _file_size = write_v2(); + _total_file_size = write_v2(); for (const auto& entry : _indices_dirs) { const auto& dir = entry.second; // delete index path, which contains separated inverted index files @@ -218,8 +220,8 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire input->close(); } -size_t InvertedIndexFileWriter::write_v1() { - size_t total_size = 0; +int64_t InvertedIndexFileWriter::write_v1() { + int64_t total_size = 0; for (const auto& entry : _indices_dirs) { const int64_t index_id = entry.first.first; const auto& index_suffix = entry.first.second; @@ -281,6 +283,7 @@ size_t InvertedIndexFileWriter::write_v1() { ram_dir.close(); auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, idx_path.c_str()); + out_dir->set_file_writer_opts(_opts); auto* out = out_dir->createOutput(idx_name.c_str()); if (out == nullptr) { @@ -327,6 +330,12 @@ size_t InvertedIndexFileWriter::write_v1() { output->close(); //LOG(INFO) << (idx_path / idx_name).c_str() << " size:" << compound_file_size; total_size += compound_file_size; + InvertedIndexFileInfo_IndexInfo index_info; + index_info.set_index_id(index_id); + index_info.set_index_suffix(index_suffix); + index_info.set_index_file_size(compound_file_size); + auto* new_index_info = _file_info.add_index_info(); + *new_index_info = index_info; } catch (CLuceneError& err) { LOG(ERROR) << "CLuceneError occur when close idx file " << InvertedIndexDescriptor::get_index_file_path_v1(_index_path_prefix, @@ -339,13 +348,15 @@ size_t InvertedIndexFileWriter::write_v1() { return total_size; } -size_t InvertedIndexFileWriter::write_v2() { +int64_t InvertedIndexFileWriter::write_v2() { // Create the output stream to write the compound file int64_t current_offset = headerLength(); io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); + out_dir->set_file_writer_opts(_opts); + std::unique_ptr compound_file_output; // idx v2 writer != nullptr means memtable on sink node now if (_idx_v2_writer != nullptr) { @@ -429,6 +440,7 @@ size_t InvertedIndexFileWriter::write_v2() { _CLDECDELETE(out_dir) auto compound_file_size = compound_file_output->getFilePointer(); compound_file_output->close(); + _file_info.set_index_size(compound_file_size); return compound_file_size; } } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index 0d82504c07fdb1f..2aceb671d809a77 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -19,6 +19,7 @@ #include // IWYU pragma: keep #include +#include #include #include @@ -40,7 +41,7 @@ using InvertedIndexDirectoryMap = class FileInfo { public: std::string filename; - int32_t filesize; + int64_t filesize; }; class InvertedIndexFileWriter { @@ -60,17 +61,20 @@ class InvertedIndexFileWriter { Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); ~InvertedIndexFileWriter() = default; - size_t write_v2(); - size_t write_v1(); + int64_t write_v2(); + int64_t write_v1(); Status close(); - size_t headerLength(); - size_t get_index_file_size() const { return _file_size; } + int64_t headerLength(); + InvertedIndexFileInfo get_index_file_info() const { return 
_file_info; } + int64_t get_index_file_total_size() const { return _total_file_size; } const io::FileSystemSPtr& get_fs() const { return _fs; } void sort_files(std::vector& file_infos); void copyFile(const char* fileName, lucene::store::Directory* dir, lucene::store::IndexOutput* output, uint8_t* buffer, int64_t bufferLength); InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; } + void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } + private: InvertedIndexDirectoryMap _indices_dirs; const io::FileSystemSPtr _fs; @@ -78,9 +82,14 @@ class InvertedIndexFileWriter { std::string _rowset_id; int64_t _seg_id; InvertedIndexStorageFormatPB _storage_format; - size_t _file_size = 0; + // v1: all file size + // v2: file size + int64_t _total_file_size = 0; // write to disk or stream io::FileWriterPtr _idx_v2_writer; + io::FileWriterOptions _opts; + + InvertedIndexFileInfo _file_info; }; } // namespace segment_v2 -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp index 0443bf345ba1d67..f752c5300204de7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp @@ -84,9 +84,6 @@ namespace doris::segment_v2 { const char* const DorisFSDirectory::WRITE_LOCK_FILE = "write.lock"; class DorisFSDirectory::FSIndexOutput : public lucene::store::BufferedIndexOutput { -private: - io::FileWriterPtr _writer; - protected: void flushBuffer(const uint8_t* b, const int32_t size) override; @@ -96,6 +93,12 @@ class DorisFSDirectory::FSIndexOutput : public lucene::store::BufferedIndexOutpu ~FSIndexOutput() override; void close() override; int64_t length() const override; + + void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } + +private: + io::FileWriterPtr _writer; + io::FileWriterOptions _opts; }; class DorisFSDirectory::FSIndexOutputV2 : public lucene::store::BufferedIndexOutput { @@ -115,7 +118,7 @@ class DorisFSDirectory::FSIndexOutputV2 : public lucene::store::BufferedIndexOut bool DorisFSDirectory::FSIndexInput::open(const io::FileSystemSPtr& fs, const char* path, IndexInput*& ret, CLuceneError& error, - int32_t buffer_size) { + int32_t buffer_size, int64_t file_size) { CND_PRECONDITION(path != nullptr, "path is NULL"); if (buffer_size == -1) { @@ -127,21 +130,26 @@ bool DorisFSDirectory::FSIndexInput::open(const io::FileSystemSPtr& fs, const ch reader_options.cache_type = config::enable_file_cache ? 
io::FileCachePolicy::FILE_BLOCK_CACHE : io::FileCachePolicy::NO_CACHE; reader_options.is_doris_table = true; + reader_options.file_size = file_size; Status st = fs->open_file(path, &h->_reader, &reader_options); DBUG_EXECUTE_IF("inverted file read error: index file not found", { st = Status::Error("index file not found"); }) if (st.code() == ErrorCode::NOT_FOUND) { - error.set(CL_ERR_FileNotFound, "File does not exist"); + error.set(CL_ERR_FileNotFound, fmt::format("File does not exist, file is {}", path).data()); } else if (st.code() == ErrorCode::IO_ERROR) { - error.set(CL_ERR_IO, "File open io error"); + error.set(CL_ERR_IO, fmt::format("File open io error, file is {}", path).data()); } else if (st.code() == ErrorCode::PERMISSION_DENIED) { - error.set(CL_ERR_IO, "File Access denied"); - } else { - error.set(CL_ERR_IO, "Could not open file"); + error.set(CL_ERR_IO, fmt::format("File Access denied, file is {}", path).data()); + } else if (!st.ok()) { + error.set(CL_ERR_IO, fmt::format("Could not open file, file is {}", path).data()); } //Check if a valid handle was retrieved if (st.ok() && h->_reader) { + if (h->_reader->size() == 0) { + // may be an empty file + LOG(INFO) << "Opened inverted index file is empty, file is " << path; + } //Store the file length h->_length = h->_reader->size(); h->_fpos = 0; @@ -242,7 +250,13 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len) } void DorisFSDirectory::FSIndexOutput::init(const io::FileSystemSPtr& fs, const char* path) { - Status status = fs->create_file(path, &_writer); + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput::init.file_cache", { + if (fs->type() == io::FileSystemType::S3 && _opts.write_file_cache == false) { + _CLTHROWA(CL_ERR_IO, "Inverted index failed to enter file cache"); + } + }); + + Status status = fs->create_file(path, &_writer, &_opts); DBUG_EXECUTE_IF( "DorisFSDirectory::FSIndexOutput._throw_clucene_error_in_fsindexoutput_" "init", @@ -579,6 +593,7 @@ lucene::store::IndexOutput* DorisFSDirectory::createOutput(const char* name) { assert(!exists); } auto* ret = _CLNEW FSIndexOutput(); + ret->set_file_writer_opts(_opts); try { ret->init(_fs, fl); } catch (CLuceneError& err) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h index b3e0352d7adf91b..59ae6db1a9630d3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h @@ -29,6 +29,7 @@ #include "CLucene/SharedHeader.h" #include "io/fs/file_reader_writer_fwd.h" #include "io/fs/file_system.h" +#include "io/fs/file_writer.h" #include "io/io_common.h" class CLuceneError; @@ -46,8 +47,6 @@ class CLUCENE_EXPORT DorisFSDirectory : public lucene::store::Directory { public: static const char* const WRITE_LOCK_FILE; static const int64_t MAX_HEADER_DATA_SIZE = 1024 * 128; // 128k -private: - int filemode; protected: mutable std::mutex _this_lock; @@ -91,6 +90,12 @@ class CLUCENE_EXPORT DorisFSDirectory : public lucene::store::Directory { virtual void init(const io::FileSystemSPtr& fs, const char* path, lucene::store::LockFactory* lock_factory = nullptr); + + void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } + +private: + int32_t filemode; + io::FileWriterOptions _opts; }; class CLUCENE_EXPORT DorisRAMFSDirectory : public DorisFSDirectory { @@ -184,7 +189,7 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput public: static bool 
open(const io::FileSystemSPtr& fs, const char* path, IndexInput*& ret, - CLuceneError& error, int32_t bufferSize = -1); + CLuceneError& error, int32_t bufferSize = -1, int64_t file_size = -1); ~FSIndexInput() override; IndexInput* clone() const override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index d89d089de3bf0fe..609a41a5f99f060 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -187,8 +187,10 @@ void InvertedIndexReader::get_analyse_result(std::vector& analyse_r } } -Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, +Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, + InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir) { + SCOPED_RAW_TIMER(&stats->inverted_index_query_null_bitmap_timer); lucene::store::IndexInput* null_bitmap_in = nullptr; bool owned_dir = false; try { @@ -203,6 +205,13 @@ Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cach if (!dir) { // TODO: ugly code here, try to refact. + bool open_idx_file_cache = true; + auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, + open_idx_file_cache); + if (!st.ok()) { + LOG(WARNING) << st; + return st; + } auto directory = DORIS_TRY(_inverted_index_file_reader->open(&_index_meta)); dir = directory.release(); owned_dir = true; @@ -244,19 +253,28 @@ Status InvertedIndexReader::handle_searcher_cache( InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); if (InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, inverted_index_cache_handle)) { + stats->inverted_index_searcher_cache_hit++; return Status::OK(); } else { // searcher cache miss + stats->inverted_index_searcher_cache_miss++; auto mem_tracker = std::make_unique("InvertedIndexSearcherCacheWithRead"); SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); IndexSearcherPtr searcher; + bool open_idx_file_cache = true; + auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, + open_idx_file_cache); + if (!st.ok()) { + LOG(WARNING) << st; + return st; + } auto dir = DORIS_TRY(_inverted_index_file_reader->open(&_index_meta)); // try to reuse index_searcher's directory to read null_bitmap to cache // to avoid open directory additionally for null_bitmap // TODO: handle null bitmap procedure in new format. 
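For reference, the searcher-cache path touched above follows a classic lookup-or-build shape: count a hit and reuse the shared searcher, or count a miss, build once under a lock, and insert. A minimal standalone sketch of that shape (SimpleCache and all its names are illustrative, not the Doris InvertedIndexSearcherCache API):

    #include <cstdint>
    #include <functional>
    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    // Hypothetical lookup-or-build cache keyed by index file path.
    template <typename V>
    class SimpleCache {
    public:
        std::shared_ptr<V> lookup_or_build(const std::string& key,
                                           const std::function<std::shared_ptr<V>()>& build,
                                           int64_t* hits, int64_t* misses) {
            std::lock_guard<std::mutex> l(_mu);
            auto it = _map.find(key);
            if (it != _map.end()) {
                ++*hits; // cache hit: hand back the shared value
                return it->second;
            }
            ++*misses; // cache miss: build once, then share
            auto v = build();
            _map.emplace(key, v);
            return v;
        }

    private:
        std::mutex _mu;
        std::unordered_map<std::string, std::shared_ptr<V>> _map;
    };

The real cache additionally charges the searcher's memory to a MemTracker and supports eviction, which the sketch omits.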
         InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
-        static_cast<void>(read_null_bitmap(&null_bitmap_cache_handle, dir.get()));
+        static_cast<void>(read_null_bitmap(stats, &null_bitmap_cache_handle, dir.get()));
         RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, mem_tracker.get(), type()));
         auto* cache_value = new InvertedIndexSearcherCache::CacheValue(
                 std::move(searcher), mem_tracker->consumption(), UnixMillis());
@@ -284,6 +302,27 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir,
     return Status::OK();
 };
 
+Status InvertedIndexReader::match_index_search(
+        OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type,
+        const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher,
+        const std::shared_ptr& term_match_bitmap) {
+    TQueryOptions queryOptions = runtime_state->query_options();
+    try {
+        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+        auto query = QueryFactory::create(query_type, index_searcher, queryOptions);
+        if (!query) {
+            return Status::Error(
+                    "query type " + query_type_to_string(query_type) + ", query is nullptr");
+        }
+        query->add(query_info);
+        query->search(*term_match_bitmap);
+    } catch (const CLuceneError& e) {
+        return Status::Error("CLuceneError occurred: {}",
+                             e.what());
+    }
+    return Status::OK();
+}
+
 Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state,
                                          std::unique_ptr<InvertedIndexIterator>* iterator) {
     *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this());
@@ -384,27 +423,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
     }
 }
 
-Status FullTextIndexReader::match_index_search(
-        OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type,
-        const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher,
-        const std::shared_ptr& term_match_bitmap) {
-    TQueryOptions queryOptions = runtime_state->query_options();
-    try {
-        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-        auto query = QueryFactory::create(query_type, index_searcher, queryOptions);
-        if (!query) {
-            return Status::Error(
-                    "query type " + query_type_to_string(query_type) + ", query is nullptr");
-        }
-        query->add(query_info);
-        query->search(*term_match_bitmap);
-    } catch (const CLuceneError& e) {
-        return Status::Error("CLuceneError occured: {}",
-                             e.what());
-    }
-    return Status::OK();
-}
-
 InvertedIndexReaderType FullTextIndexReader::type() {
     return InvertedIndexReaderType::FULLTEXT;
 }
@@ -461,28 +479,25 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
     std::string search_str(search_query->data, act_len);
     VLOG_DEBUG << "begin to query the inverted index from clucene"
                << ", column_name: " << column_name << ", search_str: " << search_str;
-    std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
-    std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
-    // unique_ptr with custom deleter
-    std::unique_ptr term {
-            _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()),
-            [](lucene::index::Term* term) { _CLDECDELETE(term); }};
-    std::unique_ptr query;
 
     auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta);
-
     // try to get query bitmap result from cache and return immediately on cache hit
     InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type,
                                                  search_str};
     auto* cache =
InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { return Status::OK(); } - roaring::Roaring result; + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); + + InvertedIndexQueryInfo query_info; + query_info.field_name = column_name_ws; + query_info.terms.emplace_back(search_str); + + auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); @@ -494,33 +509,29 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: case InvertedIndexQueryType::EQUAL_QUERY: { - query = std::make_unique(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - (*searcher_ptr)->_search(query.get(), [&result](DocRange* doc_range) { - if (doc_range->type_ == DocRangeType::kMany) { - result.addMany(doc_range->doc_many_size_, doc_range->doc_many->data()); - } else { - result.addRange(doc_range->doc_range.first, doc_range->doc_range.second); - } - }); + RETURN_IF_ERROR(match_index_search(stats, runtime_state, + InvertedIndexQueryType::MATCH_ANY_QUERY, + query_info, *searcher_ptr, result)); break; } - case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { - query = std::make_unique(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - (*searcher_ptr) - ->_search(query.get(), - [&result](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - result.add(docid); - }); + case InvertedIndexQueryType::MATCH_PHRASE_QUERY: + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { + RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, + *searcher_ptr, result)); break; } - case InvertedIndexQueryType::LESS_THAN_QUERY: case InvertedIndexQueryType::LESS_EQUAL_QUERY: case InvertedIndexQueryType::GREATER_THAN_QUERY: case InvertedIndexQueryType::GREATER_EQUAL_QUERY: { + std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); + // unique_ptr with custom deleter + std::unique_ptr term { + _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), + [](lucene::index::Term* term) { _CLDECDELETE(term); }}; + std::unique_ptr query; + bool include_upper = query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY; bool include_lower = query_type == InvertedIndexQueryType::GREATER_EQUAL_QUERY; @@ -537,7 +548,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, (*searcher_ptr) ->_search(query.get(), [&result](const int32_t docid, const float_t /*score*/) { - result.add(docid); + result->add(docid); }); break; } @@ -560,12 +571,10 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, } // add to cache - std::shared_ptr term_match_bitmap = - std::make_shared(result); - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handler); + result->runOptimize(); + cache->insert(cache_key, result, &cache_handler); - bit_map = term_match_bitmap; + bit_map = result; } return Status::OK(); } @@ -1196,6 +1205,9 @@ lucene::util::bkd::relation InvertedIndexVisitor::compare(std::vector& bit_map, bool skip_try) { + DBUG_EXECUTE_IF("return_inverted_index_bypass", { 
+ return Status::Error("inverted index bypass"); + }); if (UNLIKELY(_reader == nullptr)) { throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 92df87361c89cfa..2377a91845fc4d1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -93,7 +93,8 @@ class InvertedIndexReader : public std::enable_shared_from_this& term_match_bitmap); + friend class InvertedIndexIterator; std::shared_ptr _inverted_index_file_reader; TabletIndex _index_meta; @@ -177,13 +184,6 @@ class FullTextIndexReader : public InvertedIndexReader { const std::map& properties); static void setup_analyzer_use_stopwords(std::unique_ptr& analyzer, const std::map& properties); - -private: - Status match_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, - InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, - const FulltextIndexSearcherPtr& index_searcher, - const std::shared_ptr& term_match_bitmap); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { @@ -373,7 +373,7 @@ class InvertedIndexIterator { Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr) { - return _reader->read_null_bitmap(cache_handle, dir); + return _reader->read_null_bitmap(_stats, cache_handle, dir); } [[nodiscard]] InvertedIndexReaderType get_inverted_index_reader_type() const; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index eea9179a2cf4b62..b491c97e63c799b 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -309,6 +309,11 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } + Status add_array_nulls(uint32_t row_id) override { + _null_bitmap.add(row_id); + return Status::OK(); + } + void new_inverted_index_field(const char* field_value_data, size_t field_value_size) { if (_parser_type != InvertedIndexParserType::PARSER_UNKNOWN && _parser_type != InvertedIndexParserType::PARSER_NONE) { @@ -357,7 +362,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { _rid++; } } else if constexpr (field_is_numeric_type(field_type)) { - add_numeric_values(values, count); + RETURN_IF_ERROR(add_numeric_values(values, count)); } return Status::OK(); } @@ -421,6 +426,23 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { } } start_off += array_elem_size; + // here to make debug for array field with current doc which should has expected number of fields + DBUG_EXECUTE_IF("array_inverted_index.write_index", { + auto single_array_field_count = + DebugPoints::instance()->get_debug_param_or_default( + "array_inverted_index.write_index", "single_array_field_count", + 0); + if (single_array_field_count < 0) { + return Status::Error( + "indexes count cannot be negative"); + } + if (_doc->getFields()->size() != single_array_field_count) { + return Status::Error( + "array field has fields count {} not equal to expected {}", + _doc->getFields()->size(), single_array_field_count); + } + }) + if (!_doc->getFields()->empty()) { // if this array is null, we just ignore to write inverted index RETURN_IF_ERROR(add_document()); @@ -450,11 +472,7 @@ class InvertedIndexColumnWriterImpl : public 
InvertedIndexColumnWriter { continue; } const CppType* p = &reinterpret_cast(value_ptr)[j]; - std::string new_value; - size_t value_length = sizeof(CppType); - - _value_key_coder->full_encode_ascending(p, &new_value); - _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid); + RETURN_IF_ERROR(add_value(*p)); } start_off += array_elem_size; _row_ids_seen_for_bkd++; @@ -499,11 +517,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { if (values->is_null_at(j)) { // bkd do not index null values, so we do nothing here. } else { - std::string new_value; - size_t value_length = sizeof(CppType); - - _value_key_coder->full_encode_ascending(p, &new_value); - _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid); + RETURN_IF_ERROR(add_value(*p)); } item_data_ptr = (uint8_t*)item_data_ptr + field_size; } @@ -515,23 +529,33 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } - void add_numeric_values(const void* values, size_t count) { + Status add_numeric_values(const void* values, size_t count) { auto p = reinterpret_cast(values); for (size_t i = 0; i < count; ++i) { - add_value(*p); + RETURN_IF_ERROR(add_value(*p)); + _rid++; p++; _row_ids_seen_for_bkd++; } + return Status::OK(); } - void add_value(const CppType& value) { - std::string new_value; - size_t value_length = sizeof(CppType); + Status add_value(const CppType& value) { + try { + std::string new_value; + size_t value_length = sizeof(CppType); - _value_key_coder->full_encode_ascending(&value, &new_value); - _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid); + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_value_bkd_writer_add_throw_error", { + _CLTHROWA(CL_ERR_IllegalArgument, ("packedValue should be length=xxx")); + }); - _rid++; + _value_key_coder->full_encode_ascending(&value, &new_value); + _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid); + } catch (const CLuceneError& e) { + return Status::Error( + "CLuceneError add_value: {}", e.what()); + } + return Status::OK(); } int64_t size() const override { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index 9a071f5bfdda8da..63c1e219e649e81 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -64,6 +64,7 @@ class InvertedIndexColumnWriter { size_t count) = 0; virtual Status add_nulls(uint32_t count) = 0; + virtual Status add_array_nulls(uint32_t row_id) = 0; virtual Status finish() = 0; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 8d862528de608a8..2666fc8b633e1ab 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -20,15 +20,16 @@ #include #include #include -#include -#include +#include #include #include #include "common/logging.h" #include "common/status.h" +#include "cpp/sync_point.h" #include "io/cache/block_file_cache.h" +#include "io/cache/block_file_cache_factory.h" #include "io/fs/file_reader.h" #include "io/fs/file_system.h" #include "io/io_common.h" @@ -49,17 +50,13 @@ #include "olap/rowset/segment_v2/segment_writer.h" // k_segment_magic_length #include "olap/schema.h" #include "olap/short_key_index.h" -#include "olap/storage_engine.h" #include "olap/tablet_schema.h" #include "olap/types.h" #include "olap/utils.h" -#include "runtime/define_primitive_type.h" #include 
"runtime/exec_env.h" -#include "runtime/memory/mem_tracker.h" #include "runtime/query_context.h" #include "runtime/runtime_predicate.h" #include "runtime/runtime_state.h" -#include "util/bvar_helper.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/slice.h" // Slice @@ -77,25 +74,71 @@ namespace doris::segment_v2 { static bvar::Adder g_total_segment_num("doris_total_segment_num"); class InvertedIndexIterator; +io::UInt128Wrapper file_cache_key_from_path(const std::string& seg_path) { + std::string base = seg_path.substr(seg_path.rfind('/') + 1); // tricky: npos + 1 == 0 + return io::BlockFileCache::hash(base); +} + +std::string file_cache_key_str(const std::string& seg_path) { + return file_cache_key_from_path(seg_path).to_string(); +} + Status Segment::open(io::FileSystemSPtr fs, const std::string& path, uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, - const io::FileReaderOptions& reader_options, - std::shared_ptr* output) { + const io::FileReaderOptions& reader_options, std::shared_ptr* output, + InvertedIndexFileInfo idx_file_info) { io::FileReaderSPtr file_reader; RETURN_IF_ERROR(fs->open_file(path, &file_reader, &reader_options)); - std::shared_ptr segment(new Segment(segment_id, rowset_id, std::move(tablet_schema))); - segment->_fs = std::move(fs); + std::shared_ptr segment( + new Segment(segment_id, rowset_id, std::move(tablet_schema), idx_file_info)); + segment->_fs = fs; segment->_file_reader = std::move(file_reader); - RETURN_IF_ERROR(segment->_open()); + auto st = segment->_open(); + TEST_INJECTION_POINT_CALLBACK("Segment::open:corruption", &st); + if (st.is() && + reader_options.cache_type == io::FileCachePolicy::FILE_BLOCK_CACHE) { + LOG(WARNING) << "bad segment file may be read from file cache, try to read remote source " + "file directly, file path: " + << path << " cache_key: " << file_cache_key_str(path); + auto file_key = file_cache_key_from_path(path); + auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key); + file_cache->remove_if_cached(file_key); + + RETURN_IF_ERROR(fs->open_file(path, &file_reader, &reader_options)); + segment->_file_reader = std::move(file_reader); + st = segment->_open(); + TEST_INJECTION_POINT_CALLBACK("Segment::open:corruption1", &st); + if (st.is()) { // corrupt again + LOG(WARNING) << "failed to try to read remote source file again with cache support," + << " try to read from remote directly, " + << " file path: " << path << " cache_key: " << file_cache_key_str(path); + file_cache = io::FileCacheFactory::instance()->get_by_path(file_key); + file_cache->remove_if_cached(file_key); + + io::FileReaderOptions opt = reader_options; + opt.cache_type = io::FileCachePolicy::NO_CACHE; // skip cache + RETURN_IF_ERROR(fs->open_file(path, &file_reader, &opt)); + segment->_file_reader = std::move(file_reader); + st = segment->_open(); + if (!st.ok()) { + LOG(WARNING) << "failed to try to read remote source file directly," + << " file path: " << path + << " cache_key: " << file_cache_key_str(path); + } + } + } + RETURN_IF_ERROR(st); *output = std::move(segment); return Status::OK(); } -Segment::Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema) +Segment::Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, + InvertedIndexFileInfo idx_file_info) : _segment_id(segment_id), _meta_mem_usage(0), _rowset_id(rowset_id), - _tablet_schema(std::move(tablet_schema)) { + _tablet_schema(std::move(tablet_schema)), + _idx_file_info(idx_file_info) { 
     g_total_segment_num << 1;
 }
 
@@ -139,15 +182,8 @@ Status Segment::_open_inverted_index() {
             _fs, std::string {InvertedIndexDescriptor::get_index_file_path_prefix(
                     _file_reader->path().native())},
-            _tablet_schema->get_inverted_index_storage_format());
-    bool open_idx_file_cache = true;
-    auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size,
-                                                open_idx_file_cache);
-    if (st.is()) {
-        LOG(INFO) << st;
-        return Status::OK();
-    }
-    return st;
+            _tablet_schema->get_inverted_index_storage_format(), _idx_file_info);
+    return Status::OK();
 }
 
 Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_options,
@@ -256,8 +292,9 @@ Status Segment::_parse_footer(SegmentFooterPB* footer) {
     // Footer := SegmentFooterPB, FooterPBSize(4), FooterPBChecksum(4), MagicNumber(4)
     auto file_size = _file_reader->size();
     if (file_size < 12) {
-        return Status::Corruption("Bad segment file {}: file size {} < 12",
-                                  _file_reader->path().native(), file_size);
+        return Status::Corruption("Bad segment file {}: file size {} < 12, cache_key: {}",
+                                  _file_reader->path().native(), file_size,
+                                  file_cache_key_str(_file_reader->path().native()));
     }
 
     uint8_t fixed_buf[12];
@@ -269,15 +306,18 @@ Status Segment::_parse_footer(SegmentFooterPB* footer) {
     DCHECK_EQ(bytes_read, 12);
 
     if (memcmp(fixed_buf + 8, k_segment_magic, k_segment_magic_length) != 0) {
-        return Status::Corruption("Bad segment file {}: magic number not match",
-                                  _file_reader->path().native());
+        return Status::Corruption(
+                "Bad segment file {}: file_size: {}, magic number not match, cache_key: {}",
+                _file_reader->path().native(), file_size,
+                file_cache_key_str(_file_reader->path().native()));
     }
 
     // read footer PB
     uint32_t footer_length = decode_fixed32_le(fixed_buf);
     if (file_size < 12 + footer_length) {
-        return Status::Corruption("Bad segment file {}: file size {} < {}",
-                                  _file_reader->path().native(), file_size, 12 + footer_length);
+        return Status::Corruption("Bad segment file {}: file size {} < {}, cache_key: {}",
+                                  _file_reader->path().native(), file_size, 12 + footer_length,
+                                  file_cache_key_str(_file_reader->path().native()));
     }
 
     std::string footer_buf;
@@ -291,14 +331,18 @@
     uint32_t actual_checksum = crc32c::Value(footer_buf.data(), footer_buf.size());
     if (actual_checksum != expect_checksum) {
         return Status::Corruption(
-                "Bad segment file {}: footer checksum not match, actual={} vs expect={}",
-                _file_reader->path().native(), actual_checksum, expect_checksum);
+                "Bad segment file {}: file_size = {}, footer checksum not match, actual={} "
+                "vs expect={}, cache_key: {}",
+                _file_reader->path().native(), file_size, actual_checksum, expect_checksum,
+                file_cache_key_str(_file_reader->path().native()));
     }
 
     // deserialize footer PB
     if (!footer->ParseFromString(footer_buf)) {
-        return Status::Corruption("Bad segment file {}: failed to parse SegmentFooterPB",
-                                  _file_reader->path().native());
+        return Status::Corruption(
+                "Bad segment file {}: file_size = {}, failed to parse SegmentFooterPB, "
+                "cache_key: {}",
+                _file_reader->path().native(), file_size,
+                file_cache_key_str(_file_reader->path().native()));
     }
     return Status::OK();
 }
@@ -354,7 +398,7 @@ Status Segment::load_index() {
 Status Segment::_load_index_impl() {
     return _load_index_once.call([this] {
         if (_tablet_schema->keys_type() == UNIQUE_KEYS && _pk_index_meta != nullptr) {
-            _pk_index_reader.reset(new PrimaryKeyIndexReader());
+            _pk_index_reader = std::make_unique<PrimaryKeyIndexReader>();
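The surrounding _load_index_impl body runs under _load_index_once, so concurrent readers trigger exactly one index load and then share its Status. A minimal sketch of such a status-caching call-once wrapper (not the DorisCallOnce implementation; assumes StatusT is default-constructible and copyable):

    #include <mutex>
    #include <utility>

    template <typename StatusT>
    class CallOnce {
    public:
        // The first caller runs fn and caches its result; later callers just
        // read the cache. If fn throws, std::call_once treats the attempt as
        // not-happened and the next caller retries.
        template <typename Fn>
        StatusT call(Fn&& fn) {
            std::call_once(_flag, [&] { _status = std::forward<Fn>(fn)(); });
            return _status;
        }

    private:
        std::once_flag _flag;
        StatusT _status {};
    };

Callers that race during the first invocation block on the once-flag until the loader finishes, which is the behavior the segment index load relies on.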
RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta)); // _meta_mem_usage += _pk_index_reader->get_memory_size(); return Status::OK(); @@ -379,7 +423,7 @@ Status Segment::_load_index_impl() { DCHECK(footer.has_short_key_page_footer()); // _meta_mem_usage += body.get_size(); - _sk_index_decoder.reset(new ShortKeyIndexDecoder); + _sk_index_decoder = std::make_unique(); return _sk_index_decoder->parse(body, footer.short_key_page_footer()); } }); @@ -391,8 +435,8 @@ vectorized::DataTypePtr Segment::get_data_type_of(vectorized::PathInDataPtr path bool ignore_children) const { // Path has higher priority if (path != nullptr && !path->empty()) { - auto node = _sub_column_tree.find_leaf(*path); - auto sparse_node = _sparse_column_tree.find_exact(*path); + const auto* node = _sub_column_tree.find_leaf(*path); + const auto* sparse_node = _sparse_column_tree.find_exact(*path); if (node) { if (ignore_children || (node->children.empty() && sparse_node == nullptr)) { return node->data.file_column_type; @@ -420,7 +464,7 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { std::unordered_map column_path_to_footer_ordinal; for (uint32_t ordinal = 0; ordinal < footer.columns().size(); ++ordinal) { - auto& column_pb = footer.columns(ordinal); + const auto& column_pb = footer.columns(ordinal); // column path for accessing subcolumns of variant if (column_pb.has_column_path_info()) { vectorized::PathInData path; @@ -435,7 +479,7 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { } // init by unique_id for (uint32_t ordinal = 0; ordinal < _tablet_schema->num_columns(); ++ordinal) { - auto& column = _tablet_schema->column(ordinal); + const auto& column = _tablet_schema->column(ordinal); auto iter = column_id_to_footer_ordinal.find(column.unique_id()); if (iter == column_id_to_footer_ordinal.end()) { continue; @@ -452,7 +496,7 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { // init by column path for (uint32_t ordinal = 0; ordinal < _tablet_schema->num_columns(); ++ordinal) { - auto& column = _tablet_schema->column(ordinal); + const auto& column = _tablet_schema->column(ordinal); if (!column.has_path_info()) { continue; } @@ -475,7 +519,7 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); // init sparse columns paths and type info for (uint32_t ordinal = 0; ordinal < column_pb.sparse_columns().size(); ++ordinal) { - auto& spase_column_pb = column_pb.sparse_columns(ordinal); + const auto& spase_column_pb = column_pb.sparse_columns(ordinal); if (spase_column_pb.has_column_path_info()) { vectorized::PathInData path; path.from_protobuf(spase_column_pb.column_path_info()); @@ -495,7 +539,10 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { static Status new_default_iterator(const TabletColumn& tablet_column, std::unique_ptr* iter) { if (!tablet_column.has_default_value() && !tablet_column.is_nullable()) { - return Status::InternalError("invalid nonexistent column without default value."); + return Status::InternalError( + "invalid nonexistent column without default value. 
column_uid={}, column_name={}, " + "column_type={}", + tablet_column.unique_id(), tablet_column.name(), tablet_column.type()); } auto type_info = get_type_info(&tablet_column); std::unique_ptr default_value_iter(new DefaultValueColumnIterator( @@ -515,7 +562,7 @@ Status Segment::_new_iterator_with_variant_root(const TabletColumn& tablet_colum vectorized::DataTypePtr target_type_hint) { ColumnIterator* it; RETURN_IF_ERROR(root->data.reader->new_iterator(&it)); - auto stream_iter = new ExtractReader( + auto* stream_iter = new ExtractReader( tablet_column, std::make_unique(root->data.file_column_type->create_column(), std::unique_ptr(it), @@ -535,13 +582,14 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, } else { root_path = vectorized::PathInData({tablet_column.path_info_ptr()->get_parts()[0]}); } - auto root = _sub_column_tree.find_leaf(root_path); - auto node = tablet_column.has_path_info() - ? _sub_column_tree.find_exact(*tablet_column.path_info_ptr()) - : nullptr; - auto sparse_node = tablet_column.has_path_info() - ? _sparse_column_tree.find_exact(*tablet_column.path_info_ptr()) + const auto* root = _sub_column_tree.find_leaf(root_path); + const auto* node = tablet_column.has_path_info() + ? _sub_column_tree.find_exact(*tablet_column.path_info_ptr()) : nullptr; + const auto* sparse_node = + tablet_column.has_path_info() + ? _sparse_column_tree.find_exact(*tablet_column.path_info_ptr()) + : nullptr; // Currently only compaction and checksum need to read flat leaves // They both use tablet_schema_with_merged_max_schema_version as read schema @@ -620,7 +668,7 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column, return new_column_iterator_with_path(tablet_column, iter, opt); } // init default iterator - if (_column_readers.count(tablet_column.unique_id()) < 1) { + if (!_column_readers.contains(tablet_column.unique_id())) { RETURN_IF_ERROR(new_default_iterator(tablet_column, iter)); return Status::OK(); } @@ -652,7 +700,7 @@ Status Segment::new_column_iterator(int32_t unique_id, std::unique_ptrdata.reader.get(); @@ -660,7 +708,7 @@ ColumnReader* Segment::_get_column_reader(const TabletColumn& col) { return nullptr; } auto col_unique_id = col.unique_id(); - if (_column_readers.count(col_unique_id) > 0) { + if (_column_readers.contains(col_unique_id)) { return _column_readers[col_unique_id].get(); } return nullptr; @@ -777,7 +825,7 @@ Status Segment::lookup_row_key(const Slice& key, bool with_seq_col, bool with_ro sought_key.get_data() + sought_key_without_seq.get_size() + seq_col_length + 1, rowid_length - 1); const auto* type_info = get_scalar_type_info(); - auto rowid_coder = get_key_coder(type_info->type()); + const auto* rowid_coder = get_key_coder(type_info->type()); RETURN_IF_ERROR(rowid_coder->decode_ascending(&rowid_slice, rowid_length, (uint8_t*)&row_location->row_id)); } diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 2baeadcaf076a1a..dd61e7eb8312071 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -82,7 +82,7 @@ class Segment : public std::enable_shared_from_this { static Status open(io::FileSystemSPtr fs, const std::string& path, uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, const io::FileReaderOptions& reader_options, - std::shared_ptr* output); + std::shared_ptr* output, InvertedIndexFileInfo idx_file_info = {}); static io::UInt128Wrapper file_cache_key(std::string_view rowset_id, uint32_t 
seg_id); io::UInt128Wrapper file_cache_key() const { @@ -195,7 +195,8 @@ class Segment : public std::enable_shared_from_this { private: DISALLOW_COPY_AND_ASSIGN(Segment); - Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema); + Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, + InvertedIndexFileInfo idx_file_info = InvertedIndexFileInfo()); // open segment file and read the minimum amount of necessary information (footer) Status _open(); Status _parse_footer(SegmentFooterPB* footer); @@ -271,6 +272,8 @@ class Segment : public std::enable_shared_from_this { // inverted index file reader std::shared_ptr _inverted_index_file_reader; DorisCallOnce _inverted_index_file_reader_open; + + InvertedIndexFileInfo _idx_file_info; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 7c9df76eb7868b4..8fa1a81540a8c60 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -390,6 +390,14 @@ void SegmentIterator::_initialize_predicate_results() { _column_predicate_inverted_index_status[cid][pred_sign] = false; } } + + // Initialize from _func_name_to_result_sign + for (auto& iter : _func_name_to_result_sign) { + for (auto& pred_sign : iter.second) { + auto column_id = _opts.tablet_schema->field_index(iter.first); + _column_predicate_inverted_index_status[column_id][pred_sign] = false; + } + } } Status SegmentIterator::init_iterators() { @@ -577,7 +585,6 @@ Status SegmentIterator::_get_row_ranges_by_column_conditions() { RETURN_IF_ERROR(_apply_inverted_index()); for (auto cid : _schema->column_ids()) { bool result_true = _check_all_predicates_passed_inverted_index_for_column(cid); - if (result_true) { _need_read_data_indices[cid] = false; } @@ -699,6 +706,11 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) { RowRanges dict_row_ranges = RowRanges::create_single(num_rows()); for (auto cid : cids) { + if (!_segment->can_apply_predicate_safely(cid, + _opts.col_id_to_predicates.at(cid).get(), + *_schema, _opts.io_ctx.reader_type)) { + continue; + } RowRanges tmp_row_ranges = RowRanges::create_single(num_rows()); DCHECK(_opts.col_id_to_predicates.count(cid) > 0); RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict( @@ -894,8 +906,7 @@ bool SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() { return false; } } - for (const auto& func_expr_pair : compound_func_exprs) { - const auto& expr = func_expr_pair.first; + for (const auto& expr : compound_func_exprs) { std::string pred_result_sign = BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); if (!_rowid_result_for_index.contains(pred_result_sign)) { @@ -1018,14 +1029,20 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() { } } - for (const auto& func_expr_pair : compound_func_exprs) { - const auto& expr = func_expr_pair.first; - const auto& expr_ctx = func_expr_pair.second; + for (const auto& expr : compound_func_exprs) { + roaring::Roaring bitmap = _row_bitmap; auto result = std::make_shared(); - RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result)); + RETURN_IF_ERROR(execute_func_expr(expr, result)); + bitmap &= *result; std::string result_sign = BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); - _rowid_result_for_index.emplace(result_sign, 
std::make_pair(true, std::move(*result))); + _rowid_result_for_index.emplace(result_sign, std::make_pair(true, std::move(bitmap))); + for (const auto& child_expr : expr->children()) { + if (child_expr->node_type() == TExprNodeType::type::SLOT_REF) { + auto column_id = _opts.tablet_schema->field_index(child_expr->expr_name()); + _column_predicate_inverted_index_status[column_id][result_sign] = true; + } + } } return Status::OK(); @@ -1319,11 +1336,9 @@ Status SegmentIterator::_apply_inverted_index() { } } - for (const auto& func_expr_pair : no_compound_func_exprs) { - const auto& expr = func_expr_pair.first; - const auto& expr_ctx = func_expr_pair.second; + for (const auto& expr : no_compound_func_exprs) { auto result = std::make_shared(); - RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result)); + RETURN_IF_ERROR(execute_func_expr(expr, result)); _row_bitmap &= *result; for (auto it = _remaining_conjunct_roots.begin(); it != _remaining_conjunct_roots.end();) { if (*it == expr) { @@ -1334,6 +1349,14 @@ Status SegmentIterator::_apply_inverted_index() { ++it; } } + std::string result_sign = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); + for (const auto& child_expr : expr->children()) { + if (child_expr->node_type() == TExprNodeType::type::SLOT_REF) { + auto column_id = _opts.tablet_schema->field_index(child_expr->expr_name()); + _column_predicate_inverted_index_status[column_id][result_sign] = true; + } + } } // add a switch for inverted index filter @@ -1470,18 +1493,6 @@ Status SegmentIterator::_init_inverted_index_iterators() { return Status::OK(); } -Status SegmentIterator::_init_inverted_index_iterators(ColumnId cid) { - if (_inverted_index_iterators[cid] == nullptr) { - return _init_single_inverted_index_iterator.call([&] { - return _segment->new_inverted_index_iterator( - _opts.tablet_schema->column(cid), - _segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)), - _opts, &_inverted_index_iterators[cid]); - }); - } - return Status::OK(); -} - Status SegmentIterator::_lookup_ordinal(const RowCursor& key, bool is_include, rowid_t upper_bound, rowid_t* rowid) { if (_segment->_tablet_schema->keys_type() == UNIQUE_KEYS && @@ -2171,11 +2182,17 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ SCOPED_RAW_TIMER(&_opts.stats->vec_cond_ns); bool all_pred_always_true = true; for (const auto& pred : _pre_eval_block_predicate) { - if (!pred->always_true()) { + if (!pred->always_true(false)) { all_pred_always_true = false; break; } } + if (all_pred_always_true) { + for (const auto& pred : _pre_eval_block_predicate) { + pred->always_true(true); + } + } + //If all predicates are always_true, then return directly. 
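The selection loops below replace hard-coded 32-byte AVX2 mask handling with the width-agnostic simd::bits_mask_length() / bytes_mask_to_bits_mask / iterate_through_bits_mask helpers. The bit iteration those helpers wrap is the classic count-trailing-zeros walk; a sketch of the idiom (iterate_bits is illustrative, not the Doris helper, and __builtin_ctzll assumes GCC/Clang):

    #include <cstddef>
    #include <cstdint>

    // Visit the position of every set bit in mask, lowest bit first.
    template <typename Fn>
    void iterate_bits(uint64_t mask, Fn&& visit) {
        while (mask != 0) {
            visit(static_cast<size_t>(__builtin_ctzll(mask))); // lowest set bit
            mask &= mask - 1;                                  // clear that bit
        }
    }

For mask 0b1010 this visits positions 1 and 3, exactly what the old inline `mask & (mask - 1)` loop did; the fast paths for all-zero and all-one masks stay as explicit branches.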
if (all_pred_always_true || !_is_need_vec_eval) { for (uint16_t i = 0; i < selected_size; ++i) { @@ -2189,7 +2206,7 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ DCHECK(!_pre_eval_block_predicate.empty()); bool is_first = true; for (auto& pred : _pre_eval_block_predicate) { - if (pred->always_true()) { + if (pred->always_true(true)) { continue; } auto column_id = pred->column_id(); @@ -2206,23 +2223,21 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ uint32_t sel_pos = 0; const uint32_t sel_end = sel_pos + selected_size; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(_ret_flags.data() + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(_ret_flags.data() + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_pos + i; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_pos + bit_pos; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { sel_rowid_idx[new_size++] = sel_pos + bit_pos; }, + mask); } sel_pos += SIMD_BYTES; } @@ -2692,23 +2707,23 @@ uint16_t SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx, uint16_t new_size = 0; uint32_t sel_pos = 0; const uint32_t sel_end = selected_size; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i]; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { + sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; + }, + mask); } sel_pos += SIMD_BYTES; } @@ -2819,53 +2834,16 @@ Status SegmentIterator::current_block_row_locations(std::vector* bl return Status::OK(); } -/** - * solution 1: where cluase included nodes are all `and` leaf nodes, - * predicate pushed down and remove from vconjunct. - * for example: where A = 1 and B = 'test' and B like '%he%'; - * column A : `A = 1` pushed down, this column's predicates all pushed down, - * call _check_column_pred_all_push_down will return true. - * column B : `B = 'test'` pushed down, but `B like '%he%'` remain in vconjunct, - * call _check_column_pred_all_push_down will return false. - * - * solution 2: where cluase included nodes are compound or other complex conditions, - * predicate pushed down but still remain in vconjunct. - * for exmple: where (A = 1 and B = 'test') or B = 'hi' or (C like '%ye%' and C > 'aa'); - * column A : `A = 1` pushed down, check it applyed by index, - * call _check_column_pred_all_push_down will return true. 
- * column B : `B = 'test'`, `B = 'hi'` all pushed down, check them all applyed by index, - * call _check_column_pred_all_push_down will return true. - * column C : `C like '%ye%'` not pushed down, `C > 'aa'` pushed down, only `C > 'aa'` applyed by index, - * call _check_column_pred_all_push_down will return false. -*/ -bool SegmentIterator::_check_column_pred_all_push_down(const std::string& column_name, - bool in_compound, bool is_match) { - if (_remaining_conjunct_roots.empty()) { - return true; - } - - if (in_compound || is_match) { - auto preds_in_remaining_vconjuct = _column_pred_in_remaining_vconjunct[column_name]; - for (auto pred_info : preds_in_remaining_vconjuct) { - auto column_sign = _gen_predicate_result_sign(&pred_info); - if (!_rowid_result_for_index.contains(column_sign)) { - return false; - } - } - } else { - if (_column_pred_in_remaining_vconjunct[column_name].size() != 0) { - return false; - } - } - return true; -} - void SegmentIterator::_calculate_pred_in_remaining_conjunct_root( const vectorized::VExprSPtr& expr) { if (expr == nullptr) { return; } + if (expr->fn().name.function_name == "multi_match") { + return; + } + auto& children = expr->children(); for (int i = 0; i < children.size(); ++i) { _calculate_pred_in_remaining_conjunct_root(children[i]); @@ -2953,13 +2931,21 @@ void SegmentIterator::_calculate_func_in_remaining_conjunct_root() { bool current_has_compound_pred = has_compound_pred || (expr->node_type() == TExprNodeType::COMPOUND_PRED); - if (expr->node_type() == TExprNodeType::FUNCTION_CALL && - expr->can_push_down_to_index()) { + if (expr->fn().name.function_name == "multi_match") { expr->set_index_unique_id(gen_func_unique_id(expr)); if (current_has_compound_pred) { - compound_func_exprs.emplace_back(expr, root_expr_ctx); + compound_func_exprs.emplace_back(expr); } else { - no_compound_func_exprs.emplace_back(expr, root_expr_ctx); + no_compound_func_exprs.emplace_back(expr); + } + + for (int32_t i = expr->get_num_children() - 1; i >= 0; i--) { + auto child_expr = expr->get_child(i); + if (child_expr->node_type() == TExprNodeType::type::SLOT_REF) { + std::string result_sign = BeConsts::BLOCK_TEMP_COLUMN_PREFIX + + std::to_string(expr->index_unique_id()); + _func_name_to_result_sign[child_expr->expr_name()].push_back(result_sign); + } } } @@ -3048,7 +3034,6 @@ bool SegmentIterator::_can_opt_topn_reads() { } Status SegmentIterator::execute_func_expr(const vectorized::VExprSPtr& expr, - const vectorized::VExprContextSPtr& expr_ctx, std::shared_ptr& result) { const auto& expr0 = expr->get_child(0); if (!expr0 || expr0->node_type() != TExprNodeType::SLOT_REF) { @@ -3061,9 +3046,8 @@ Status SegmentIterator::execute_func_expr(const vectorized::VExprSPtr& expr, params._unique_id = _schema->unique_id(slot_expr->column_id()); params._column_name = _opts.tablet_schema->column(params._column_id).name(); params._segment_iterator = this; - params.result = result; - return expr->eval_inverted_index(expr_ctx.get(), params); + return expr->eval_inverted_index(params, result); } } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index f163376d95fce43..43f0092eb128a47 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -156,7 +156,6 @@ class SegmentIterator : public RowwiseIterator { std::vector>& inverted_index_iterators() { return _inverted_index_iterators; } - [[nodiscard]] Status 
_init_inverted_index_iterators(ColumnId cid); private: Status _next_batch_internal(vectorized::Block* block); @@ -411,7 +410,6 @@ class SegmentIterator : public RowwiseIterator { bool default_return = false); Status execute_func_expr(const vectorized::VExprSPtr& expr, - const vectorized::VExprContextSPtr& expr_ctx, std::shared_ptr& result); class BitmapRangeIterator; @@ -481,9 +479,8 @@ class SegmentIterator : public RowwiseIterator { std::vector _col_predicates; std::vector _col_preds_except_leafnode_of_andnode; - using FuncExprPair = std::pair; - std::vector no_compound_func_exprs; - std::vector compound_func_exprs; + std::vector no_compound_func_exprs; + std::vector compound_func_exprs; vectorized::VExprContextSPtrs _common_expr_ctxs_push_down; bool _enable_common_expr_pushdown = false; @@ -492,6 +489,7 @@ class SegmentIterator : public RowwiseIterator { std::unique_ptr _column_predicate_info; std::unordered_map> _column_pred_in_remaining_vconjunct; + std::unordered_map> _func_name_to_result_sign; std::set _not_apply_index_pred; // row schema of the key to seek @@ -531,8 +529,6 @@ class SegmentIterator : public RowwiseIterator { std::unordered_map> _column_predicate_inverted_index_status; - - DorisCallOnce _init_single_inverted_index_iterator; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index bdfcaba8b8eb99e..2c94942bac08c79 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -21,11 +21,6 @@ #include #include -#include -#include -#include -#include - // IWYU pragma: no_include #include "cloud/config.h" #include "common/compiler_util.h" // IWYU pragma: keep @@ -42,6 +37,7 @@ #include "olap/data_dir.h" #include "olap/key_coder.h" #include "olap/olap_common.h" +#include "olap/partial_update_info.h" #include "olap/primary_key_index.h" #include "olap/row_cursor.h" // RowCursor // IWYU pragma: keep #include "olap/rowset/rowset_writer_context.h" // RowsetWriterContext @@ -82,45 +78,45 @@ using namespace ErrorCode; const char* k_segment_magic = "D0R1"; const uint32_t k_segment_magic_length = 4; +inline std::string segment_mem_tracker_name(uint32_t segment_id) { + return "SegmentWriter:Segment-" + std::to_string(segment_id); +} + SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, - DataDir* data_dir, uint32_t max_row_per_segment, - const SegmentWriterOptions& opts, - std::shared_ptr mow_context, + DataDir* data_dir, const SegmentWriterOptions& opts, io::FileWriterPtr inverted_file_writer) : _segment_id(segment_id), _tablet_schema(std::move(tablet_schema)), _tablet(std::move(tablet)), _data_dir(data_dir), - _max_row_per_segment(max_row_per_segment), _opts(opts), _file_writer(file_writer), - _mem_tracker(std::make_unique("SegmentWriter:Segment-" + - std::to_string(segment_id))), - _mow_context(std::move(mow_context)) { + _mem_tracker(std::make_unique(segment_mem_tracker_name(segment_id))), + _mow_context(std::move(opts.mow_ctx)) { CHECK_NOTNULL(file_writer); - _num_key_columns = _tablet_schema->num_key_columns(); + _num_sort_key_columns = _tablet_schema->num_key_columns(); _num_short_key_columns = _tablet_schema->num_short_key_columns(); - if (_tablet_schema->cluster_key_idxes().empty()) { - DCHECK(_num_key_columns >= _num_short_key_columns) + if (!_is_mow_with_cluster_key()) { + DCHECK(_num_sort_key_columns >= _num_short_key_columns) << ", table_id=" 
<< _tablet_schema->table_id() - << ", num_key_columns=" << _num_key_columns + << ", num_key_columns=" << _num_sort_key_columns << ", num_short_key_columns=" << _num_short_key_columns << ", cluster_key_columns=" << _tablet_schema->cluster_key_idxes().size(); } - for (size_t cid = 0; cid < _num_key_columns; ++cid) { + for (size_t cid = 0; cid < _num_sort_key_columns; ++cid) { const auto& column = _tablet_schema->column(cid); _key_coders.push_back(get_key_coder(column.type())); _key_index_size.push_back(column.index_length()); } - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow()) { // encode the sequence id into the primary key index if (_tablet_schema->has_sequence_col()) { const auto& column = _tablet_schema->column(_tablet_schema->sequence_col_idx()); _seq_coder = get_key_coder(column.type()); } // encode the rowid into the primary key index - if (!_tablet_schema->cluster_key_idxes().empty()) { + if (_is_mow_with_cluster_key()) { const auto* type_info = get_scalar_type_info(); _rowid_coder = get_key_coder(type_info->type()); // primary keys @@ -128,7 +124,7 @@ SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, // cluster keys _key_coders.clear(); _key_index_size.clear(); - _num_key_columns = _tablet_schema->cluster_key_idxes().size(); + _num_sort_key_columns = _tablet_schema->cluster_key_idxes().size(); for (auto cid : _tablet_schema->cluster_key_idxes()) { const auto& column = _tablet_schema->column(cid); _key_coders.push_back(get_key_coder(column.type())); @@ -144,6 +140,8 @@ SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, _opts.rowset_ctx->rowset_id.to_string(), segment_id, _tablet_schema->get_inverted_index_storage_format(), std::move(inverted_file_writer)); + _inverted_index_file_writer->set_file_writer_opts( + _opts.rowset_ctx->get_file_writer_options()); } } @@ -287,14 +285,14 @@ Status SegmentWriter::init(const std::vector& col_ids, bool has_key) { // we don't need the short key index for unique key merge on write table. 
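For merge-on-write tables with cluster keys, the constructor hunk above swaps the coders: the segment is sorted by the cluster key columns (hence the _num_sort_key_columns rename), while _primary_key_coders keeps encoding the unique key for the primary key index, with the sequence column and row id appended. A hedged sketch of how such a primary-key-index entry is composed, inferred from the _full_encode_keys/_encode_seq_column/_encode_rowid calls later in this diff; the marker value, the big-endian row-id encoding, and the function name are assumptions:

#include <cstdint>
#include <string>

constexpr char kKeyNormalMarker = 0x02; // assumed stand-in for KEY_NORMAL_MARKER

// Illustrative only: unique key fields, optional sequence value, then row id.
std::string make_pk_index_entry(const std::string& encoded_unique_key,
                                const std::string& encoded_seq_value, uint32_t row_id) {
    std::string entry = encoded_unique_key; // fields already marker-prefixed by their coders
    if (!encoded_seq_value.empty()) {
        entry.push_back(kKeyNormalMarker);
        entry += encoded_seq_value; // keeps versions of the same key ordered
    }
    entry.push_back(kKeyNormalMarker);
    for (int shift = 24; shift >= 0; shift -= 8) { // big-endian: memcmp order == numeric order
        entry.push_back(static_cast<char>((row_id >> shift) & 0xFF));
    }
    return entry;
}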
if (_has_key) { - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow()) { size_t seq_col_length = 0; if (_tablet_schema->has_sequence_col()) { seq_col_length = _tablet_schema->column(_tablet_schema->sequence_col_idx()).length() + 1; } size_t rowid_length = 0; - if (!_tablet_schema->cluster_key_idxes().empty()) { + if (_is_mow_with_cluster_key()) { rowid_length = PrimaryKeyIndexReader::ROW_ID_LENGTH; _short_key_index_builder.reset( new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); @@ -481,7 +479,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* block->columns(), _tablet_schema->num_key_columns(), _tablet_schema->num_columns())); } - DCHECK(_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write); + DCHECK(_is_mow()); DCHECK(_opts.rowset_ctx->partial_update_info); // find missing column cids @@ -510,7 +508,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* if (!converted_result.first.ok()) { return converted_result.first; } - if (cid < _num_key_columns) { + if (cid < _num_sort_key_columns) { key_columns.push_back(converted_result.second); } else if (_tablet_schema->has_sequence_col() && cid == _tablet_schema->sequence_col_idx()) { @@ -525,16 +523,8 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* bool has_default_or_nullable = false; std::vector use_default_or_null_flag; use_default_or_null_flag.reserve(num_rows); - const vectorized::Int8* delete_sign_column_data = nullptr; - if (const vectorized::ColumnWithTypeAndName* delete_sign_column = - full_block.try_get_by_name(DELETE_SIGN); - delete_sign_column != nullptr) { - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column->column)); - if (delete_sign_col.size() >= row_pos + num_rows) { - delete_sign_column_data = delete_sign_col.get_data().data(); - } - } + const auto* delete_sign_column_data = + BaseTablet::get_delete_sign_column_data(full_block, row_pos + num_rows); std::vector specified_rowsets; { @@ -557,6 +547,9 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* } } std::vector> segment_caches(specified_rowsets.size()); + + PartialUpdateReadPlan read_plan; + // locate rows in base data int64_t num_rows_updated = 0; int64_t num_rows_new_added = 0; @@ -631,7 +624,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* // 1. if the delete sign is marked, it means that the value columns of the row will not // be read. So we don't need to read the missing values from the previous rows. - // 2. the one exception is when there are sequence columns in the table, we need to read + // 2. 
the one exception is when there is a sequence column in the table, we need to read // the sequence columns, otherwise it may cause the merge-on-read based compaction // policy to produce incorrect results if (have_delete_sign && !_tablet_schema->has_sequence_col()) { @@ -641,7 +634,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* // partial update should not contain invisible columns use_default_or_null_flag.emplace_back(false); _rsid_to_rowset.emplace(rowset->rowset_id(), rowset); - _tablet->prepare_to_read(loc, segment_pos, &_rssid_to_rid); + read_plan.prepare_to_read(loc, segment_pos); } if (st.is()) { @@ -665,10 +658,10 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* } // read and fill block - auto mutable_full_columns = full_block.mutate_columns(); - RETURN_IF_ERROR(fill_missing_columns(mutable_full_columns, use_default_or_null_flag, - has_default_or_nullable, segment_start_pos, block)); - full_block.set_columns(std::move(mutable_full_columns)); + RETURN_IF_ERROR(read_plan.fill_missing_columns( + _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, + use_default_or_null_flag, has_default_or_nullable, segment_start_pos, block)); + // convert block to row store format _serialize_block_to_row_column(full_block); @@ -724,134 +717,6 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* return Status::OK(); } -Status SegmentWriter::fill_missing_columns(vectorized::MutableColumns& mutable_full_columns, - const std::vector& use_default_or_null_flag, - bool has_default_or_nullable, - const size_t& segment_start_pos, - const vectorized::Block* block) { - if (config::is_cloud_mode()) { - // TODO(plat1ko): cloud mode - return Status::NotSupported("fill_missing_columns"); - } - // create old value columns - const auto& cids_missing = _opts.rowset_ctx->partial_update_info->missing_cids; - auto old_value_block = _tablet_schema->create_block_by_cids(cids_missing); - CHECK_EQ(cids_missing.size(), old_value_block.columns()); - bool has_row_column = _tablet_schema->has_row_store_for_all_columns(); - // record real pos, key is input line num, value is old_block line num - std::map read_index; - size_t read_idx = 0; - for (auto rs_it : _rssid_to_rid) { - for (auto seg_it : rs_it.second) { - auto rowset = _rsid_to_rowset[rs_it.first]; - CHECK(rowset); - std::vector rids; - for (auto id_and_pos : seg_it.second) { - rids.emplace_back(id_and_pos.rid); - read_index[id_and_pos.pos] = read_idx++; - } - if (has_row_column) { - auto st = _tablet->fetch_value_through_row_column( - rowset, *_tablet_schema, seg_it.first, rids, cids_missing, old_value_block); - if (!st.ok()) { - LOG(WARNING) << "failed to fetch value through row column"; - return st; - } - continue; - } - auto mutable_old_columns = old_value_block.mutate_columns(); - for (size_t cid = 0; cid < mutable_old_columns.size(); ++cid) { - TabletColumn tablet_column = _tablet_schema->column(cids_missing[cid]); - auto st = _tablet->fetch_value_by_rowids(rowset, seg_it.first, rids, tablet_column, - mutable_old_columns[cid]); - // set read value to output block - if (!st.ok()) { - LOG(WARNING) << "failed to fetch value by rowids"; - return st; - } - } - old_value_block.set_columns(std::move(mutable_old_columns)); - } - } - // build default value columns - auto default_value_block = old_value_block.clone_empty(); - auto mutable_default_value_columns = default_value_block.mutate_columns(); - - const vectorized::Int8* delete_sign_column_data = nullptr; -
if (const vectorized::ColumnWithTypeAndName* delete_sign_column = - old_value_block.try_get_by_name(DELETE_SIGN); - delete_sign_column != nullptr) { - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column->column)); - delete_sign_column_data = delete_sign_col.get_data().data(); - } - - if (has_default_or_nullable || delete_sign_column_data != nullptr) { - for (auto i = 0; i < cids_missing.size(); ++i) { - const auto& column = _tablet_schema->column(cids_missing[i]); - if (column.has_default_value()) { - const auto& default_value = - _opts.rowset_ctx->partial_update_info->default_values[i]; - vectorized::ReadBuffer rb(const_cast(default_value.c_str()), - default_value.size()); - RETURN_IF_ERROR(old_value_block.get_by_position(i).type->from_string( - rb, mutable_default_value_columns[i].get())); - } - } - } - - // fill all missing value from mutable_old_columns, need to consider default value and null value - for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { - // `use_default_or_null_flag[idx] == true` doesn't mean that we should read values from the old row - // for the missing columns. For example, if a table has sequence column, the rows with DELETE_SIGN column - // marked will not be marked in delete bitmap(see https://github.com/apache/doris/pull/24011), so it will - // be found in Tablet::lookup_row_key() and `use_default_or_null_flag[idx]` will be false. But we should not - // read values from old rows for missing values in this occasion. So we should read the DELETE_SIGN column - // to check if a row REALLY exists in the table. - if (use_default_or_null_flag[idx] || - (delete_sign_column_data != nullptr && - delete_sign_column_data[read_index[idx + segment_start_pos]] != 0)) { - for (auto i = 0; i < cids_missing.size(); ++i) { - // if the column has default value, fill it with default value - // otherwise, if the column is nullable, fill it with null value - const auto& tablet_column = _tablet_schema->column(cids_missing[i]); - if (tablet_column.has_default_value()) { - mutable_full_columns[cids_missing[i]]->insert_from( - *mutable_default_value_columns[i].get(), 0); - } else if (tablet_column.is_nullable()) { - auto nullable_column = assert_cast( - mutable_full_columns[cids_missing[i]].get()); - nullable_column->insert_null_elements(1); - } else if (_tablet_schema->auto_increment_column() == tablet_column.name()) { - const auto& column = *DORIS_TRY( - _opts.rowset_ctx->tablet_schema->column(tablet_column.name())); - DCHECK(column.type() == FieldType::OLAP_FIELD_TYPE_BIGINT); - auto auto_inc_column = assert_cast( - mutable_full_columns[cids_missing[i]].get()); - auto_inc_column->insert( - (assert_cast( - block->get_by_name("__PARTIAL_UPDATE_AUTO_INC_COLUMN__") - .column.get())) - ->get_element(idx)); - } else { - // If the control flow reaches this branch, the column neither has default value - // nor is nullable. It means that the row's delete sign is marked, and the value - // columns are useless and won't be read. 
So we can just put arbitary values in the cells - mutable_full_columns[cids_missing[i]]->insert_default(); - } - } - continue; - } - auto pos_in_old_block = read_index[idx + segment_start_pos]; - for (auto i = 0; i < cids_missing.size(); ++i) { - mutable_full_columns[cids_missing[i]]->insert_from( - *old_value_block.get_columns_with_type_and_name()[i].column.get(), - pos_in_old_block); - } - } - return Status::OK(); -} - Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_pos, size_t num_rows) { if (_opts.rowset_ctx->partial_update_info && @@ -909,19 +774,9 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po converted_result.second->get_data(), num_rows)); } if (_has_key) { - // for now we don't need to query short key index for CLUSTER BY feature, - // but we still write the index for future usage. - bool need_primary_key_indexes = (_tablet_schema->keys_type() == UNIQUE_KEYS && - _opts.enable_unique_key_merge_on_write); - bool need_short_key_indexes = - !need_primary_key_indexes || - (need_primary_key_indexes && !_tablet_schema->cluster_key_idxes().empty()); - if (need_primary_key_indexes && !need_short_key_indexes) { // mow table without cluster keys - RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, - num_rows, false)); - } else if (!need_primary_key_indexes && need_short_key_indexes) { // other tables - RETURN_IF_ERROR(_generate_short_key_index(key_columns, num_rows, short_key_pos)); - } else if (need_primary_key_indexes && need_short_key_indexes) { // mow with cluster keys + if (_is_mow_with_cluster_key()) { + // for now we don't need to query short key index for CLUSTER BY feature, + // but we still write the index for future usage. // 1. generate primary key index, the key_columns is primary_key_columns RETURN_IF_ERROR(_generate_primary_key_index(_primary_key_coders, key_columns, seq_column, num_rows, true)); @@ -941,6 +796,11 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po } } RETURN_IF_ERROR(_generate_short_key_index(key_columns, num_rows, short_key_pos)); + } else if (_is_mow()) { + RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, + num_rows, false)); + } else { + RETURN_IF_ERROR(_generate_short_key_index(key_columns, num_rows, short_key_pos)); } } @@ -958,11 +818,11 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po int64_t SegmentWriter::max_row_to_add(size_t row_avg_size_in_bytes) { auto segment_size = estimate_segment_size(); if (PREDICT_FALSE(segment_size >= MAX_SEGMENT_SIZE || - _num_rows_written >= _max_row_per_segment)) { + _num_rows_written >= _opts.max_rows_per_segment)) { return 0; } int64_t size_rows = ((int64_t)MAX_SEGMENT_SIZE - (int64_t)segment_size) / row_avg_size_in_bytes; - int64_t count_rows = (int64_t)_max_row_per_segment - _num_rows_written; + int64_t count_rows = (int64_t)_opts.max_rows_per_segment - _num_rows_written; return std::min(size_rows, count_rows); } @@ -970,8 +830,9 @@ int64_t SegmentWriter::max_row_to_add(size_t row_avg_size_in_bytes) { std::string SegmentWriter::_full_encode_keys( const std::vector& key_columns, size_t pos, bool null_first) { - assert(_key_index_size.size() == _num_key_columns); - assert(key_columns.size() == _num_key_columns && _key_coders.size() == _num_key_columns); + assert(_key_index_size.size() == _num_sort_key_columns); + assert(key_columns.size() == _num_sort_key_columns && + _key_coders.size() == _num_sort_key_columns); return 
_full_encode_keys(_key_coders, key_columns, pos, null_first); } @@ -1050,7 +911,7 @@ Status SegmentWriter::append_row(const RowType& row) { RETURN_IF_ERROR(_column_writers[cid]->append(cell)); } std::string full_encoded_key; - encode_key(&full_encoded_key, row, _num_key_columns); + encode_key(&full_encoded_key, row, _num_sort_key_columns); if (_tablet_schema->has_sequence_col()) { full_encoded_key.push_back(KEY_NORMAL_MARKER); auto cid = _tablet_schema->sequence_col_idx(); @@ -1058,7 +919,10 @@ Status SegmentWriter::append_row(const RowType& row) { row.schema()->column(cid)->full_encode_ascending(cell.cell_ptr(), &full_encoded_key); } - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow_with_cluster_key()) { + return Status::InternalError( + "SegmentWriter::append_row does not support mow tables with cluster key"); + } else if (_is_mow()) { RETURN_IF_ERROR(_primary_key_index_builder->add_item(full_encoded_key)); } else { // At the beginning of one block, so add a short key index entry @@ -1085,7 +949,9 @@ uint64_t SegmentWriter::estimate_segment_size() { for (auto& column_writer : _column_writers) { size += column_writer->estimate_buffer_size(); } - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow_with_cluster_key()) { + size += _primary_key_index_builder->size() + _short_key_index_builder->size(); + } else if (_is_mow()) { size += _primary_key_index_builder->size(); } else { size += _short_key_index_builder->size(); @@ -1096,13 +962,6 @@ uint64_t SegmentWriter::estimate_segment_size() { return size; } -size_t SegmentWriter::try_get_inverted_index_file_size() { - if (_inverted_index_file_writer != nullptr) { - return _inverted_index_file_writer->get_index_file_size(); - } - return 0; -} - Status SegmentWriter::finalize_columns_data() { if (_has_key) { _row_count = _num_rows_written; @@ -1136,19 +995,17 @@ Status SegmentWriter::finalize_columns_index(uint64_t* index_size) { *index_size = _file_writer->bytes_appended() - index_start; if (_has_key) { - bool write_short_key_index = _tablet_schema->keys_type() != UNIQUE_KEYS || - (_tablet_schema->keys_type() == UNIQUE_KEYS && - !_opts.enable_unique_key_merge_on_write) || - (_tablet_schema->keys_type() == UNIQUE_KEYS && - _opts.enable_unique_key_merge_on_write && - !_tablet_schema->cluster_key_idxes().empty()); - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow_with_cluster_key()) { + RETURN_IF_ERROR(_write_short_key_index()); + *index_size = _file_writer->bytes_appended() - index_start; + RETURN_IF_ERROR(_write_primary_key_index()); + *index_size += _primary_key_index_builder->disk_size(); + } else if (_is_mow()) { RETURN_IF_ERROR(_write_primary_key_index()); // IndexedColumnWriter write data pages mixed with segment data, we should use // the stat from primary key index builder. 
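To make this accounting concrete: the short key index is flushed contiguously after index_start, so its size is just _file_writer->bytes_appended() minus index_start, whereas the primary key index builder interleaves its data pages with segment data, so that delta would overcount and the builder's own disk_size() is added instead. With illustrative numbers: if index_start sits at 10.0 MB, the other indexes plus the short key index end at 10.2 MB, and the primary key index reports a disk_size() of 1.0 MB, *index_size comes out to 0.2 MB + 1.0 MB rather than a raw byte delta inflated by the interleaved column data.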
*index_size += _primary_key_index_builder->disk_size(); - } - if (write_short_key_index) { + } else { RETURN_IF_ERROR(_write_short_key_index()); *index_size = _file_writer->bytes_appended() - index_start; } @@ -1169,8 +1026,8 @@ Status SegmentWriter::finalize_footer(uint64_t* segment_file_size) { } if (_inverted_index_file_writer != nullptr) { RETURN_IF_ERROR(_inverted_index_file_writer->close()); + _inverted_index_file_info = _inverted_index_file_writer->get_index_file_info(); } - _inverted_index_file_size = try_get_inverted_index_file_size(); return Status::OK(); } @@ -1307,14 +1164,12 @@ Status SegmentWriter::_write_raw_data(const std::vector& slices) { } Slice SegmentWriter::min_encoded_key() { - return (_primary_key_index_builder == nullptr || !_tablet_schema->cluster_key_idxes().empty()) - ? Slice(_min_key.data(), _min_key.size()) - : _primary_key_index_builder->min_key(); + return (_primary_key_index_builder == nullptr) ? Slice(_min_key.data(), _min_key.size()) + : _primary_key_index_builder->min_key(); } Slice SegmentWriter::max_encoded_key() { - return (_primary_key_index_builder == nullptr || !_tablet_schema->cluster_key_idxes().empty()) - ? Slice(_max_key.data(), _max_key.size()) - : _primary_key_index_builder->max_key(); + return (_primary_key_index_builder == nullptr) ? Slice(_max_key.data(), _max_key.size()) + : _primary_key_index_builder->max_key(); } void SegmentWriter::set_min_max_key(const Slice& key) { @@ -1403,5 +1258,19 @@ Status SegmentWriter::_generate_short_key_index( return Status::OK(); } +int64_t SegmentWriter::get_inverted_index_total_size() { + if (_inverted_index_file_writer != nullptr) { + return _inverted_index_file_writer->get_index_file_total_size(); + } + return 0; +} + +inline bool SegmentWriter::_is_mow() { + return _tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write; +} + +inline bool SegmentWriter::_is_mow_with_cluster_key() { + return _is_mow() && !_tablet_schema->cluster_key_idxes().empty(); +} } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 9c667ee92fc3b17..c4b571cfc19d9d7 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -23,18 +23,14 @@ #include #include -#include #include #include // unique_ptr #include -#include #include #include "common/status.h" // Status #include "gen_cpp/segment_v2.pb.h" -#include "gutil/macros.h" #include "gutil/strings/substitute.h" -#include "io/fs/file_system.h" #include "olap/olap_define.h" #include "olap/rowset/segment_v2/column_writer.h" #include "olap/tablet.h" @@ -71,11 +67,13 @@ extern const uint32_t k_segment_magic_length; struct SegmentWriterOptions { uint32_t num_rows_per_block = 1024; + uint32_t max_rows_per_segment = UINT32_MAX; bool enable_unique_key_merge_on_write = false; CompressionTypePB compression_type = UNKNOWN_COMPRESSION; RowsetWriterContext* rowset_ctx = nullptr; DataWriteType write_type = DataWriteType::TYPE_DEFAULT; + std::shared_ptr mow_ctx; }; using TabletSharedPtr = std::shared_ptr; @@ -84,8 +82,7 @@ class SegmentWriter { public: explicit SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, - uint32_t max_row_per_segment, const SegmentWriterOptions& opts, - std::shared_ptr mow_context, + const SegmentWriterOptions& opts, io::FileWriterPtr inverted_file_writer = nullptr); ~SegmentWriter(); @@ -105,9 
+102,10 @@ class SegmentWriter { int64_t max_row_to_add(size_t row_avg_size_in_bytes); uint64_t estimate_segment_size(); - size_t try_get_inverted_index_file_size(); - size_t get_inverted_index_file_size() const { return _inverted_index_file_size; } + InvertedIndexFileInfo get_inverted_index_file_info() const { return _inverted_index_file_info; } + int64_t get_inverted_index_total_size(); + uint32_t num_rows_written() const { return _num_rows_written; } // for partial update @@ -120,7 +118,7 @@ class SegmentWriter { Status finalize(uint64_t* segment_file_size, uint64_t* index_size); - uint32_t get_segment_id() { return _segment_id; } + uint32_t get_segment_id() const { return _segment_id; } Status finalize_columns_data(); Status finalize_columns_index(uint64_t* index_size); @@ -138,10 +136,6 @@ class SegmentWriter { TabletSchemaSPtr flush_schema() const { return _flush_schema; }; void set_mow_context(std::shared_ptr mow_context); - Status fill_missing_columns(vectorized::MutableColumns& mutable_full_columns, - const std::vector& use_default_or_null_flag, - bool has_default_or_nullable, const size_t& segment_start_pos, - const vectorized::Block* block); private: DISALLOW_COPY_AND_ASSIGN(SegmentWriter); @@ -186,22 +180,25 @@ class SegmentWriter { vectorized::IOlapColumnDataAccessor* seq_column, size_t num_rows, bool need_sort); Status _generate_short_key_index(std::vector& key_columns, size_t num_rows, const std::vector& short_key_pos); + bool _is_mow(); + bool _is_mow_with_cluster_key(); private: uint32_t _segment_id; TabletSchemaSPtr _tablet_schema; BaseTabletSPtr _tablet; DataDir* _data_dir = nullptr; - uint32_t _max_row_per_segment; SegmentWriterOptions _opts; // Not owned. owned by RowsetWriter or SegmentFlusher io::FileWriter* _file_writer = nullptr; std::unique_ptr _inverted_index_file_writer; SegmentFooterPB _footer; - size_t _num_key_columns; + // for mow tables with cluster key, the sort key is the cluster keys not unique keys + // for other tables, the sort key is the keys + size_t _num_sort_key_columns; size_t _num_short_key_columns; - size_t _inverted_index_file_size; + InvertedIndexFileInfo _inverted_index_file_info; std::unique_ptr _short_key_index_builder; std::unique_ptr _primary_key_index_builder; std::vector> _column_writers; @@ -240,7 +237,6 @@ class SegmentWriter { std::shared_ptr _mow_context; // group every rowset-segment row id to speed up reader - PartialUpdateReadPlan _rssid_to_rid; std::map _rsid_to_rowset; // contains auto generated columns, should be nullptr if no variants's subcolumns TabletSchemaSPtr _flush_schema = nullptr; diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index ba1bfcf353539f3..891fd8c6a10ce64 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -37,6 +36,7 @@ #include "inverted_index_fs_directory.h" #include "io/fs/file_writer.h" #include "io/fs/local_file_system.h" +#include "olap/base_tablet.h" #include "olap/data_dir.h" #include "olap/key_coder.h" #include "olap/olap_common.h" @@ -80,11 +80,14 @@ using namespace ErrorCode; static const char* k_segment_magic = "D0R1"; static const uint32_t k_segment_magic_length = 4; +inline std::string vertical_segment_writer_mem_tracker_name(uint32_t segment_id) { + return "VerticalSegmentWriter:Segment-" + std::to_string(segment_id); +} + 
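With max_rows_per_segment and mow_ctx folded into the options struct, call sites now configure everything through SegmentWriterOptions before constructing the writer. A minimal sketch of the updated pattern, assuming a RowsetWriterContext-like ctx and a mow_context such as the ones this diff passes in VerticalBetaRowsetWriter::_create_segment_writer:

// Sketch of a call site; `ctx`, `mow_context`, `file_writer`, and `seg_id` are assumed.
segment_v2::SegmentWriterOptions writer_options;
writer_options.rowset_ctx = &ctx;
writer_options.enable_unique_key_merge_on_write = ctx.enable_unique_key_merge_on_write;
writer_options.max_rows_per_segment = ctx.max_rows_per_segment; // previously a constructor argument
writer_options.mow_ctx = mow_context;                           // previously a constructor argument
auto writer = std::make_unique<segment_v2::SegmentWriter>(
        file_writer.get(), seg_id, ctx.tablet_schema, ctx.tablet, ctx.data_dir, writer_options);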
VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, - DataDir* data_dir, uint32_t max_row_per_segment, + DataDir* data_dir, const VerticalSegmentWriterOptions& opts, - std::shared_ptr mow_context, io::FileWriterPtr inverted_file_writer) : _segment_id(segment_id), _tablet_schema(std::move(tablet_schema)), @@ -92,23 +95,46 @@ VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32 _data_dir(data_dir), _opts(opts), _file_writer(file_writer), - _mem_tracker(std::make_unique("VerticalSegmentWriter:Segment-" + - std::to_string(segment_id))), - _mow_context(std::move(mow_context)) { + _mem_tracker(std::make_unique( + vertical_segment_writer_mem_tracker_name(segment_id))), + _mow_context(std::move(opts.mow_ctx)) { CHECK_NOTNULL(file_writer); - _num_key_columns = _tablet_schema->num_key_columns(); + _num_sort_key_columns = _tablet_schema->num_key_columns(); _num_short_key_columns = _tablet_schema->num_short_key_columns(); - DCHECK(_num_key_columns >= _num_short_key_columns); - for (size_t cid = 0; cid < _num_key_columns; ++cid) { + if (!_is_mow_with_cluster_key()) { + DCHECK(_num_sort_key_columns >= _num_short_key_columns) + << ", table_id=" << _tablet_schema->table_id() + << ", num_key_columns=" << _num_sort_key_columns + << ", num_short_key_columns=" << _num_short_key_columns + << ", cluster_key_columns=" << _tablet_schema->cluster_key_idxes().size(); + } + for (size_t cid = 0; cid < _num_sort_key_columns; ++cid) { const auto& column = _tablet_schema->column(cid); _key_coders.push_back(get_key_coder(column.type())); _key_index_size.push_back(column.index_length()); } // encode the sequence id into the primary key index - if (_tablet_schema->has_sequence_col() && _tablet_schema->keys_type() == UNIQUE_KEYS && - _opts.enable_unique_key_merge_on_write) { - const auto& column = _tablet_schema->column(_tablet_schema->sequence_col_idx()); - _seq_coder = get_key_coder(column.type()); + if (_is_mow()) { + if (_tablet_schema->has_sequence_col()) { + const auto& column = _tablet_schema->column(_tablet_schema->sequence_col_idx()); + _seq_coder = get_key_coder(column.type()); + } + // encode the rowid into the primary key index + if (_is_mow_with_cluster_key()) { + const auto* type_info = get_scalar_type_info(); + _rowid_coder = get_key_coder(type_info->type()); + // primary keys + _primary_key_coders.swap(_key_coders); + // cluster keys + _key_coders.clear(); + _key_index_size.clear(); + _num_sort_key_columns = _tablet_schema->cluster_key_idxes().size(); + for (auto cid : _tablet_schema->cluster_key_idxes()) { + const auto& column = _tablet_schema->column(cid); + _key_coders.push_back(get_key_coder(column.type())); + _key_index_size.push_back(column.index_length()); + } + } } if (_tablet_schema->has_inverted_index()) { _inverted_index_file_writer = std::make_unique( @@ -118,6 +144,8 @@ VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32 _opts.rowset_ctx->rowset_id.to_string(), segment_id, _tablet_schema->get_inverted_index_storage_format(), std::move(inverted_file_writer)); + _inverted_index_file_writer->set_file_writer_opts( + _opts.rowset_ctx->get_file_writer_options()); } } @@ -244,14 +272,14 @@ Status VerticalSegmentWriter::init() { _olap_data_convertor->reserve(_tablet_schema->num_columns()); _column_writers.reserve(_tablet_schema->columns().size()); // we don't need the short key index for unique key merge on write table. 
- if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow()) { size_t seq_col_length = 0; if (_tablet_schema->has_sequence_col()) { seq_col_length = _tablet_schema->column(_tablet_schema->sequence_col_idx()).length() + 1; } size_t rowid_length = 0; - if (!_tablet_schema->cluster_key_idxes().empty()) { + if (_is_mow_with_cluster_key()) { rowid_length = PrimaryKeyIndexReader::ROW_ID_LENGTH; _short_key_index_builder.reset( new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); @@ -314,7 +342,7 @@ void VerticalSegmentWriter::_serialize_block_to_row_column(vectorized::Block& bl // 3. set columns to data convertor and then write all columns Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& data, vectorized::Block& full_block) { - DCHECK(_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write); + DCHECK(_is_mow()); DCHECK(_opts.rowset_ctx->partial_update_info != nullptr); // create full block and fill with input columns @@ -340,7 +368,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da if (!status.ok()) { return status; } - if (cid < _num_key_columns) { + if (cid < _num_sort_key_columns) { key_columns.push_back(column); } else if (_tablet_schema->has_sequence_col() && cid == _tablet_schema->sequence_col_idx()) { @@ -354,16 +382,8 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da bool has_default_or_nullable = false; std::vector use_default_or_null_flag; use_default_or_null_flag.reserve(data.num_rows); - const vectorized::Int8* delete_sign_column_data = nullptr; - if (const vectorized::ColumnWithTypeAndName* delete_sign_column = - full_block.try_get_by_name(DELETE_SIGN); - delete_sign_column != nullptr) { - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column->column)); - if (delete_sign_col.size() >= data.row_pos + data.num_rows) { - delete_sign_column_data = delete_sign_col.get_data().data(); - } - } + const auto* delete_sign_column_data = + BaseTablet::get_delete_sign_column_data(full_block, data.row_pos + data.num_rows); std::vector specified_rowsets; { @@ -389,6 +409,8 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da } std::vector> segment_caches(specified_rowsets.size()); + PartialUpdateReadPlan read_plan; + // locate rows in base data int64_t num_rows_updated = 0; int64_t num_rows_new_added = 0; @@ -471,7 +493,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da // partial update should not contain invisible columns use_default_or_null_flag.emplace_back(false); _rsid_to_rowset.emplace(rowset->rowset_id(), rowset); - _tablet->prepare_to_read(loc, segment_pos, &_rssid_to_rid); + read_plan.prepare_to_read(loc, segment_pos); } if (st.is()) { @@ -495,9 +517,9 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da } // read and fill block - auto mutable_full_columns = full_block.mutate_columns(); - RETURN_IF_ERROR(_fill_missing_columns(mutable_full_columns, use_default_or_null_flag, - has_default_or_nullable, segment_start_pos, data.block)); + RETURN_IF_ERROR(read_plan.fill_missing_columns( + _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, + use_default_or_null_flag, has_default_or_nullable, segment_start_pos, data.block)); // row column should be filled here // convert block to row store format @@ -555,128 +577,6 @@ Status 
VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da return Status::OK(); } -Status VerticalSegmentWriter::_fill_missing_columns( - vectorized::MutableColumns& mutable_full_columns, - const std::vector& use_default_or_null_flag, bool has_default_or_nullable, - const size_t& segment_start_pos, const vectorized::Block* block) { - // create old value columns - const auto& missing_cids = _opts.rowset_ctx->partial_update_info->missing_cids; - auto old_value_block = _tablet_schema->create_block_by_cids(missing_cids); - CHECK_EQ(missing_cids.size(), old_value_block.columns()); - auto mutable_old_columns = old_value_block.mutate_columns(); - bool has_row_column = _tablet_schema->has_row_store_for_all_columns(); - // record real pos, key is input line num, value is old_block line num - std::map read_index; - size_t read_idx = 0; - for (auto rs_it : _rssid_to_rid) { - for (auto seg_it : rs_it.second) { - auto rowset = _rsid_to_rowset[rs_it.first]; - CHECK(rowset); - std::vector rids; - for (auto id_and_pos : seg_it.second) { - rids.emplace_back(id_and_pos.rid); - read_index[id_and_pos.pos] = read_idx++; - } - if (has_row_column) { - auto st = _tablet->fetch_value_through_row_column( - rowset, *_tablet_schema, seg_it.first, rids, missing_cids, old_value_block); - if (!st.ok()) { - LOG(WARNING) << "failed to fetch value through row column"; - return st; - } - continue; - } - for (size_t cid = 0; cid < mutable_old_columns.size(); ++cid) { - TabletColumn tablet_column = _tablet_schema->column(missing_cids[cid]); - auto st = _tablet->fetch_value_by_rowids(rowset, seg_it.first, rids, tablet_column, - mutable_old_columns[cid]); - // set read value to output block - if (!st.ok()) { - LOG(WARNING) << "failed to fetch value by rowids"; - return st; - } - } - } - } - // build default value columns - auto default_value_block = old_value_block.clone_empty(); - auto mutable_default_value_columns = default_value_block.mutate_columns(); - - const vectorized::Int8* delete_sign_column_data = nullptr; - if (const vectorized::ColumnWithTypeAndName* delete_sign_column = - old_value_block.try_get_by_name(DELETE_SIGN); - delete_sign_column != nullptr) { - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column->column)); - delete_sign_column_data = delete_sign_col.get_data().data(); - } - - if (has_default_or_nullable || delete_sign_column_data != nullptr) { - for (auto i = 0; i < missing_cids.size(); ++i) { - const auto& column = _tablet_schema->column(missing_cids[i]); - if (column.has_default_value()) { - const auto& default_value = - _opts.rowset_ctx->partial_update_info->default_values[i]; - vectorized::ReadBuffer rb(const_cast(default_value.c_str()), - default_value.size()); - RETURN_IF_ERROR(old_value_block.get_by_position(i).type->from_string( - rb, mutable_default_value_columns[i].get())); - } - } - } - - // fill all missing value from mutable_old_columns, need to consider default value and null value - for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { - // `use_default_or_null_flag[idx] == true` doesn't mean that we should read values from the old row - // for the missing columns. For example, if a table has sequence column, the rows with DELETE_SIGN column - // marked will not be marked in delete bitmap(see https://github.com/apache/doris/pull/24011), so it will - // be found in Tablet::lookup_row_key() and `use_default_or_null_flag[idx]` will be false. But we should not - // read values from old rows for missing values in this occasion. 
So we should read the DELETE_SIGN column - // to check if a row REALLY exists in the table. - if (use_default_or_null_flag[idx] || - (delete_sign_column_data != nullptr && - delete_sign_column_data[read_index[idx + segment_start_pos]] != 0)) { - for (auto i = 0; i < missing_cids.size(); ++i) { - // if the column has default value, fill it with default value - // otherwise, if the column is nullable, fill it with null value - const auto& tablet_column = _tablet_schema->column(missing_cids[i]); - if (tablet_column.has_default_value()) { - mutable_full_columns[missing_cids[i]]->insert_from( - *mutable_default_value_columns[i].get(), 0); - } else if (tablet_column.is_nullable()) { - auto nullable_column = assert_cast( - mutable_full_columns[missing_cids[i]].get()); - nullable_column->insert_null_elements(1); - } else if (_tablet_schema->auto_increment_column() == tablet_column.name()) { - const auto& column = *DORIS_TRY( - _opts.rowset_ctx->tablet_schema->column(tablet_column.name())); - DCHECK(column.type() == FieldType::OLAP_FIELD_TYPE_BIGINT); - auto auto_inc_column = assert_cast( - mutable_full_columns[missing_cids[i]].get()); - auto_inc_column->insert( - (assert_cast( - block->get_by_name("__PARTIAL_UPDATE_AUTO_INC_COLUMN__") - .column.get())) - ->get_element(idx)); - } else { - // If the control flow reaches this branch, the column neither has default value - // nor is nullable. It means that the row's delete sign is marked, and the value - // columns are useless and won't be read. So we can just put arbitary values in the cells - mutable_full_columns[missing_cids[i]]->insert_default(); - } - } - continue; - } - auto pos_in_old_block = read_index[idx + segment_start_pos]; - for (auto i = 0; i < missing_cids.size(); ++i) { - mutable_full_columns[missing_cids[i]]->insert_from( - *old_value_block.get_columns_with_type_and_name()[i].column.get(), - pos_in_old_block); - } - } - return Status::OK(); -} - Status VerticalSegmentWriter::batch_block(const vectorized::Block* block, size_t row_pos, size_t num_rows) { if (_opts.rowset_ctx->partial_update_info && @@ -842,6 +742,7 @@ Status VerticalSegmentWriter::write_batch() { std::vector key_columns; vectorized::IOlapColumnDataAccessor* seq_column = nullptr; + std::map cid_to_column; for (uint32_t cid = 0; cid < _tablet_schema->num_columns(); ++cid) { RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema)); for (auto& data : _batched_blocks) { @@ -853,12 +754,18 @@ Status VerticalSegmentWriter::write_batch() { if (!status.ok()) { return status; } - if (cid < _num_key_columns) { + if (cid < _tablet_schema->num_key_columns()) { key_columns.push_back(column); - } else if (_tablet_schema->has_sequence_col() && - cid == _tablet_schema->sequence_col_idx()) { + } + if (_tablet_schema->has_sequence_col() && cid == _tablet_schema->sequence_col_idx()) { seq_column = column; } + if (_is_mow_with_cluster_key() && + std::find(_tablet_schema->cluster_key_idxes().begin(), + _tablet_schema->cluster_key_idxes().end(), + cid) != _tablet_schema->cluster_key_idxes().end()) { + cid_to_column[cid] = column; + } RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(), data.num_rows)); _olap_data_convertor->clear_source_content(); @@ -874,44 +781,7 @@ Status VerticalSegmentWriter::write_batch() { for (auto& data : _batched_blocks) { _olap_data_convertor->set_source_content(data.block, data.row_pos, data.num_rows); - // find all row pos for short key indexes - std::vector short_key_pos; - // We build a short 
key index every `_opts.num_rows_per_block` rows. Specifically, we - // build a short key index using 1st rows for first block and `_short_key_row_pos - _row_count` - // for next blocks. - if (_short_key_row_pos == 0 && _num_rows_written == 0) { - short_key_pos.push_back(0); - } - while (_short_key_row_pos + _opts.num_rows_per_block < _num_rows_written + data.num_rows) { - _short_key_row_pos += _opts.num_rows_per_block; - short_key_pos.push_back(_short_key_row_pos - _num_rows_written); - } - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { - // create primary indexes - std::string last_key; - for (size_t pos = 0; pos < data.num_rows; pos++) { - std::string key = _full_encode_keys(key_columns, pos); - _maybe_invalid_row_cache(key); - if (_tablet_schema->has_sequence_col()) { - _encode_seq_column(seq_column, pos, &key); - } - DCHECK(key.compare(last_key) > 0) - << "found duplicate key or key is not sorted! current key: " << key - << ", last key" << last_key; - RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); - last_key = std::move(key); - } - } else { - // create short key indexes' - // for min_max key - _set_min_key(_full_encode_keys(key_columns, 0)); - _set_max_key(_full_encode_keys(key_columns, data.num_rows - 1)); - - key_columns.resize(_num_short_key_columns); - for (const auto pos : short_key_pos) { - RETURN_IF_ERROR(_short_key_index_builder->add_item(_encode_keys(key_columns, pos))); - } - } + RETURN_IF_ERROR(_generate_key_index(data, key_columns, seq_column, cid_to_column)); _olap_data_convertor->clear_source_content(); _num_rows_written += data.num_rows; } @@ -933,10 +803,117 @@ Status VerticalSegmentWriter::write_batch() { return Status::OK(); } +Status VerticalSegmentWriter::_generate_key_index( + RowsInBlock& data, std::vector& key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, + std::map& cid_to_column) { + // find all row pos for short key indexes + std::vector short_key_pos; + // We build a short key index every `_opts.num_rows_per_block` rows. Specifically, we + // build a short key index using the 1st row of the first block and `_short_key_row_pos - _row_count` + // for the following blocks. + if (_short_key_row_pos == 0 && _num_rows_written == 0) { + short_key_pos.push_back(0); + } + while (_short_key_row_pos + _opts.num_rows_per_block < _num_rows_written + data.num_rows) { + _short_key_row_pos += _opts.num_rows_per_block; + short_key_pos.push_back(_short_key_row_pos - _num_rows_written); + } + if (_is_mow_with_cluster_key()) { + // 1. generate primary key index + RETURN_IF_ERROR(_generate_primary_key_index(_primary_key_coders, key_columns, seq_column, + data.num_rows, true)); + // 2.
generate short key index (use cluster key) + std::vector short_key_columns; + for (const auto& cid : _tablet_schema->cluster_key_idxes()) { + short_key_columns.push_back(cid_to_column[cid]); + } + RETURN_IF_ERROR(_generate_short_key_index(short_key_columns, data.num_rows, short_key_pos)); + } else if (_is_mow()) { + RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, + data.num_rows, false)); + } else { // other tables + RETURN_IF_ERROR(_generate_short_key_index(key_columns, data.num_rows, short_key_pos)); + } + return Status::OK(); +} + +Status VerticalSegmentWriter::_generate_primary_key_index( + const std::vector& primary_key_coders, + const std::vector& primary_key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, size_t num_rows, bool need_sort) { + if (!need_sort) { // mow table without cluster key + std::string last_key; + for (size_t pos = 0; pos < num_rows; pos++) { + // use _key_coders + std::string key = _full_encode_keys(primary_key_columns, pos); + _maybe_invalid_row_cache(key); + if (_tablet_schema->has_sequence_col()) { + _encode_seq_column(seq_column, pos, &key); + } + DCHECK(key.compare(last_key) > 0) + << "found duplicate key or key is not sorted! current key: " << key + << ", last key" << last_key; + RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); + last_key = std::move(key); + } + } else { // mow table with cluster key + // 1. generate primary keys in memory + std::vector primary_keys; + for (uint32_t pos = 0; pos < num_rows; pos++) { + std::string key = _full_encode_keys(primary_key_coders, primary_key_columns, pos); + _maybe_invalid_row_cache(key); + if (_tablet_schema->has_sequence_col()) { + _encode_seq_column(seq_column, pos, &key); + } + _encode_rowid(pos, &key); + primary_keys.emplace_back(std::move(key)); + } + // 2. sort primary keys + std::sort(primary_keys.begin(), primary_keys.end()); + // 3. write primary keys index + std::string last_key; + for (const auto& key : primary_keys) { + DCHECK(key.compare(last_key) > 0) + << "found duplicate key or key is not sorted! 
current key: " << key + << ", last key" << last_key; + RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); + } + } + return Status::OK(); +} + +Status VerticalSegmentWriter::_generate_short_key_index( + std::vector& key_columns, size_t num_rows, + const std::vector& short_key_pos) { + // use _key_coders + _set_min_key(_full_encode_keys(key_columns, 0)); + _set_max_key(_full_encode_keys(key_columns, num_rows - 1)); + + key_columns.resize(_num_short_key_columns); + for (const auto pos : short_key_pos) { + RETURN_IF_ERROR(_short_key_index_builder->add_item(_encode_keys(key_columns, pos))); + } + return Status::OK(); +} + +void VerticalSegmentWriter::_encode_rowid(const uint32_t rowid, string* encoded_keys) { + encoded_keys->push_back(KEY_NORMAL_MARKER); + _rowid_coder->full_encode_ascending(&rowid, encoded_keys); +} + +std::string VerticalSegmentWriter::_full_encode_keys( + const std::vector& key_columns, size_t pos) { + assert(_key_index_size.size() == _num_sort_key_columns); + assert(key_columns.size() == _num_sort_key_columns && + _key_coders.size() == _num_sort_key_columns); + return _full_encode_keys(_key_coders, key_columns, pos); +} + std::string VerticalSegmentWriter::_full_encode_keys( + const std::vector& key_coders, const std::vector& key_columns, size_t pos) { - assert(_key_index_size.size() == _num_key_columns); - assert(key_columns.size() == _num_key_columns && _key_coders.size() == _num_key_columns); + assert(key_columns.size() == key_coders.size()); std::string encoded_keys; size_t cid = 0; @@ -948,7 +925,8 @@ std::string VerticalSegmentWriter::_full_encode_keys( continue; } encoded_keys.push_back(KEY_NORMAL_MARKER); - _key_coders[cid]->full_encode_ascending(field, &encoded_keys); + DCHECK(key_coders[cid] != nullptr); + key_coders[cid]->full_encode_ascending(field, &encoded_keys); ++cid; } return encoded_keys; @@ -995,7 +973,9 @@ std::string VerticalSegmentWriter::_encode_keys( uint64_t VerticalSegmentWriter::_estimated_remaining_size() { // footer_size(4) + checksum(4) + segment_magic(4) uint64_t size = 12; - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow_with_cluster_key()) { + size += _primary_key_index_builder->size() + _short_key_index_builder->size(); + } else if (_is_mow()) { size += _primary_key_index_builder->size(); } else { size += _short_key_index_builder->size(); @@ -1006,13 +986,6 @@ uint64_t VerticalSegmentWriter::_estimated_remaining_size() { return size; } -size_t VerticalSegmentWriter::_calculate_inverted_index_file_size() { - if (_inverted_index_file_writer != nullptr) { - return _inverted_index_file_writer->get_index_file_size(); - } - return 0; -} - Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { uint64_t index_start = _file_writer->bytes_appended(); RETURN_IF_ERROR(_write_ordinal_index()); @@ -1022,7 +995,12 @@ Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { RETURN_IF_ERROR(_write_bloom_filter_index()); *index_size = _file_writer->bytes_appended() - index_start; - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + if (_is_mow_with_cluster_key()) { + RETURN_IF_ERROR(_write_short_key_index()); + *index_size = _file_writer->bytes_appended() - index_start; + RETURN_IF_ERROR(_write_primary_key_index()); + *index_size += _primary_key_index_builder->disk_size(); + } else if (_is_mow()) { RETURN_IF_ERROR(_write_primary_key_index()); // IndexedColumnWriter write data pages mixed with segment data, 
we should use // the stat from primary key index builder. @@ -1031,7 +1009,10 @@ Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { RETURN_IF_ERROR(_write_short_key_index()); *index_size = _file_writer->bytes_appended() - index_start; } - _inverted_index_file_size = _calculate_inverted_index_file_size(); + + if (_inverted_index_file_writer != nullptr) { + _inverted_index_file_info = _inverted_index_file_writer->get_index_file_info(); + } // reset all column writers and data_conveter clear(); @@ -1195,5 +1176,19 @@ void VerticalSegmentWriter::_set_max_key(const Slice& key) { _max_key.append(key.get_data(), key.get_size()); } +int64_t VerticalSegmentWriter::get_inverted_index_total_size() { + if (_inverted_index_file_writer != nullptr) { + return _inverted_index_file_writer->get_index_file_total_size(); + } + return 0; +} + +inline bool VerticalSegmentWriter::_is_mow() { + return _tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write; +} + +inline bool VerticalSegmentWriter::_is_mow_with_cluster_key() { + return _is_mow() && !_tablet_schema->cluster_key_idxes().empty(); +} } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h index 8068b3e44be6c8e..d84e08d081f4721 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h @@ -68,6 +68,7 @@ struct VerticalSegmentWriterOptions { RowsetWriterContext* rowset_ctx = nullptr; DataWriteType write_type = DataWriteType::TYPE_DEFAULT; + std::shared_ptr mow_ctx; }; struct RowsInBlock { @@ -80,9 +81,7 @@ class VerticalSegmentWriter { public: explicit VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, - DataDir* data_dir, uint32_t max_row_per_segment, - const VerticalSegmentWriterOptions& opts, - std::shared_ptr mow_context, + DataDir* data_dir, const VerticalSegmentWriterOptions& opts, io::FileWriterPtr inverted_file_writer = nullptr); ~VerticalSegmentWriter(); @@ -100,7 +99,9 @@ class VerticalSegmentWriter { [[nodiscard]] std::string data_dir_path() const { return _data_dir == nullptr ? 
"" : _data_dir->path(); } - [[nodiscard]] size_t inverted_index_file_size() const { return _inverted_index_file_size; } + [[nodiscard]] InvertedIndexFileInfo get_inverted_index_file_info() const { + return _inverted_index_file_info; + } [[nodiscard]] uint32_t num_rows_written() const { return _num_rows_written; } // for partial update @@ -121,13 +122,14 @@ class VerticalSegmentWriter { TabletSchemaSPtr flush_schema() const { return _flush_schema; }; + int64_t get_inverted_index_total_size(); + void clear(); private: void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column); Status _create_column_writer(uint32_t cid, const TabletColumn& column, const TabletSchemaSPtr& schema); - size_t _calculate_inverted_index_file_size(); uint64_t _estimated_remaining_size(); Status _write_ordinal_index(); Status _write_zone_map(); @@ -144,19 +146,32 @@ class VerticalSegmentWriter { // used for unique-key with merge on write and segment min_max key std::string _full_encode_keys( const std::vector& key_columns, size_t pos); + std::string _full_encode_keys( + const std::vector& key_coders, + const std::vector& key_columns, size_t pos); // used for unique-key with merge on write void _encode_seq_column(const vectorized::IOlapColumnDataAccessor* seq_column, size_t pos, string* encoded_keys); + // used for unique-key with merge on write tables with cluster keys + void _encode_rowid(const uint32_t rowid, string* encoded_keys); void _set_min_max_key(const Slice& key); void _set_min_key(const Slice& key); void _set_max_key(const Slice& key); void _serialize_block_to_row_column(vectorized::Block& block); Status _append_block_with_partial_content(RowsInBlock& data, vectorized::Block& full_block); Status _append_block_with_variant_subcolumns(RowsInBlock& data); - Status _fill_missing_columns(vectorized::MutableColumns& mutable_full_columns, - const std::vector& use_default_or_null_flag, - bool has_default_or_nullable, const size_t& segment_start_pos, - const vectorized::Block* block); + Status _generate_key_index( + RowsInBlock& data, std::vector& key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, + std::map& cid_to_column); + Status _generate_primary_key_index( + const std::vector& primary_key_coders, + const std::vector& primary_key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, size_t num_rows, bool need_sort); + Status _generate_short_key_index(std::vector& key_columns, + size_t num_rows, const std::vector& short_key_pos); + bool _is_mow(); + bool _is_mow_with_cluster_key(); private: uint32_t _segment_id; @@ -170,9 +185,11 @@ class VerticalSegmentWriter { std::unique_ptr _inverted_index_file_writer; SegmentFooterPB _footer; - size_t _num_key_columns; + // for mow tables with cluster key, the sort key is the cluster keys not unique keys + // for other tables, the sort key is the keys + size_t _num_sort_key_columns; size_t _num_short_key_columns; - size_t _inverted_index_file_size; + InvertedIndexFileInfo _inverted_index_file_info; std::unique_ptr _short_key_index_builder; std::unique_ptr _primary_key_index_builder; std::vector> _column_writers; @@ -181,7 +198,10 @@ class VerticalSegmentWriter { std::unique_ptr _olap_data_convertor; // used for building short key index or primary key index during vectorized write. 
std::vector _key_coders; + // for mow table with cluster keys, this is primary keys + std::vector _primary_key_coders; const KeyCoder* _seq_coder = nullptr; + const KeyCoder* _rowid_coder = nullptr; std::vector _key_index_size; size_t _short_key_row_pos = 0; @@ -206,7 +226,6 @@ class VerticalSegmentWriter { std::shared_ptr _mow_context; // group every rowset-segment row id to speed up reader - PartialUpdateReadPlan _rssid_to_rid; std::map _rsid_to_rowset; std::vector _batched_blocks; diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index 863f0f597aa7c68..ced0fb880c41fba 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -165,14 +165,7 @@ Status VerticalBetaRowsetWriter::_create_segment_writer( int seg_id = this->_num_segment.fetch_add(1, std::memory_order_relaxed); io::FileWriterPtr file_writer; - io::FileWriterOptions opts { - .write_file_cache = this->_context.write_file_cache, - .is_cold_data = this->_context.is_hot_data, - .file_cache_expiration = this->_context.file_cache_ttl_sec > 0 && - this->_context.newest_write_timestamp > 0 - ? this->_context.newest_write_timestamp + - this->_context.file_cache_ttl_sec - : 0}; + io::FileWriterOptions opts = this->_context.get_file_writer_options(); auto path = context.segment_path(seg_id); auto& fs = context.fs_ref(); @@ -186,9 +179,10 @@ Status VerticalBetaRowsetWriter::_create_segment_writer( segment_v2::SegmentWriterOptions writer_options; writer_options.enable_unique_key_merge_on_write = context.enable_unique_key_merge_on_write; writer_options.rowset_ctx = &context; - *writer = std::make_unique( - file_writer.get(), seg_id, context.tablet_schema, context.tablet, context.data_dir, - context.max_rows_per_segment, writer_options, nullptr); + writer_options.max_rows_per_segment = context.max_rows_per_segment; + *writer = std::make_unique(file_writer.get(), seg_id, + context.tablet_schema, context.tablet, + context.data_dir, writer_options); RETURN_IF_ERROR(this->_seg_files.add(seg_id, std::move(file_writer))); auto s = (*writer)->init(column_ids, is_key); @@ -211,8 +205,10 @@ Status VerticalBetaRowsetWriter::final_flush() { LOG(WARNING) << "Fail to finalize segment footer, " << st; return st; } - this->_total_data_size += segment_size + segment_writer->get_inverted_index_file_size(); - this->_total_index_size += segment_writer->get_inverted_index_file_size(); + this->_total_data_size += segment_size + segment_writer->get_inverted_index_total_size(); + this->_total_index_size += segment_writer->get_inverted_index_total_size(); + this->_idx_files_info.add_file_info(segment_writer->get_segment_id(), + segment_writer->get_inverted_index_file_info()); segment_writer.reset(); } return Status::OK(); diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index 93058c05be332fd..f80c6457215b52b 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -36,10 +36,12 @@ #include "io/fs/file_writer.h" // IWYU pragma: keep #include "olap/calc_delete_bitmap_executor.h" #include "olap/olap_define.h" +#include "olap/partial_update_info.h" #include "olap/rowset/beta_rowset.h" #include "olap/rowset/beta_rowset_writer.h" #include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" +#include "olap/rowset/rowset_meta_manager.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" #include "olap/schema_change.h" @@ -123,7 
+125,7 @@ void RowsetBuilder::_garbage_collection() { Status BaseRowsetBuilder::init_mow_context(std::shared_ptr& mow_context) { std::lock_guard lck(tablet()->get_header_lock()); - int64_t cur_max_version = tablet()->max_version_unlocked(); + _max_version_in_flush_phase = tablet()->max_version_unlocked(); std::vector rowset_ptrs; // tablet is under alter process. The delete bitmap will be calculated after conversion. if (tablet()->tablet_state() == TABLET_NOTREADY) { @@ -135,12 +137,13 @@ Status BaseRowsetBuilder::init_mow_context(std::shared_ptr& mow_cont } _rowset_ids.clear(); } else { - RETURN_IF_ERROR(tablet()->get_all_rs_id_unlocked(cur_max_version, &_rowset_ids)); + RETURN_IF_ERROR( + tablet()->get_all_rs_id_unlocked(_max_version_in_flush_phase, &_rowset_ids)); rowset_ptrs = tablet()->get_rowset_by_ids(&_rowset_ids); } _delete_bitmap = std::make_shared(tablet()->tablet_id()); - mow_context = std::make_shared(cur_max_version, _req.txn_id, _rowset_ids, - rowset_ptrs, _delete_bitmap); + mow_context = std::make_shared(_max_version_in_flush_phase, _req.txn_id, + _rowset_ids, rowset_ptrs, _delete_bitmap); return Status::OK(); } @@ -331,10 +334,11 @@ Status RowsetBuilder::commit_txn() { // => update_schema: A(bigint), B(double), C(int), D(int) RETURN_IF_ERROR(tablet()->update_by_least_common_schema(rw_ctx.tablet_schema)); } + // Transfer ownership of `PendingRowsetGuard` to `TxnManager` - Status res = _engine.txn_manager()->commit_txn(_req.partition_id, *tablet(), _req.txn_id, - _req.load_id, _rowset, - std::move(_pending_rs_guard), false); + Status res = _engine.txn_manager()->commit_txn( + _req.partition_id, *tablet(), _req.txn_id, _req.load_id, _rowset, + std::move(_pending_rs_guard), false, _partial_update_info); if (!res && !res.is()) { LOG(WARNING) << "Failed to commit txn: " << _req.txn_id @@ -408,7 +412,8 @@ void BaseRowsetBuilder::_build_current_tablet_schema(int64_t index_id, table_schema_param->partial_update_input_columns(), table_schema_param->is_strict_mode(), table_schema_param->timestamp_ms(), table_schema_param->timezone(), - table_schema_param->auto_increment_coulumn()); + table_schema_param->auto_increment_coulumn(), + _max_version_in_flush_phase); } } // namespace doris diff --git a/be/src/olap/rowset_builder.h b/be/src/olap/rowset_builder.h index e54faee3435c791..7fd578037363a0c 100644 --- a/be/src/olap/rowset_builder.h +++ b/be/src/olap/rowset_builder.h @@ -106,6 +106,7 @@ class BaseRowsetBuilder { std::unique_ptr _calc_delete_bitmap_token; // current rowset_ids, used to do diff in publish_version RowsetIdUnorderedSet _rowset_ids; + int64_t _max_version_in_flush_phase {-1}; std::shared_ptr _partial_update_info; diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index 599d9c1d1423ca9..1771bfb7c6714c4 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -17,6 +17,10 @@ #include "olap/schema_change.h" +#include +#include +#include + #include #include #include @@ -285,51 +289,63 @@ Status BlockChanger::change_block(vectorized::Block* ref_block, vectorized::VExprContext::filter_block(ctx.get(), ref_block, ref_block->columns())); } - const int row_size = ref_block->rows(); - const int column_size = new_block->columns(); + const int row_num = ref_block->rows(); + const int new_schema_cols_num = new_block->columns(); - // swap ref_block[key] and new_block[value] + // will be used for swapping ref_block[entry.first] and new_block[entry.second] std::list> swap_idx_list; - for (int idx = 0; idx < column_size; idx++) { - if
(_schema_mapping[idx].expr != nullptr) { + for (int idx = 0; idx < new_schema_cols_num; idx++) { + auto expr = _schema_mapping[idx].expr; + if (expr != nullptr) { vectorized::VExprContextSPtr ctx; - RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(*_schema_mapping[idx].expr, ctx)); + RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(*expr, ctx)); RETURN_IF_ERROR(ctx->prepare(state.get(), row_desc)); RETURN_IF_ERROR(ctx->open(state.get())); - int result_column_id = -1; - RETURN_IF_ERROR(ctx->execute(ref_block, &result_column_id)); - if (ref_block->get_by_position(result_column_id).column == nullptr) { + int result_tmp_column_idx = -1; + RETURN_IF_ERROR(ctx->execute(ref_block, &result_tmp_column_idx)); + auto& result_tmp_column_def = ref_block->get_by_position(result_tmp_column_idx); + if (result_tmp_column_def.column == nullptr) { return Status::Error( - "{} result column is nullptr", - ref_block->get_by_position(result_column_id).name); + "result column={} is nullptr, input expr={}", result_tmp_column_def.name, + apache::thrift::ThriftDebugString(*expr)); } - ref_block->replace_by_position_if_const(result_column_id); + ref_block->replace_by_position_if_const(result_tmp_column_idx); - if (ref_block->get_by_position(result_column_id).column->size() != row_size) { + if (result_tmp_column_def.column->size() != row_num) { return Status::Error( - "{} size invalid, expect={}, real={}", new_block->get_by_position(idx).name, - row_size, ref_block->get_by_position(result_column_id).column->size()); + "result size invalid, expect={}, real={}; input expr={}", row_num, + result_tmp_column_def.column->size(), + apache::thrift::ThriftDebugString(*expr)); + } + + if (_type == SCHEMA_CHANGE) { + // dangerous casts (expected to be rejected by the upstream caller) may cause data to be null and result in data loss in schema change + // for rollup, this check is unnecessary, and ref columns are not set in this case; it works on exprs + + // column_idx in base schema + int32_t ref_column_idx = _schema_mapping[idx].ref_column_idx; + DCHECK_GE(ref_column_idx, 0); + auto& ref_column_def = ref_block->get_by_position(ref_column_idx); + RETURN_IF_ERROR( + _check_cast_valid(ref_column_def.column, result_tmp_column_def.column)); } - RETURN_IF_ERROR(_check_cast_valid(ref_block->get_by_position(idx).column, - ref_block->get_by_position(result_column_id).column, - _type)); - swap_idx_list.emplace_back(result_column_id, idx); - } else if (_schema_mapping[idx].ref_column < 0) { + swap_idx_list.emplace_back(result_tmp_column_idx, idx); + } else if (_schema_mapping[idx].ref_column_idx < 0) { // new column, write default value auto* value = _schema_mapping[idx].default_value; auto column = new_block->get_by_position(idx).column->assume_mutable(); if (value->is_null()) { DCHECK(column->is_nullable()); - column->insert_many_defaults(row_size); + column->insert_many_defaults(row_num); } else { auto type_info = get_type_info(_schema_mapping[idx].new_column); DefaultValueColumnIterator::insert_default_data(type_info.get(), value->size(), - value->ptr(), column, row_size); + value->ptr(), column, row_num); } } else { // same type, just swap column - swap_idx_list.emplace_back(_schema_mapping[idx].ref_column, idx); + swap_idx_list.emplace_back(_schema_mapping[idx].ref_column_idx, idx); } } @@ -367,78 +383,90 @@ Status BlockChanger::change_block(vectorized::Block* ref_block, return Status::OK(); } -// This check is to prevent schema-change from causing data loss Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr ref_column, -
vectorized::ColumnPtr new_column, AlterTabletType type) { - if (ref_column->size() != new_column->size()) { +// This check can prevent schema-change from causing data loss after type cast +Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr input_column, + vectorized::ColumnPtr output_column) { + if (input_column->size() != output_column->size()) { return Status::InternalError( - "column size is changed, ref_column_size={}, new_column_size={}", - ref_column->size(), new_column->size()); - } - if (type == ROLLUP) { - return Status::OK(); + "column size is changed, input_column_size={}, output_column_size={}; " + "input_column={}", + input_column->size(), output_column->size(), input_column->get_name()); } - if (ref_column->is_nullable() != new_column->is_nullable()) { - if (ref_column->is_nullable()) { + DCHECK_EQ(input_column->size(), output_column->size()) + << "length check should have been done before calling this function!"; + + if (input_column->is_nullable() != output_column->is_nullable()) { + if (input_column->is_nullable()) { const auto* ref_null_map = - vectorized::check_and_get_column(ref_column) + vectorized::check_and_get_column(input_column) ->get_null_map_column() .get_data() .data(); bool is_changed = false; - for (size_t i = 0; i < ref_column->size(); i++) { + for (size_t i = 0; i < input_column->size(); i++) { is_changed |= ref_null_map[i]; } if (is_changed) { - return Status::DataQualityError("Null data is changed to not nullable"); + return Status::DataQualityError( + "some null data is changed to not null, input_column={}", + input_column->get_name()); } } else { const auto& null_map_column = - vectorized::check_and_get_column(new_column) + vectorized::check_and_get_column(output_column) ->get_null_map_column(); const auto& nested_column = - vectorized::check_and_get_column(new_column) + vectorized::check_and_get_column(output_column) ->get_nested_column(); const auto* new_null_map = null_map_column.get_data().data(); - if (null_map_column.size() != new_column->size() || - nested_column.size() != new_column->size()) { - DCHECK(false); + if (null_map_column.size() != output_column->size()) { return Status::InternalError( - "null_map_column size is changed, null_map_column_size={}, " - "new_column_size={}", - null_map_column.size(), new_column->size()); + "null_map_column size mismatches output_column_size, " + "null_map_column_size={}, output_column_size={}; input_column={}", + null_map_column.size(), output_column->size(), input_column->get_name()); + } + + if (nested_column.size() != output_column->size()) { + return Status::InternalError( + "nested_column size is changed, nested_column_size={}, " + "output_column_size={}; input_column={}", + nested_column.size(), output_column->size(), input_column->get_name()); } bool is_changed = false; - for (size_t i = 0; i < ref_column->size(); i++) { + for (size_t i = 0; i < input_column->size(); i++) { is_changed |= new_null_map[i]; } if (is_changed) { - return Status::DataQualityError("Some data is changed to null"); + return Status::DataQualityError( + "some not null data is changed to null, input_column={}", + input_column->get_name()); } } } - if (ref_column->is_nullable() && new_column->is_nullable()) { + if (input_column->is_nullable() && output_column->is_nullable()) { const auto* ref_null_map = - vectorized::check_and_get_column(ref_column) + vectorized::check_and_get_column(input_column) ->get_null_map_column() .get_data() .data(); const auto* new_null_map = - vectorized::check_and_get_column(new_column) +
vectorized::check_and_get_column(output_column) ->get_null_map_column() .get_data() .data(); bool is_changed = false; - for (size_t i = 0; i < ref_column->size(); i++) { + for (size_t i = 0; i < input_column->size(); i++) { is_changed |= (ref_null_map[i] != new_null_map[i]); } if (is_changed) { - return Status::DataQualityError("is_null of data is changed!"); + return Status::DataQualityError( + "null map is changed after calculation, input_column={}", + input_column->get_name()); } } return Status::OK(); @@ -1202,6 +1230,8 @@ Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, ColumnMapping* column_mapping = changer->get_mutable_column_mapping(i); column_mapping->new_column = &new_column; + column_mapping->ref_column_idx = base_tablet_schema->field_index(new_column.name()); + if (materialized_function_map.find(column_name_lower) != materialized_function_map.end()) { auto mv_param = materialized_function_map.find(column_name_lower)->second; column_mapping->expr = mv_param.expr; @@ -1210,9 +1240,7 @@ Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, } } - int32_t column_index = base_tablet_schema->field_index(new_column.name()); - if (column_index >= 0) { - column_mapping->ref_column = column_index; + if (column_mapping->ref_column_idx >= 0) { continue; } @@ -1235,7 +1263,7 @@ Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, return Status::InternalError("failed due to operate on shadow column"); } // Newly added column go here - column_mapping->ref_column = -1; + column_mapping->ref_column_idx = -1; if (i < base_tablet_schema->num_short_key_columns()) { *sc_directly = true; @@ -1264,7 +1292,7 @@ Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, continue; } - if (column_mapping->ref_column != i - num_default_value) { + if (column_mapping->ref_column_idx != i - num_default_value) { *sc_sorting = true; return Status::OK(); } @@ -1317,8 +1345,8 @@ Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, } // if new tablet enable row store, or new tablet has different row store columns - if ((!base_tablet_schema->have_column(BeConsts::ROW_STORE_COL) && - new_tablet_schema->have_column(BeConsts::ROW_STORE_COL)) || + if ((!base_tablet_schema->exist_column(BeConsts::ROW_STORE_COL) && + new_tablet_schema->exist_column(BeConsts::ROW_STORE_COL)) || !std::equal(new_tablet_schema->row_columns_uids().begin(), new_tablet_schema->row_columns_uids().end(), base_tablet_schema->row_columns_uids().begin(), @@ -1331,9 +1359,9 @@ Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, if (column_mapping->expr != nullptr) { *sc_directly = true; return Status::OK(); - } else if (column_mapping->ref_column >= 0) { + } else if (column_mapping->ref_column_idx >= 0) { const auto& column_new = new_tablet_schema->column(i); - const auto& column_old = base_tablet_schema->column(column_mapping->ref_column); + const auto& column_old = base_tablet_schema->column(column_mapping->ref_column_idx); // index changed if (column_new.is_bf_column() != column_old.is_bf_column() || column_new.has_bitmap_index() != column_old.has_bitmap_index() || diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h index eb0f046270db2bf..c29cb49a7aaece8 100644 --- a/be/src/olap/schema_change.h +++ b/be/src/olap/schema_change.h @@ -87,7 +87,7 @@ class BlockChanger { private: static Status _check_cast_valid(vectorized::ColumnPtr ref_column, - vectorized::ColumnPtr new_column, AlterTabletType type); + 
vectorized::ColumnPtr new_column); // @brief column-mapping specification of new schema SchemaMapping _schema_mapping; @@ -117,8 +117,8 @@ class SchemaChange { _filtered_rows = 0; _merged_rows = 0; - RETURN_IF_ERROR(_inner_process(rowset_reader, rowset_writer, new_tablet, base_tablet_schema, - new_tablet_schema)); + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(_inner_process(rowset_reader, rowset_writer, new_tablet, + base_tablet_schema, new_tablet_schema)); // Check row num changes if (!_check_row_nums(rowset_reader, *rowset_writer)) { diff --git a/be/src/olap/shared_predicate.h b/be/src/olap/shared_predicate.h index 41b18e99ba470fb..2a83f7ef4344f31 100644 --- a/be/src/olap/shared_predicate.h +++ b/be/src/olap/shared_predicate.h @@ -167,9 +167,9 @@ class SharedPredicate : public ColumnPredicate { std::string _debug_string() const override { std::shared_lock lock(_mtx); if (!_nested) { - return "shared_predicate"; + return "shared_predicate(unknown)"; } - return "shared_predicate<" + _nested->debug_string() + ">"; + return "shared_predicate(" + _nested->debug_string() + ")"; } mutable std::shared_mutex _mtx; diff --git a/be/src/olap/simple_rowid_conversion.h b/be/src/olap/simple_rowid_conversion.h new file mode 100644 index 000000000000000..1a89b01838fe8ca --- /dev/null +++ b/be/src/olap/simple_rowid_conversion.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
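The `_check_cast_valid` rework above reduces to a single invariant: a schema-change cast may neither turn a value into null nor a null into a value, so the null map must be identical before and after the cast. A standalone sketch of that invariant over plain null maps (illustrative stand-ins, not the Doris column types):

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // A null map marks each row: 1 = null, 0 = has a value.
    using NullMap = std::vector<uint8_t>;

    // Returns true when the cast preserved every row's nullness.
    // Mirrors the rule enforced by the cast check: sizes must match,
    // and no row may flip between null and not-null.
    bool cast_preserves_nullness(const NullMap& input, const NullMap& output) {
        if (input.size() != output.size()) {
            throw std::invalid_argument("column size changed during cast");
        }
        bool changed = false;
        for (size_t i = 0; i < input.size(); ++i) {
            changed |= (input[i] != output[i]); // accumulate like the BE loop, no early exit
        }
        return !changed;
    }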
+ +#pragma once + +#include +#include + +#include "olap/olap_common.h" +#include "olap/utils.h" + +namespace doris { + +// Simple version of rowid conversion, for segcompaction +// converts rows from several segments to rows in one segment +class SimpleRowIdConversion { +public: + SimpleRowIdConversion(const RowsetId& rowset_id) : _rowset_id(rowset_id) {} + ~SimpleRowIdConversion() = default; + + // resize each segment's rowid map to its row count + void reset_segment_map(const std::map& num_rows) { + _cur_dst_segment_rowid = 0; + for (auto seg_rows : num_rows) { + _segments_rowid_map.emplace(seg_rows.first, + std::vector(seg_rows.second, UINT32_MAX)); + } + } + + // add row id to the map + void add(const std::vector& rss_row_ids) { + for (auto& item : rss_row_ids) { + if (item.row_id == -1) { + continue; + } + DCHECK(_segments_rowid_map.find(item.segment_id) != _segments_rowid_map.end() && + _segments_rowid_map[item.segment_id].size() > item.row_id); + _segments_rowid_map[item.segment_id][item.row_id] = _cur_dst_segment_rowid++; + } + } + + // get destination row id + // return -1 if the src RowLocation does not exist + int get(const RowLocation& src) const { + auto it = _segments_rowid_map.find(src.segment_id); + if (it == _segments_rowid_map.end()) { + return -1; + } + const auto& rowid_map = it->second; + if (src.row_id >= rowid_map.size() || UINT32_MAX == rowid_map[src.row_id]) { + return -1; + } + + return rowid_map[src.row_id]; + } + +private: + // key: index indicates src segment. + // value: indexed by row id of the source segment; the element is the row id in the + // destination segment. UINT32_MAX indicates the current row does not exist. + std::map> _segments_rowid_map; + + // dst rowset id + RowsetId _rowset_id; + + // current rowid of dst segment + std::uint32_t _cur_dst_segment_rowid = 0; +}; + +} // namespace doris diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index dfdc0132eeccfee..7e3d83863715072 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -404,20 +404,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, return Status::InternalError("single compaction init curl failed"); } for (auto& file_name : file_name_list) { - // The file name of the variant column with the inverted index contains % - // such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx - // {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx - // We need to handle %, otherwise it will cause an HTTP 404 error. - // Because the percent ("%") character serves as the indicator for percent-encoded octets, - // it must be percent-encoded as "%25" for that octet to be used as data within a URI.
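The SimpleRowIdConversion class above is easiest to follow as a sequence: size the per-segment maps, record surviving rows in output order, then resolve lookups. The self-contained walk-through below re-creates that bookkeeping with plain containers; the segment sizes and surviving rows are invented for illustration:

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <utility>
    #include <vector>

    int main() {
        // reset_segment_map: one slot per source row, UINT32_MAX = "dropped"
        std::map<uint32_t, std::vector<uint32_t>> rowid_map;
        rowid_map[0].assign(3, UINT32_MAX); // segment 0 had 3 rows
        rowid_map[1].assign(2, UINT32_MAX); // segment 1 had 2 rows

        // add: surviving (segment, row) pairs arrive in destination order
        uint32_t next_dst_rowid = 0;
        std::vector<std::pair<uint32_t, uint32_t>> survivors = {{0, 0}, {0, 2}, {1, 1}};
        for (auto [seg, row] : survivors) {
            rowid_map[seg][row] = next_dst_rowid++;
        }

        // get: resolve old locations against the map
        assert(rowid_map[0][2] == 1);          // (seg 0, row 2) -> dst row 1
        assert(rowid_map[0][1] == UINT32_MAX); // (seg 0, row 1) was compacted away
        return 0;
    }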
- // https://datatracker.ietf.org/doc/html/rfc3986 - auto output = std::unique_ptr( - curl_easy_escape(curl.get(), file_name.c_str(), file_name.length()), &curl_free); - if (!output) { - return Status::InternalError("escape file name failed, file name={}", file_name); - } - std::string encoded_filename(output.get()); - auto remote_file_url = remote_url_prefix + encoded_filename; + auto remote_file_url = remote_url_prefix + file_name; // get file length uint64_t file_size = 0; diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp index ac4f5ee9728b002..1aa0229ee6523ce 100644 --- a/be/src/olap/snapshot_manager.cpp +++ b/be/src/olap/snapshot_manager.cpp @@ -83,6 +83,9 @@ Status SnapshotManager::make_snapshot(const TSnapshotRequest& request, string* s } TabletSharedPtr ref_tablet = _engine.tablet_manager()->get_tablet(request.tablet_id); + + DBUG_EXECUTE_IF("SnapshotManager::make_snapshot.inject_failure", { ref_tablet = nullptr; }) + if (ref_tablet == nullptr) { return Status::Error("failed to get tablet. tablet={}", request.tablet_id); } diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index ae820364b89b2af..ebf40c90bea35b6 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -56,6 +56,8 @@ #include "olap/memtable_flush_executor.h" #include "olap/olap_common.h" #include "olap/olap_define.h" +#include "olap/olap_meta.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" #include "olap/rowset/unique_rowset_id_generator.h" @@ -859,6 +861,9 @@ Status StorageEngine::start_trash_sweep(double* usage, bool ignore_guard) { // cleand unused pending publish info for deleted tablet _clean_unused_pending_publish_info(); + // clean unused partial update info for finished txns + _clean_unused_partial_update_info(); + // clean unused rowsets in remote storage backends for (auto data_dir : get_stores()) { data_dir->perform_remote_rowset_gc(); @@ -1022,6 +1027,34 @@ void StorageEngine::_clean_unused_pending_publish_info() { } } +void StorageEngine::_clean_unused_partial_update_info() { + std::vector> remove_infos; + auto unused_partial_update_info_collector = + [this, &remove_infos](int64_t tablet_id, int64_t partition_id, int64_t txn_id, + std::string_view value) -> bool { + TabletSharedPtr tablet = _tablet_manager->get_tablet(tablet_id); + if (tablet == nullptr) { + remove_infos.emplace_back(tablet_id, partition_id, txn_id); + return true; + } + TxnState txn_state = + _txn_manager->get_txn_state(partition_id, txn_id, tablet_id, tablet->tablet_uid()); + if (txn_state == TxnState::NOT_FOUND || txn_state == TxnState::ABORTED || + txn_state == TxnState::DELETED) { + remove_infos.emplace_back(tablet_id, partition_id, txn_id); + return true; + } + return true; + }; + auto data_dirs = get_stores(); + for (auto* data_dir : data_dirs) { + static_cast(RowsetMetaManager::traverse_partial_update_info( + data_dir->get_meta(), unused_partial_update_info_collector)); + static_cast( + RowsetMetaManager::remove_partial_update_infos(data_dir->get_meta(), remove_infos)); + } +} + void StorageEngine::gc_binlogs(const std::unordered_map& gc_tablet_infos) { for (auto [tablet_id, version] : gc_tablet_infos) { LOG(INFO) << fmt::format("start to gc binlogs for tablet_id: {}, version: {}", tablet_id, diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 3105c4f53d0aae6..d7ccd4597d6ef31 100644 --- a/be/src/olap/storage_engine.h +++ 
b/be/src/olap/storage_engine.h @@ -348,6 +348,8 @@ class StorageEngine final : public BaseStorageEngine { void _clean_unused_pending_publish_info(); + void _clean_unused_partial_update_info(); + Status _do_sweep(const std::string& scan_root, const time_t& local_tm_now, const int32_t expire); @@ -361,9 +363,6 @@ class StorageEngine final : public BaseStorageEngine { // delete tablet with io error process function void _disk_stat_monitor_thread_callback(); - // clean file descriptors cache - void _cache_clean_callback(); - // path gc process function void _path_gc_thread_callback(DataDir* data_dir); @@ -528,9 +527,6 @@ class StorageEngine final : public BaseStorageEngine { scoped_refptr _async_publish_thread; std::shared_mutex _async_publish_lock; - bool _clear_segment_cache = false; - bool _clear_page_cache = false; - std::atomic _need_clean_trash {false}; // next index for create tablet diff --git a/be/src/olap/storage_policy.cpp b/be/src/olap/storage_policy.cpp index c553d2b7441aeca..837e9bed178e3a9 100644 --- a/be/src/olap/storage_policy.cpp +++ b/be/src/olap/storage_policy.cpp @@ -191,6 +191,37 @@ std::string StorageResource::remote_segment_path(const RowsetMeta& rowset, int64 } } +std::string StorageResource::remote_idx_v1_path(const RowsetMeta& rowset, int64_t seg_id, + int64_t index_id, + std::string_view index_path_suffix) const { + std::string suffix = + index_path_suffix.empty() ? "" : std::string {"@"} + index_path_suffix.data(); + switch (path_version) { + case 0: + return fmt::format("{}/{}/{}_{}_{}{}.idx", DATA_PREFIX, rowset.tablet_id(), + rowset.rowset_id().to_string(), seg_id, index_id, suffix); + case 1: + return fmt::format("{}/{}/{}/{}/{}_{}{}.idx", DATA_PREFIX, shard_fn(rowset.tablet_id()), + rowset.tablet_id(), rowset.rowset_id().to_string(), seg_id, index_id, + suffix); + default: + exit_at_unknown_path_version(fs->id(), path_version); + } +} + +std::string StorageResource::remote_idx_v2_path(const RowsetMeta& rowset, int64_t seg_id) const { + switch (path_version) { + case 0: + return fmt::format("{}/{}/{}_{}.idx", DATA_PREFIX, rowset.tablet_id(), + rowset.rowset_id().to_string(), seg_id); + case 1: + return fmt::format("{}/{}/{}/{}/{}.idx", DATA_PREFIX, shard_fn(rowset.tablet_id()), + rowset.tablet_id(), rowset.rowset_id().to_string(), seg_id); + default: + exit_at_unknown_path_version(fs->id(), path_version); + } +} + std::string StorageResource::remote_tablet_path(int64_t tablet_id) const { switch (path_version) { case 0: diff --git a/be/src/olap/storage_policy.h b/be/src/olap/storage_policy.h index 9eb27773272704d..f79b1a052095cab 100644 --- a/be/src/olap/storage_policy.h +++ b/be/src/olap/storage_policy.h @@ -77,6 +77,11 @@ struct StorageResource { int64_t seg_id) const; std::string remote_segment_path(const RowsetMeta& rowset, int64_t seg_id) const; std::string remote_tablet_path(int64_t tablet_id) const; + + std::string remote_idx_v1_path(const RowsetMeta& rowset, int64_t seg_id, int64_t index_id, + std::string_view index_suffix) const; + std::string remote_idx_v2_path(const RowsetMeta& rowset, int64_t seg_id) const; + std::string cooldown_tablet_meta_path(int64_t tablet_id, int64_t replica_id, int64_t cooldown_term) const; }; diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 671fa1396efe70f..48e1efb4e6b2b6b 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -148,6 +148,8 @@ namespace { bvar::Adder exceed_version_limit_counter; bvar::Window> exceed_version_limit_counter_minute( &exceed_version_limit_counter, 60); +bvar::Adder 
cooldown_pending_task("cooldown_pending_task"); +bvar::Adder cooldown_processing_task("cooldown_processing_task"); void set_last_failure_time(Tablet* tablet, const Compaction& compaction, int64_t ms) { switch (compaction.compaction_type()) { @@ -168,6 +170,8 @@ void set_last_failure_time(Tablet* tablet, const Compaction& compaction, int64_t } // namespace +bvar::Adder unused_remote_rowset_num("unused_remote_rowset_num"); + WriteCooldownMetaExecutors::WriteCooldownMetaExecutors(size_t executor_nums) : _executor_nums(executor_nums) { for (size_t i = 0; i < _executor_nums; i++) { @@ -230,8 +234,13 @@ void WriteCooldownMetaExecutors::WriteCooldownMetaExecutors::submit(TabletShared VLOG_DEBUG << "tablet " << t->tablet_id() << " is not cooldown replica"; }; - _executors[_get_executor_pos(tablet_id)]->offer( - [task = std::move(async_write_task)]() { task(); }); + cooldown_pending_task << 1; + _executors[_get_executor_pos(tablet_id)]->offer([task = std::move(async_write_task)]() { + cooldown_pending_task << -1; + cooldown_processing_task << 1; + task(); + cooldown_processing_task << -1; + }); } Tablet::Tablet(StorageEngine& engine, TabletMetaSharedPtr tablet_meta, DataDir* data_dir, @@ -861,6 +870,14 @@ Status Tablet::capture_consistent_versions_unlocked(const Version& spec_version, } } } + + DBUG_EXECUTE_IF("TTablet::capture_consistent_versions.inject_failure", { + auto tablet_id = dp->param("tablet_id", -1); + if (tablet_id != -1 && tablet_id == _tablet_meta->tablet_id()) { + status = Status::Error("version already merged"); + } + }); + return status; } @@ -1713,7 +1730,13 @@ Status Tablet::prepare_compaction_and_calculate_permits( } } - permits = compaction->get_compaction_permits(); + // Time series policy does not rely on permits, it uses goal size to control memory + if (tablet->tablet_meta()->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY) { + // permits = 0 means that prepare_compaction failed + permits = 1; + } else { + permits = compaction->get_compaction_permits(); + } return Status::OK(); } @@ -2373,6 +2396,7 @@ void Tablet::record_unused_remote_rowset(const RowsetId& rowset_id, const std::s LOG(WARNING) << "failed to record unused remote rowset. 
tablet_id=" << tablet_id() << " rowset_id=" << rowset_id << " resource_id=" << resource; } + unused_remote_rowset_num << 1; } Status Tablet::remove_all_remote_rowsets() { @@ -2636,14 +2660,21 @@ Status Tablet::ingest_binlog_metas(RowsetBinlogMetasPB* metas_pb) { } void Tablet::clear_cache() { - std::shared_lock rlock(get_header_lock()); - static auto recycle_segment_cache = [](const auto& rowset_map) { - for (auto& [_, rowset] : rowset_map) { - rowset->clear_cache(); + std::vector rowsets; + { + std::shared_lock rlock(get_header_lock()); + SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); + + for (auto& [_, rowset] : rowset_map()) { + rowsets.push_back(rowset); } - }; - recycle_segment_cache(rowset_map()); - recycle_segment_cache(stale_rowset_map()); + for (auto& [_, rowset] : stale_rowset_map()) { + rowsets.push_back(rowset); + } + } + for (auto& rowset : rowsets) { + rowset->clear_cache(); + } } } // namespace doris diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index fa11c2d868569f4..4cd2a355586311e 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -51,6 +51,11 @@ #include "util/once.h" #include "util/slice.h" +namespace bvar { +template +class Adder; +} + namespace doris { class Tablet; @@ -78,6 +83,8 @@ enum SortType : int; enum TabletStorageType { STORAGE_TYPE_LOCAL, STORAGE_TYPE_REMOTE, STORAGE_TYPE_REMOTE_AND_LOCAL }; +extern bvar::Adder unused_remote_rowset_num; + static inline constexpr auto TRACE_TABLET_LOCK_THRESHOLD = std::chrono::seconds(1); struct WriteCooldownMetaExecutors { diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index a234ab93a476385..6696dcf2e68df25 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -226,7 +226,7 @@ Status TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id, // If the new tablet is fresher than the existing one, then replace // the existing tablet with the new one. // Use default replica_id to ignore whether replica_id is match when drop tablet. - Status status = _drop_tablet_unlocked(tablet_id, /* replica_id */ 0, keep_files, false); + Status status = _drop_tablet(tablet_id, /* replica_id */ 0, keep_files, false, true); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropOldTablet", "AddTablet"), static_cast(watch.reset())); RETURN_NOT_OK_STATUS_WITH_WARN( @@ -438,7 +438,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked( } // something is wrong, we need clear environment if (is_tablet_added) { - Status status = _drop_tablet_unlocked(new_tablet_id, request.replica_id, false, false); + Status status = _drop_tablet(new_tablet_id, request.replica_id, false, false, true); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropTablet", parent_timer_name), static_cast(watch.reset())); if (!status.ok()) { @@ -522,14 +522,12 @@ TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked( Status TabletManager::drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool is_drop_table_or_partition) { - auto& shard = _get_tablets_shard(tablet_id); - std::lock_guard wrlock(shard.lock); - return _drop_tablet_unlocked(tablet_id, replica_id, false, is_drop_table_or_partition); + return _drop_tablet(tablet_id, replica_id, false, is_drop_table_or_partition, false); } // Drop specified tablet. 
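The `clear_cache` rework in the tablet.cpp hunk above is a standard lock-shortening move: copy the rowset shared_ptrs while holding the header read lock, release it, then run the slow per-rowset eviction on the snapshot. Reduced to a sketch with stand-in types:

    #include <memory>
    #include <mutex>
    #include <shared_mutex>
    #include <vector>

    struct Rowset {
        void clear_cache() { /* slow cache eviction elided */ }
    };

    std::shared_mutex header_lock;
    std::vector<std::shared_ptr<Rowset>> live_rowsets; // stands in for rowset_map()

    void clear_cache_with_short_lock() {
        std::vector<std::shared_ptr<Rowset>> snapshot;
        {
            std::shared_lock rlock(header_lock); // held only while copying pointers
            snapshot = live_rowsets;
        }
        for (auto& rs : snapshot) { // heavy work runs without the header lock
            rs->clear_cache();
        }
    }

The shared_ptr copies keep every rowset alive even if it is removed from the maps concurrently, so the eviction loop never touches a dangling object.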
-Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id, - bool keep_files, bool is_drop_table_or_partition) { +Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, + bool is_drop_table_or_partition, bool had_held_shard_lock) { LOG(INFO) << "begin drop tablet. tablet_id=" << tablet_id << ", replica_id=" << replica_id << ", is_drop_table_or_partition=" << is_drop_table_or_partition; DorisMetrics::instance()->drop_tablet_requests_total->increment(1); @@ -538,23 +536,31 @@ Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId repl Defer defer {[&]() { unregister_transition_tablet(tablet_id, "drop tablet"); }}; // Fetch tablet which need to be dropped - TabletSharedPtr to_drop_tablet = _get_tablet_unlocked(tablet_id); - if (to_drop_tablet == nullptr) { - LOG(WARNING) << "fail to drop tablet because it does not exist. " - << "tablet_id=" << tablet_id; - return Status::OK(); - } + TabletSharedPtr to_drop_tablet; + { + std::unique_lock wlock(_get_tablets_shard_lock(tablet_id), + std::defer_lock); + if (!had_held_shard_lock) { + wlock.lock(); + } + to_drop_tablet = _get_tablet_unlocked(tablet_id); + if (to_drop_tablet == nullptr) { + LOG(WARNING) << "fail to drop tablet because it does not exist. " + << "tablet_id=" << tablet_id; + return Status::OK(); + } - // We should compare replica id to avoid dropping new cloned tablet. - // Iff request replica id is 0, FE may be an older release, then we drop this tablet as before. - if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) { - return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(), - replica_id); - } + // We should compare replica id to avoid dropping new cloned tablet. + // Iff request replica id is 0, FE may be an older release, then we drop this tablet as before. 
+ if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) { + return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(), + replica_id); + } - _remove_tablet_from_partition(to_drop_tablet); - tablet_map_t& tablet_map = _get_tablet_map(tablet_id); - tablet_map.erase(tablet_id); + _remove_tablet_from_partition(to_drop_tablet); + tablet_map_t& tablet_map = _get_tablet_map(tablet_id); + tablet_map.erase(tablet_id); + } to_drop_tablet->clear_cache(); diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h index 809a2237356dd79..42623cf05f2aea8 100644 --- a/be/src/olap/tablet_manager.h +++ b/be/src/olap/tablet_manager.h @@ -194,8 +194,8 @@ class TabletManager { bool _check_tablet_id_exist_unlocked(TTabletId tablet_id); - Status _drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, - bool is_drop_table_or_partition); + Status _drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, + bool is_drop_table_or_partition, bool had_held_shard_lock); TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id); TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id, bool include_deleted, diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index a3526781dddd879..ed9a446551d00e6 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -1080,6 +1080,16 @@ bool DeleteBitmap::contains_agg_without_cache(const BitmapKey& bmk, uint32_t row return false; } +void DeleteBitmap::remove_sentinel_marks() { + for (auto it = delete_bitmap.begin(), end = delete_bitmap.end(); it != end;) { + if (std::get<1>(it->first) == DeleteBitmap::INVALID_SEGMENT_ID) { + it = delete_bitmap.erase(it); + } else { + ++it; + } + } +} + int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) { std::lock_guard l(lock); auto [_, inserted] = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap); diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 32c6fde568c87b2..41455c051c7f44d 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -192,9 +192,6 @@ class TabletMeta { void revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap); const std::vector& all_stale_rs_metas() const; - // return the snapshot of rowset_meta - // the return value is map - std::unordered_map snapshot_rs_metas() const; RowsetMetaSharedPtr acquire_rs_meta_by_version(const Version& version) const; void delete_stale_rs_meta_by_version(const Version& version); RowsetMetaSharedPtr acquire_stale_rs_meta_by_version(const Version& version) const; @@ -516,6 +513,8 @@ class DeleteBitmap { */ std::shared_ptr get_agg(const BitmapKey& bmk) const; + void remove_sentinel_marks(); + class AggCachePolicy : public LRUCachePolicyTrackingManual { public: AggCachePolicy(size_t capacity) @@ -696,15 +695,6 @@ inline bool TabletMeta::all_beta() const { return true; } -inline std::unordered_map TabletMeta::snapshot_rs_metas() const { - std::unordered_map id_to_rowset_meta_map; - std::shared_lock rlock(_meta_lock); - std::for_each(_rs_metas.cbegin(), _rs_metas.cend(), [&](const auto& rowset_meta) { - id_to_rowset_meta_map.emplace(rowset_meta->rowset_id().to_string(), rowset_meta); - }); - return id_to_rowset_meta_map; -} - std::string tablet_state_name(TabletState state); // Only for unit test now. 
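The `_drop_tablet` change above merges the externally-locked and self-locking entry points into one function by constructing the guard with std::defer_lock and locking conditionally. The pattern in isolation, as a sketch:

    #include <mutex>
    #include <shared_mutex>

    std::shared_mutex shard_lock;

    void drop_tablet_impl(bool had_held_shard_lock) {
        // Guard starts unlocked; take the lock only if the caller does not
        // already hold it. If taken, it is released at scope exit as usual.
        std::unique_lock<std::shared_mutex> wlock(shard_lock, std::defer_lock);
        if (!had_held_shard_lock) {
            wlock.lock();
        }
        // ... erase the tablet from the shard's map under the lock ...
    }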
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index b3dde488674bea3..095439e4d5b3931 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -1289,6 +1289,10 @@ void TabletSchema::update_indexes_from_thrift(const std::vector( @@ -1472,7 +1476,7 @@ vectorized::Block TabletSchema::create_block(bool ignore_dropped_col) const { return block; } -vectorized::Block TabletSchema::create_block_by_cids(const std::vector& cids) { +vectorized::Block TabletSchema::create_block_by_cids(const std::vector& cids) const { vectorized::Block block; for (const auto& cid : cids) { const auto& col = *_cols[cid]; diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 8cf6e20208c90f6..251c0b58eacaf77 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -318,6 +318,7 @@ class TabletSchema { const TabletColumn& column(size_t ordinal) const; Result column(const std::string& field_name) const; Status have_column(const std::string& field_name) const; + bool exist_column(const std::string& field_name) const; const TabletColumn& column_by_uid(int32_t col_unique_id) const; TabletColumn& mutable_column_by_uid(int32_t col_unique_id); TabletColumn& mutable_column(size_t ordinal); @@ -476,7 +477,7 @@ class TabletSchema { return str; } - vectorized::Block create_block_by_cids(const std::vector& cids); + vectorized::Block create_block_by_cids(const std::vector& cids) const; std::shared_ptr copy_without_variant_extracted_columns(); InvertedIndexStorageFormatPB get_inverted_index_storage_format() const { diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 40b789cf873fcb5..1fc5b7278c6a5c5 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -190,7 +190,7 @@ Status EngineCloneTask::_do_clone() { tablet->replica_id(), false)); tablet.reset(); } - bool is_new_tablet = tablet == nullptr; + _is_new_tablet = tablet == nullptr; // try to incremental clone Versions missed_versions; // try to repair a tablet with missing version @@ -228,7 +228,7 @@ Status EngineCloneTask::_do_clone() { if (missed_versions.empty()) { LOG(INFO) << "missed version size = 0, skip clone and return success. tablet_id=" << _clone_req.tablet_id << " replica_id=" << _clone_req.replica_id; - RETURN_IF_ERROR(_set_tablet_info(is_new_tablet)); + RETURN_IF_ERROR(_set_tablet_info()); return Status::OK(); } @@ -307,10 +307,11 @@ Status EngineCloneTask::_do_clone() { TabletMeta::construct_header_file_path(tablet_dir, _clone_req.tablet_id); RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path)); } - return _set_tablet_info(is_new_tablet); + + return _set_tablet_info(); } -Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { +Status EngineCloneTask::_set_tablet_info() { // Get clone tablet info TTabletInfo tablet_info; tablet_info.__set_tablet_id(_clone_req.tablet_id); @@ -320,7 +321,7 @@ Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { if (_clone_req.__isset.version && tablet_info.version < _clone_req.version) { // if it is a new tablet and clone failed, then remove the tablet // if it is incremental clone, then must not drop the tablet - if (is_new_tablet) { + if (_is_new_tablet) { // we need to check if this cloned table's version is what we expect. // if not, maybe this is a stale remaining table which is waiting for drop. // we drop it. 
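DeleteBitmap::remove_sentinel_marks in the tablet_meta.cpp hunk earlier uses the erase-while-iterating idiom for ordered maps, where map::erase returns the iterator following the removed element. A minimal standalone rendition (the sentinel key is simplified to a plain constant):

    #include <cassert>
    #include <cstdint>
    #include <map>

    constexpr uint32_t kInvalidSegmentId = UINT32_MAX; // stand-in sentinel key

    int main() {
        std::map<uint32_t, int> bitmap {{0, 1}, {7, 3}, {kInvalidSegmentId, 2}};

        // erase() hands back the next iterator, so the loop advances safely
        // past removed entries; ++it only runs when nothing was erased.
        for (auto it = bitmap.begin(), end = bitmap.end(); it != end;) {
            if (it->first == kInvalidSegmentId) {
                it = bitmap.erase(it);
            } else {
                ++it;
            }
        }
        assert(bitmap.size() == 2);
        return 0;
    }

Caching end in the loop header is safe here because std::map::erase invalidates only iterators to the erased element.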
@@ -522,26 +523,8 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re uint64_t total_file_size = 0; MonotonicStopWatch watch; watch.start(); - auto curl = std::unique_ptr(curl_easy_init(), - &curl_easy_cleanup); - if (!curl) { - return Status::InternalError("engine clone task init curl failed"); - } for (auto& file_name : file_name_list) { - // The file name of the variant column with the inverted index contains % - // such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx - // {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx - // We need to handle %, otherwise it will cause an HTTP 404 error. - // Because the percent ("%") character serves as the indicator for percent-encoded octets, - // it must be percent-encoded as "%25" for that octet to be used as data within a URI. - // https://datatracker.ietf.org/doc/html/rfc3986 - auto output = std::unique_ptr( - curl_easy_escape(curl.get(), file_name.c_str(), file_name.length()), &curl_free); - if (!output) { - return Status::InternalError("escape file name failed, file name={}", file_name); - } - std::string encoded_filename(output.get()); - auto remote_file_url = remote_url_prefix + encoded_filename; + auto remote_file_url = remote_url_prefix + file_name; // get file length uint64_t file_size = 0; diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index 71dc3a817b8a13e..3161b803c82db15 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -55,6 +55,8 @@ class EngineCloneTask final : public EngineTask { std::vector* tablet_infos); ~EngineCloneTask() override = default; + bool is_new_tablet() const { return _is_new_tablet; } + private: Status _do_clone(); @@ -71,7 +73,7 @@ class EngineCloneTask final : public EngineTask { const std::vector& missing_versions, bool* allow_incremental_clone); - Status _set_tablet_info(bool is_new_tablet); + Status _set_tablet_info(); // Download tablet files from Status _download_files(DataDir* data_dir, const std::string& remote_url_prefix, @@ -95,6 +97,7 @@ class EngineCloneTask final : public EngineTask { int64_t _copy_size; int64_t _copy_time_ms; std::vector _pending_rs_guards; + bool _is_new_tablet = false; }; // EngineTask -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/olap/task/engine_publish_version_task.cpp b/be/src/olap/task/engine_publish_version_task.cpp index acdcebae165c6f4..09238f570b7567f 100644 --- a/be/src/olap/task/engine_publish_version_task.cpp +++ b/be/src/olap/task/engine_publish_version_task.cpp @@ -111,6 +111,20 @@ Status EnginePublishVersionTask::execute() { std::this_thread::sleep_for(std::chrono::milliseconds(wait)); } }); + DBUG_EXECUTE_IF("EnginePublishVersionTask::execute.enable_spin_wait", { + auto token = dp->param("token", "invalid_token"); + while (DebugPoints::instance()->is_enable("EnginePublishVersionTask::execute.block")) { + auto block_dp = DebugPoints::instance()->get_debug_point( + "EnginePublishVersionTask::execute.block"); + if (block_dp) { + auto pass_token = block_dp->param("pass_token", ""); + if (pass_token == token) { + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + }); std::unique_ptr token = _engine.tablet_publish_txn_thread_pool()->new_token( ThreadPool::ExecutionMode::CONCURRENT); std::unordered_map tablet_id_to_num_delta_rows; @@ -342,6 +356,8 @@ void 
EnginePublishVersionTask::_calculate_tbl_num_delta_rows( auto table_id = tablet->get_table_id(); if (kv.second > 0) { (*_table_id_to_tablet_id_to_num_delta_rows)[table_id][kv.first] += kv.second; + VLOG_DEBUG << "report delta rows to fe, table_id=" << table_id + << ", tablet=" << kv.first << ", num_rows=" << kv.second; } } } diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index e4a3332ad173cd1..38a52d1d2118aa6 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -310,7 +310,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta LOG(ERROR) << "close inverted_index_writer error:" << st; return st; } - inverted_index_size += inverted_index_writer->get_index_file_size(); + inverted_index_size += inverted_index_writer->get_index_file_total_size(); } _inverted_index_file_writers.clear(); output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size() + @@ -465,7 +465,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta LOG(ERROR) << "close inverted_index_writer error:" << st; return st; } - inverted_index_size += inverted_index_file_writer->get_index_file_size(); + inverted_index_size += inverted_index_file_writer->get_index_file_total_size(); } _inverted_index_builders.clear(); _inverted_index_file_writers.clear(); diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index 3d33b8c16f426f1..1dd2d52f33b8ace 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -33,9 +33,11 @@ #include "common/config.h" #include "common/logging.h" +#include "common/status.h" #include "olap/data_dir.h" #include "olap/delta_writer.h" #include "olap/olap_common.h" +#include "olap/partial_update_info.h" #include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" @@ -173,10 +175,11 @@ Status TxnManager::prepare_txn(TPartitionId partition_id, TTransactionId transac Status TxnManager::commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery) { + bool is_recovery, + std::shared_ptr partial_update_info) { return commit_txn(tablet.data_dir()->get_meta(), partition_id, transaction_id, tablet.tablet_id(), tablet.tablet_uid(), load_id, rowset_ptr, - std::move(guard), is_recovery); + std::move(guard), is_recovery, partial_update_info); } Status TxnManager::publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet, @@ -259,7 +262,8 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery) { + bool is_recovery, + std::shared_ptr partial_update_info) { if (partition_id < 1 || transaction_id < 1 || tablet_id < 1) { LOG(WARNING) << "invalid commit req " << " partition_id=" << partition_id << " transaction_id=" << transaction_id @@ -369,6 +373,36 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, save_status.append(fmt::format(", txn id: {}", transaction_id)); return save_status; } + + if (partial_update_info && partial_update_info->is_partial_update) { + PartialUpdateInfoPB partial_update_info_pb; + partial_update_info->to_pb(&partial_update_info_pb); + save_status = 
RowsetMetaManager::save_partial_update_info( + meta, tablet_id, partition_id, transaction_id, partial_update_info_pb); + if (!save_status.ok()) { + save_status.append(fmt::format(", txn_id: {}", transaction_id)); + return save_status; + } + } + } + + TabletSharedPtr tablet; + std::shared_ptr decoded_partial_update_info {nullptr}; + if (is_recovery) { + tablet = _engine.tablet_manager()->get_tablet(tablet_id, tablet_uid); + if (tablet != nullptr && tablet->enable_unique_key_merge_on_write()) { + PartialUpdateInfoPB partial_update_info_pb; + auto st = RowsetMetaManager::try_get_partial_update_info( + meta, tablet_id, partition_id, transaction_id, &partial_update_info_pb); + if (st.ok()) { + decoded_partial_update_info = std::make_shared(); + decoded_partial_update_info->from_pb(&partial_update_info_pb); + DCHECK(decoded_partial_update_info->is_partial_update); + } else if (!st.is()) { + // the load is not a partial update + return st; + } + } } { @@ -376,11 +410,17 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, auto load_info = std::make_shared(load_id, rowset_ptr); load_info->pending_rs_guard = std::move(guard); if (is_recovery) { - TabletSharedPtr tablet = _engine.tablet_manager()->get_tablet(tablet_info.tablet_id, - tablet_info.tablet_uid); if (tablet != nullptr && tablet->enable_unique_key_merge_on_write()) { load_info->unique_key_merge_on_write = true; load_info->delete_bitmap.reset(new DeleteBitmap(tablet->tablet_id())); + if (decoded_partial_update_info) { + LOG_INFO( + "get partial update info from RocksDB during recovery. txn_id={}, " + "partition_id={}, tablet_id={}, partial_update_info=[{}]", + transaction_id, partition_id, tablet_id, + decoded_partial_update_info->summary()); + load_info->partial_update_info = decoded_partial_update_info; + } } } load_info->commit(); @@ -514,6 +554,20 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, return status; } + if (tablet_txn_info->unique_key_merge_on_write && tablet_txn_info->partial_update_info && + tablet_txn_info->partial_update_info->is_partial_update) { + status = RowsetMetaManager::remove_partial_update_info(meta, tablet_id, partition_id, + transaction_id); + if (!status) { + // discard the error status and print the warning log + LOG_WARNING( + "fail to remove partial update info from RocksDB. 
txn_id={}, rowset_id={}, " "tablet_id={}, tablet_uid={}", transaction_id, rowset->rowset_id().to_string(), tablet_id, tablet_uid.to_string()); + } + } + // TODO(Drogon): remove these test codes if (enable_binlog) { auto version_str = fmt::format("{}", version.first); @@ -693,6 +747,13 @@ void TxnManager::force_rollback_tablet_related_txns(OlapMeta* meta, TTabletId ta } } } + if (meta != nullptr) { + Status st = RowsetMetaManager::remove_tablet_related_partial_update_info(meta, tablet_id); + if (!st.ok()) { + LOG_WARNING("failed to remove partial update info, tablet_id={}, err={}", tablet_id, + st.to_string()); + } + } } void TxnManager::get_txn_related_tablets(const TTransactionId transaction_id, diff --git a/be/src/olap/txn_manager.h b/be/src/olap/txn_manager.h index d201494b24f9028..5944bbf0fc31368 100644 --- a/be/src/olap/txn_manager.h +++ b/be/src/olap/txn_manager.h @@ -36,7 +36,6 @@ #include "common/status.h" #include "olap/olap_common.h" -#include "olap/partial_update_info.h" #include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_meta.h" @@ -52,6 +51,7 @@ namespace doris { class DeltaWriter; class OlapMeta; struct TabletPublishStatistics; +struct PartialUpdateInfo; enum class TxnState { NOT_FOUND = 0, @@ -63,6 +63,13 @@ enum class TxnState { }; enum class PublishStatus { INIT = 0, PREPARE = 1, SUCCEED = 2 }; +struct TxnPublishInfo { + int64_t publish_version {-1}; + int64_t base_compaction_cnt {-1}; + int64_t cumulative_compaction_cnt {-1}; + int64_t cumulative_point {-1}; +}; + struct TabletTxnInfo { PUniqueId load_id; RowsetSharedPtr rowset; @@ -74,24 +81,33 @@ struct TabletTxnInfo { int64_t creation_time; bool ingest {false}; std::shared_ptr partial_update_info; + + // for cloud only, used to determine if a retry CloudTabletCalcDeleteBitmapTask + // needs to re-calculate the delete bitmap std::shared_ptr publish_status; - TxnState state {TxnState::PREPARED}; + TxnPublishInfo publish_info; + TxnState state {TxnState::PREPARED}; TabletTxnInfo() = default; TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset) - : load_id(load_id), rowset(rowset), creation_time(UnixSeconds()) {} + : load_id(std::move(load_id)), + rowset(std::move(rowset)), + creation_time(UnixSeconds()) {} TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset, bool ingest_arg) - : load_id(load_id), rowset(rowset), creation_time(UnixSeconds()), ingest(ingest_arg) {} + : load_id(std::move(load_id)), + rowset(std::move(rowset)), + creation_time(UnixSeconds()), + ingest(ingest_arg) {} TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset, bool merge_on_write, - DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& ids) - : load_id(load_id), - rowset(rowset), + DeleteBitmapPtr delete_bitmap, RowsetIdUnorderedSet ids) - : load_id(std::move(load_id)), + rowset(std::move(rowset)), unique_key_merge_on_write(merge_on_write), - delete_bitmap(delete_bitmap), - rowset_ids(ids), + delete_bitmap(std::move(delete_bitmap)), + rowset_ids(std::move(ids)), creation_time(UnixSeconds()) {} void prepare() { state = TxnState::PREPARED; } @@ -145,8 +161,8 @@ class TxnManager { Status commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery); + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, + std::shared_ptr partial_update_info = nullptr); Status publish_txn(TPartitionId partition_id, const TabletSharedPtr&
tablet, TTransactionId transaction_id, const Version& version, @@ -161,8 +177,8 @@ class TxnManager { Status commit_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery); + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, + std::shared_ptr partial_update_info = nullptr); // remove a txn from txn manager // not persist rowset meta because diff --git a/be/src/olap/wal/wal_manager.cpp b/be/src/olap/wal/wal_manager.cpp index 06937a32b81dc57..a7e33e7383f5974 100644 --- a/be/src/olap/wal/wal_manager.cpp +++ b/be/src/olap/wal/wal_manager.cpp @@ -475,6 +475,11 @@ Status WalManager::update_wal_dir_estimated_wal_bytes(const std::string& wal_dir Status WalManager::_update_wal_dir_info_thread() { while (!_stop.load()) { + if (!ExecEnv::ready()) { + LOG(INFO) << "Sleep 1s to wait for storage engine init."; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + continue; + } static_cast(_wal_dirs_info->update_all_wal_dir_limit()); static_cast(_wal_dirs_info->update_all_wal_dir_used()); LOG_EVERY_N(INFO, 100) << "Scheduled(every 10s) WAL info: " << get_wal_dirs_info_string(); diff --git a/be/src/olap/wal/wal_table.cpp b/be/src/olap/wal/wal_table.cpp index ef98bb58ae48a47..3a29af5cfb93a7d 100644 --- a/be/src/olap/wal/wal_table.cpp +++ b/be/src/olap/wal/wal_table.cpp @@ -85,17 +85,22 @@ void WalTable::_pick_relay_wals() { Status WalTable::_relay_wal_one_by_one() { std::vector> need_retry_wals; - std::vector> need_delete_wals; for (auto wal_info : _replaying_queue) { wal_info->add_retry_num(); auto st = _replay_wal_internal(wal_info->get_wal_path()); auto msg = st.msg(); if (st.ok() || st.is() || st.is() || st.is() || - msg.find("LabelAlreadyUsedException") != msg.npos) { + (msg.find("LabelAlreadyUsedException") != msg.npos && + (msg.find("[COMMITTED]") != msg.npos || msg.find("[VISIBLE]") != msg.npos))) { LOG(INFO) << "succeed to replay wal=" << wal_info->get_wal_path() << ", st=" << st.to_string(); - need_delete_wals.push_back(wal_info); + // delete wal + WARN_IF_ERROR(_exec_env->wal_mgr()->delete_wal(_table_id, wal_info->get_wal_id()), + "failed to delete wal=" + wal_info->get_wal_path()); + if (config::group_commit_wait_replay_wal_finish) { + RETURN_IF_ERROR(_exec_env->wal_mgr()->notify_relay_wal(wal_info->get_wal_id())); + } } else { doris::wal_fail << 1; LOG(WARNING) << "failed to replay wal=" << wal_info->get_wal_path() @@ -110,13 +115,6 @@ Status WalTable::_relay_wal_one_by_one() { _replay_wal_map.emplace(retry_wal_info->get_wal_path(), retry_wal_info); } } - for (auto delete_wal_info : need_delete_wals) { - [[maybe_unused]] auto st = - _exec_env->wal_mgr()->delete_wal(_table_id, delete_wal_info->get_wal_id()); - if (config::group_commit_wait_replay_wal_finish) { - RETURN_IF_ERROR(_exec_env->wal_mgr()->notify_relay_wal(delete_wal_info->get_wal_id())); - } - } return Status::OK(); } @@ -166,16 +164,14 @@ Status WalTable::_try_abort_txn(int64_t db_id, std::string& label) { request.__set_auth_code(0); // this is a fake, fe not check it now request.__set_db_id(db_id); request.__set_label(label); - std::string reason = "relay wal with label " + label; - request.__set_reason(reason); + request.__set_reason("relay wal with label " + label); TLoadTxnRollbackResult result; TNetworkAddress master_addr = _exec_env->master_info()->network_address; auto st = ThriftRpcHelper::rpc( 
master_addr.hostname, master_addr.port, [&request, &result](FrontendServiceConnection& client) { client->loadTxnRollback(result, request); - }, - 10000L); + }); auto result_status = Status::create(result.status); LOG(INFO) << "abort label " << label << ", st:" << st << ", result_status:" << result_status; return result_status; @@ -195,6 +191,8 @@ Status WalTable::_replay_wal_internal(const std::string& wal) { [[maybe_unused]] auto st = _try_abort_txn(_db_id, label); } #endif + DBUG_EXECUTE_IF("WalTable.replay_wals.stop", + { return Status::InternalError("WalTable.replay_wals.stop"); }); return _replay_one_wal_with_streamload(wal_id, wal, label); } diff --git a/be/src/pipeline/common/runtime_filter_consumer.cpp b/be/src/pipeline/common/runtime_filter_consumer.cpp index 57397efd21185f5..817c76a79af47c3 100644 --- a/be/src/pipeline/common/runtime_filter_consumer.cpp +++ b/be/src/pipeline/common/runtime_filter_consumer.cpp @@ -101,43 +101,22 @@ void RuntimeFilterConsumer::init_runtime_filter_dependency( } } -Status RuntimeFilterConsumer::_acquire_runtime_filter(bool pipeline_x) { +Status RuntimeFilterConsumer::_acquire_runtime_filter() { SCOPED_TIMER(_acquire_runtime_filter_timer); std::vector vexprs; for (size_t i = 0; i < _runtime_filter_descs.size(); ++i) { auto runtime_filter = _runtime_filter_ctxs[i].runtime_filter; - if (pipeline_x) { - runtime_filter->update_state(); - if (runtime_filter->is_ready() && !_runtime_filter_ctxs[i].apply_mark) { - // Runtime filter has been applied in open phase. - RETURN_IF_ERROR(runtime_filter->get_push_expr_ctxs(_probe_ctxs, vexprs, false)); - _runtime_filter_ctxs[i].apply_mark = true; - } else if (!_runtime_filter_ctxs[i].apply_mark) { - // Runtime filter is timeout. - _is_all_rf_applied = false; - } - } else { - bool ready = runtime_filter->is_ready(); - if (!ready) { - ready = runtime_filter->await(); - } - if (ready && !_runtime_filter_ctxs[i].apply_mark) { - RETURN_IF_ERROR(runtime_filter->get_push_expr_ctxs(_probe_ctxs, vexprs, false)); - _runtime_filter_ctxs[i].apply_mark = true; - } else if (runtime_filter->current_state() == RuntimeFilterState::NOT_READY && - !_runtime_filter_ctxs[i].apply_mark) { - *_blocked_by_rf = true; - } else if (!_runtime_filter_ctxs[i].apply_mark) { - DCHECK(runtime_filter->current_state() != RuntimeFilterState::NOT_READY); - _is_all_rf_applied = false; - } + runtime_filter->update_state(); + if (runtime_filter->is_ready() && !_runtime_filter_ctxs[i].apply_mark) { + // Runtime filter has been applied in open phase. + RETURN_IF_ERROR(runtime_filter->get_push_expr_ctxs(_probe_ctxs, vexprs, false)); + _runtime_filter_ctxs[i].apply_mark = true; + } else if (!_runtime_filter_ctxs[i].apply_mark) { + // Runtime filter is timeout. + _is_all_rf_applied = false; } } RETURN_IF_ERROR(_append_rf_into_conjuncts(vexprs)); - if (!pipeline_x && *_blocked_by_rf) { - return Status::WaitForRf("Runtime filters are neither not ready nor timeout"); - } - return Status::OK(); } diff --git a/be/src/pipeline/common/runtime_filter_consumer.h b/be/src/pipeline/common/runtime_filter_consumer.h index 4b500a916f0e470..03868355875454f 100644 --- a/be/src/pipeline/common/runtime_filter_consumer.h +++ b/be/src/pipeline/common/runtime_filter_consumer.h @@ -47,7 +47,7 @@ class RuntimeFilterConsumer { // Register and get all runtime filters at Init phase. Status _register_runtime_filter(bool need_local_merge); // Get all arrived runtime filters at Open phase. 
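The `_acquire_runtime_filter` change above deletes the legacy non-pipeline branch, leaving a single non-blocking pass over the consumer's filters. A minimal sketch of that control flow, with `FilterStub` and plain strings as hypothetical stand-ins for Doris's `IRuntimeFilter` and expression contexts:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for IRuntimeFilter; only the calls the loop needs.
struct FilterStub {
    bool ready = false;
    bool applied = false;                 // mirrors apply_mark
    void update_state() { /* would poll the merge coordinator here */ }
    bool is_ready() const { return ready; }
    void push_expr(std::vector<std::string>& out) { out.push_back("rf_pred"); }
};

// One non-blocking pass: apply every filter that arrived since the last call.
// Returns true once all filters have been pushed into the conjuncts.
bool acquire_runtime_filters(std::vector<FilterStub>& filters,
                             std::vector<std::string>& conjuncts) {
    bool all_applied = true;
    for (auto& f : filters) {
        f.update_state();
        if (f.is_ready() && !f.applied) {
            f.push_expr(conjuncts);       // late-arrival filter, applied once
            f.applied = true;
        } else if (!f.applied) {
            all_applied = false;          // still in flight; never block the task
        }
    }
    return all_applied;
}

int main() {
    std::vector<FilterStub> filters(2);
    std::vector<std::string> conjuncts;
    filters[0].ready = true;
    std::cout << acquire_runtime_filters(filters, conjuncts) << '\n'; // 0: one pending
    filters[1].ready = true;
    std::cout << acquire_runtime_filters(filters, conjuncts) << '\n'; // 1: all applied
}
```

The point of the surviving branch structure is that a pipeline task never waits on a filter: it either applies what has arrived or records that a later pass is still needed.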
- Status _acquire_runtime_filter(bool pipeline_x); + Status _acquire_runtime_filter(); // Append late-arrival runtime filters to the vconjunct_ctx. Status _append_rf_into_conjuncts(const std::vector& vexprs); diff --git a/be/src/pipeline/dependency.cpp b/be/src/pipeline/dependency.cpp index 5e1ce79a1eb3eb9..560efec94e1d18b 100644 --- a/be/src/pipeline/dependency.cpp +++ b/be/src/pipeline/dependency.cpp @@ -188,10 +188,12 @@ void LocalExchangeSharedState::sub_running_sink_operators() { } } -void LocalExchangeSharedState::sub_running_source_operators() { +void LocalExchangeSharedState::sub_running_source_operators( + LocalExchangeSourceLocalState& local_state) { std::unique_lock lc(le_lock); if (exchanger->_running_source_operators.fetch_sub(1) == 1) { _set_always_ready(); + exchanger->finalize(local_state); } } @@ -397,4 +399,6 @@ Status AggSharedState::_destroy_agg_status(vectorized::AggregateDataPtr data) { return Status::OK(); } +LocalExchangeSharedState::~LocalExchangeSharedState() = default; + } // namespace doris::pipeline diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index 8adc24d3b4ed7c4..6d3f836dfcb4cf8 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -49,7 +49,7 @@ class Dependency; class PipelineTask; struct BasicSharedState; using DependencySPtr = std::shared_ptr; -using DependencyMap = std::map>; +class LocalExchangeSourceLocalState; static constexpr auto SLOW_DEPENDENCY_THRESHOLD = 60 * 1000L * 1000L * 1000L; static constexpr auto TIME_UNIT_DEPENDENCY_LOG = 30 * 1000L * 1000L * 1000L; @@ -88,20 +88,11 @@ class Dependency : public std::enable_shared_from_this { public: ENABLE_FACTORY_CREATOR(Dependency); Dependency(int id, int node_id, std::string name) - : _id(id), - _node_id(node_id), - _name(std::move(name)), - _is_write_dependency(false), - _ready(false) {} + : _id(id), _node_id(node_id), _name(std::move(name)), _ready(false) {} Dependency(int id, int node_id, std::string name, bool ready) - : _id(id), - _node_id(node_id), - _name(std::move(name)), - _is_write_dependency(true), - _ready(ready) {} + : _id(id), _node_id(node_id), _name(std::move(name)), _ready(ready) {} virtual ~Dependency() = default; - bool is_write_dependency() const { return _is_write_dependency; } [[nodiscard]] int id() const { return _id; } [[nodiscard]] virtual std::string name() const { return _name; } BasicSharedState* shared_state() { return _shared_state; } @@ -118,12 +109,10 @@ class Dependency : public std::enable_shared_from_this { // Notify downstream pipeline tasks this dependency is ready. void set_ready(); void set_ready_to_read() { - DCHECK(_is_write_dependency) << debug_string(); DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); _shared_state->source_deps.front()->set_ready(); } void set_block_to_read() { - DCHECK(_is_write_dependency) << debug_string(); DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); _shared_state->source_deps.front()->block(); } @@ -166,7 +155,6 @@ class Dependency : public std::enable_shared_from_this { const int _id; const int _node_id; const std::string _name; - const bool _is_write_dependency; std::atomic _ready; BasicSharedState* _shared_state = nullptr; @@ -328,7 +316,6 @@ struct AggSharedState : public BasicSharedState { vectorized::VExprContextSPtrs probe_expr_ctxs; size_t input_num_rows = 0; std::vector values; - std::unique_ptr agg_profile_arena; /// The total size of the row from the aggregate functions. 
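The `Dependency` cleanup above removes the read/write split, so every dependency is the same ready-flag gate that pipeline tasks block on. A compilable sketch of that gate, under the assumption that waking a task is just invoking a callback (Doris's scheduler and locking are elided):

```cpp
#include <atomic>
#include <functional>
#include <iostream>
#include <vector>

// Minimal ready-flag gate in the spirit of pipeline::Dependency.
// on_ready callbacks stand in for waking blocked PipelineTasks.
class Gate {
public:
    explicit Gate(bool ready) : _ready(ready) {}

    void block() { _ready.store(false, std::memory_order_release); }

    void set_ready() {
        // Only the transition from blocked to ready wakes waiters.
        if (!_ready.exchange(true, std::memory_order_acq_rel)) {
            for (auto& cb : _waiters) cb();
            _waiters.clear();
        }
    }

    // A task that finds the gate blocked parks itself with a callback.
    bool try_pass(std::function<void()> on_ready) {
        if (_ready.load(std::memory_order_acquire)) return true;
        _waiters.push_back(std::move(on_ready));
        return false;
    }

private:
    std::atomic<bool> _ready;
    std::vector<std::function<void()>> _waiters; // the real code guards this with a lock
};

int main() {
    Gate g(/*ready=*/false);
    if (!g.try_pass([] { std::cout << "task rescheduled\n"; })) {
        std::cout << "task blocked\n";
    }
    g.set_ready(); // prints "task rescheduled"
}
```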
size_t total_size_of_aggregate_states = 0; size_t align_aggregate_states = 1; @@ -661,15 +648,6 @@ struct PartitionSortNodeSharedState : public BasicSharedState { std::mutex sink_eos_lock; }; -class AsyncWriterDependency final : public Dependency { -public: - using SharedState = BasicSharedState; - ENABLE_FACTORY_CREATOR(AsyncWriterDependency); - AsyncWriterDependency(int id, int node_id) - : Dependency(id, node_id, "AsyncWriterDependency", true) {} - ~AsyncWriterDependency() override = default; -}; - using SetHashTableVariants = std::variant>, @@ -761,8 +739,9 @@ struct SetSharedState : public BasicSharedState { // (select 0) intersect (select null) the build side hash table should not // ignore null value. std::vector data_types; - for (const auto& ctx : child_exprs_lists[0]) { - data_types.emplace_back(build_not_ignore_null[0] + for (int i = 0; i < child_exprs_lists[0].size(); i++) { + const auto& ctx = child_exprs_lists[0][i]; + data_types.emplace_back(build_not_ignore_null[i] ? make_nullable(ctx->root()->data_type()) : ctx->root()->data_type()); } @@ -831,20 +810,21 @@ struct LocalExchangeSharedState : public BasicSharedState { public: ENABLE_FACTORY_CREATOR(LocalExchangeSharedState); LocalExchangeSharedState(int num_instances); + ~LocalExchangeSharedState() override; std::unique_ptr exchanger {}; std::vector mem_trackers; std::atomic mem_usage = 0; // We need to make sure to add mem_usage first and then enqueue, otherwise sub mem_usage may cause negative mem_usage during concurrent dequeue. std::mutex le_lock; - void create_source_dependencies(int operator_id, int node_id) { + virtual void create_dependencies(int local_exchange_id) { for (auto& source_dep : source_deps) { - source_dep = std::make_shared(operator_id, node_id, + source_dep = std::make_shared(local_exchange_id, local_exchange_id, "LOCAL_EXCHANGE_OPERATOR_DEPENDENCY"); source_dep->set_shared_state(this); } - }; + } void sub_running_sink_operators(); - void sub_running_source_operators(); + void sub_running_source_operators(LocalExchangeSourceLocalState& local_state); void _set_always_ready() { for (auto& dep : source_deps) { DCHECK(dep); @@ -856,7 +836,10 @@ struct LocalExchangeSharedState : public BasicSharedState { } } - Dependency* get_dep_by_channel_id(int channel_id) { return source_deps[channel_id].get(); } + virtual std::vector get_dep_by_channel_id(int channel_id) { + return {source_deps[channel_id]}; + } + virtual Dependency* get_sink_dep_by_channel_id(int channel_id) { return nullptr; } void set_ready_to_read(int channel_id) { auto& dep = source_deps[channel_id]; @@ -867,28 +850,82 @@ struct LocalExchangeSharedState : public BasicSharedState { void add_mem_usage(int channel_id, size_t delta, bool update_total_mem_usage = true) { mem_trackers[channel_id]->consume(delta); if (update_total_mem_usage) { - add_total_mem_usage(delta); + add_total_mem_usage(delta, channel_id); } } - void sub_mem_usage(int channel_id, size_t delta, bool update_total_mem_usage = true) { - mem_trackers[channel_id]->release(delta); - if (update_total_mem_usage) { - sub_total_mem_usage(delta); - } - } + void sub_mem_usage(int channel_id, size_t delta) { mem_trackers[channel_id]->release(delta); } - void add_total_mem_usage(size_t delta) { - if (mem_usage.fetch_add(delta) > config::local_exchange_buffer_mem_limit) { + virtual void add_total_mem_usage(size_t delta, int channel_id = 0) { + if (mem_usage.fetch_add(delta) + delta > config::local_exchange_buffer_mem_limit) { sink_deps.front()->block(); } } - void sub_total_mem_usage(size_t 
delta) { - if (mem_usage.fetch_sub(delta) <= config::local_exchange_buffer_mem_limit) { + virtual void sub_total_mem_usage(size_t delta, int channel_id = 0) { + auto prev_usage = mem_usage.fetch_sub(delta); + DCHECK_GE(prev_usage - delta, 0) << "prev_usage: " << prev_usage << " delta: " << delta + << " channel_id: " << channel_id; + if (prev_usage - delta <= config::local_exchange_buffer_mem_limit) { sink_deps.front()->set_ready(); } } }; +struct LocalMergeExchangeSharedState : public LocalExchangeSharedState { + ENABLE_FACTORY_CREATOR(LocalMergeExchangeSharedState); + LocalMergeExchangeSharedState(int num_instances) + : LocalExchangeSharedState(num_instances), + _queues_mem_usage(num_instances), + _each_queue_limit(config::local_exchange_buffer_mem_limit / num_instances) { + for (size_t i = 0; i < num_instances; i++) { + _queues_mem_usage[i] = 0; + } + } + + void create_dependencies(int local_exchange_id) override { + sink_deps.resize(source_deps.size()); + for (size_t i = 0; i < source_deps.size(); i++) { + source_deps[i] = + std::make_shared(local_exchange_id, local_exchange_id, + "LOCAL_MERGE_EXCHANGE_OPERATOR_DEPENDENCY"); + source_deps[i]->set_shared_state(this); + sink_deps[i] = std::make_shared( + local_exchange_id, local_exchange_id, + "LOCAL_MERGE_EXCHANGE_OPERATOR_SINK_DEPENDENCY", true); + sink_deps[i]->set_shared_state(this); + } + } + + void sub_total_mem_usage(size_t delta, int channel_id) override { + auto prev_usage = _queues_mem_usage[channel_id].fetch_sub(delta); + DCHECK_GE(prev_usage - delta, 0) << "prev_usage: " << prev_usage << " delta: " << delta + << " channel_id: " << channel_id; + if (prev_usage - delta <= _each_queue_limit) { + sink_deps[channel_id]->set_ready(); + } + if (_queues_mem_usage[channel_id] == 0) { + source_deps[channel_id]->block(); + } + } + void add_total_mem_usage(size_t delta, int channel_id) override { + if (_queues_mem_usage[channel_id].fetch_add(delta) + delta > _each_queue_limit) { + sink_deps[channel_id]->block(); + } + source_deps[channel_id]->set_ready(); + } + + Dependency* get_sink_dep_by_channel_id(int channel_id) override { + return sink_deps[channel_id].get(); + } + + std::vector get_dep_by_channel_id(int channel_id) override { + return source_deps; + } + +private: + std::vector _queues_mem_usage; + const int64_t _each_queue_limit; +}; + } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index f3a6942c33fba8a..a287d7fb2786b2f 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -48,7 +48,7 @@ namespace doris::pipeline { /// is in a random order. This means that we assume that the reduction factor will /// increase over time. 
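`LocalMergeExchangeSharedState` above replaces the single shared buffer budget with one counter and one sink dependency per queue. A runnable sketch of the per-channel accounting, with plain (non-thread-safe) booleans standing in for the sink/source dependencies:

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>
#include <vector>

// Per-queue accounting in the spirit of LocalMergeExchangeSharedState:
// each channel gets limit/num_queues bytes before its sink is blocked.
class PerQueueMemTracker {
public:
    PerQueueMemTracker(int num_queues, int64_t total_limit)
            : _usage(num_queues),
              _limit_per_queue(total_limit / num_queues),
              _sink_blocked(num_queues, false),
              _source_ready(num_queues, false) {}

    void add(int ch, int64_t delta) {
        if (_usage[ch].fetch_add(delta) + delta > _limit_per_queue) {
            _sink_blocked[ch] = true;  // back-pressure only this channel's writer
        }
        _source_ready[ch] = true;      // data arrived, the reader may proceed
    }

    void sub(int ch, int64_t delta) {
        int64_t prev = _usage[ch].fetch_sub(delta);
        if (prev - delta <= _limit_per_queue) {
            _sink_blocked[ch] = false; // room again, unblock the writer
        }
        if (prev - delta == 0) {
            _source_ready[ch] = false; // queue drained, the reader must wait
        }
    }

    bool sink_blocked(int ch) const { return _sink_blocked[ch]; }

private:
    std::vector<std::atomic<int64_t>> _usage;
    const int64_t _limit_per_queue;
    std::vector<bool> _sink_blocked;   // stand-in for sink_deps[ch]
    std::vector<bool> _source_ready;   // stand-in for source_deps[ch]
};

int main() {
    PerQueueMemTracker t(/*num_queues=*/2, /*total_limit=*/100);
    t.add(0, 60);                      // 60 > 50: channel 0's writer blocks
    std::cout << t.sink_blocked(0) << ' ' << t.sink_blocked(1) << '\n'; // 1 0
    t.sub(0, 60);
    std::cout << t.sink_blocked(0) << '\n';                             // 0
}
```

Splitting the limit this way keeps one slow merge input from blocking writers on every other channel.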
AggSinkLocalState::AggSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) - : Base(parent, state) {} + : Base(parent, state), _agg_profile_arena(std::make_unique()) {} Status AggSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -97,11 +97,10 @@ Status AggSinkLocalState::open(RuntimeState* state) { RETURN_IF_ERROR( p._probe_expr_ctxs[i]->clone(state, Base::_shared_state->probe_expr_ctxs[i])); } - Base::_shared_state->agg_profile_arena = std::make_unique(); if (Base::_shared_state->probe_expr_ctxs.empty()) { _agg_data->without_key = reinterpret_cast( - Base::_shared_state->agg_profile_arena->alloc(p._total_size_of_aggregate_states)); + _agg_profile_arena->alloc(p._total_size_of_aggregate_states)); if (p._is_merge) { _executor = std::make_unique>(); @@ -748,7 +747,6 @@ Status AggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { } const auto& agg_functions = tnode.agg_node.aggregate_functions; - _external_agg_bytes_threshold = state->external_agg_bytes_threshold(); _is_merge = std::any_of(agg_functions.cbegin(), agg_functions.cend(), [](const auto& e) { return e.nodes[0].agg_expr.is_merge_agg; }); diff --git a/be/src/pipeline/exec/aggregation_sink_operator.h b/be/src/pipeline/exec/aggregation_sink_operator.h index 10a8119914045cf..579b9eda1a6e9f9 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.h +++ b/be/src/pipeline/exec/aggregation_sink_operator.h @@ -120,6 +120,7 @@ class AggSinkLocalState : public PipelineXSinkLocalState { AggregatedDataVariants* _agg_data = nullptr; vectorized::Arena* _agg_arena_pool = nullptr; + std::unique_ptr _agg_profile_arena; std::unique_ptr _executor = nullptr; }; @@ -189,7 +190,6 @@ class AggSinkOperatorX final : public DataSinkOperatorX { /// The total size of the row from the aggregate functions. 
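Above, the profile arena moves from `AggSharedState` into each `AggSinkLocalState`, so the bytes backing the no-grouping aggregate state live exactly as long as the sink that allocated them. For readers unfamiliar with the arena in play, a minimal fixed-size bump allocator illustrating the `alloc()` call (the real `vectorized::Arena` grows in chunks; this is only a sketch):

```cpp
#include <cstddef>
#include <cstring>
#include <iostream>
#include <memory>

// Fixed-size bump allocator: alloc() hands out raw bytes, and everything is
// freed at once when the arena (and thus its owning local state) dies.
class MiniArena {
public:
    explicit MiniArena(size_t capacity)
            : _buf(std::make_unique<char[]>(capacity)), _cap(capacity) {}

    char* alloc(size_t n) {
        if (_used + n > _cap) return nullptr; // a real arena would grow a new chunk
        char* p = _buf.get() + _used;
        _used += n;
        return p;
    }

private:
    std::unique_ptr<char[]> _buf;
    size_t _cap;
    size_t _used = 0;
};

struct LocalStateSketch {
    // Owned per local state, as in the patched AggSinkLocalState.
    std::unique_ptr<MiniArena> agg_profile_arena = std::make_unique<MiniArena>(1024);
    char* without_key = nullptr;

    void open(size_t total_size_of_aggregate_states) {
        without_key = agg_profile_arena->alloc(total_size_of_aggregate_states);
        if (without_key) std::memset(without_key, 0, total_size_of_aggregate_states);
    }
};

int main() {
    LocalStateSketch ls;
    ls.open(64);
    std::cout << (ls.without_key != nullptr) << '\n'; // 1
} // arena and aggregate state are released together here
```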
size_t _total_size_of_aggregate_states = 0; - size_t _external_agg_bytes_threshold; // group by k1,k2 vectorized::VExprContextSPtrs _probe_expr_ctxs; ObjectPool* _pool = nullptr; diff --git a/be/src/pipeline/exec/aggregation_source_operator.cpp b/be/src/pipeline/exec/aggregation_source_operator.cpp index 0c05c965f1f8849..3264ad56f3c5852 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/aggregation_source_operator.cpp @@ -37,7 +37,7 @@ AggLocalState::AggLocalState(RuntimeState* state, OperatorXBase* parent) Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); _get_results_timer = ADD_TIMER(profile(), "GetResultsTime"); _serialize_result_timer = ADD_TIMER(profile(), "SerializeResultTime"); _hash_table_iterate_timer = ADD_TIMER(profile(), "HashTableIterateTime"); diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index 406108fbc4f5299..3583642273be95f 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -34,7 +34,8 @@ AnalyticLocalState::AnalyticLocalState(RuntimeState* state, OperatorXBase* paren _rows_end_offset(0), _fn_place_ptr(nullptr), _agg_functions_size(0), - _agg_functions_created(false) {} + _agg_functions_created(false), + _agg_arena_pool(std::make_unique()) {} //_partition_by_columns,_order_by_columns save in blocks, so if need to calculate the boundary, may find in which blocks firstly BlockRowPos AnalyticLocalState::_compare_row_to_find_end(int idx, BlockRowPos start, @@ -168,7 +169,6 @@ Status AnalyticLocalState::open(RuntimeState* state) { RETURN_IF_ERROR(PipelineXLocalState::open(state)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); - _agg_arena_pool = std::make_unique(); auto& p = _parent->cast(); _agg_functions_size = p._agg_functions.size(); diff --git a/be/src/pipeline/exec/exchange_sink_buffer.h b/be/src/pipeline/exec/exchange_sink_buffer.h index 2a0d75e42cb17c7..2d30a492a0d8f93 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.h +++ b/be/src/pipeline/exec/exchange_sink_buffer.h @@ -53,19 +53,6 @@ class ExchangeSinkLocalState; namespace vectorized { class PipChannel; -template -struct AtomicWrapper { - std::atomic _value; - - AtomicWrapper() : _value() {} - - AtomicWrapper(const std::atomic& a) : _value(a.load()) {} - - AtomicWrapper(const AtomicWrapper& other) : _value(other._value.load()) {} - - AtomicWrapper& operator=(const AtomicWrapper& other) { _value.store(other._a.load()); } -}; - // We use BroadcastPBlockHolder to hold a broadcasted PBlock. For broadcast shuffle, one PBlock // will be shared between different channel, so we have to use a ref count to mark if this // PBlock is available for next serialization. diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index 2a8aa56dc62b2f5..f1f6c2d0c5d1ccc 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -99,7 +99,10 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf // Make sure brpc stub is ready before execution. 
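The `AtomicWrapper` deleted from exchange_sink_buffer.h above was unused, and its copy assignment was broken: it read a nonexistent member `other._a` and never returned `*this`. When such a wrapper is genuinely needed, e.g. to keep atomic counters in a `std::vector`, a corrected minimal version looks like this (a sketch, not a drop-in for any Doris type):

```cpp
#include <atomic>
#include <iostream>
#include <vector>

// std::atomic<T> is neither copyable nor movable, so a vector of counters
// needs a wrapper that defines copy semantics via load()/store().
template <typename T>
struct AtomicWrapper {
    std::atomic<T> value {};

    AtomicWrapper() = default;
    AtomicWrapper(const AtomicWrapper& other) : value(other.value.load()) {}
    AtomicWrapper& operator=(const AtomicWrapper& other) {
        value.store(other.value.load()); // the removed version read 'other._a' here
        return *this;                    // and also forgot to return *this
    }
};

int main() {
    std::vector<AtomicWrapper<int>> counters(4);
    counters[1].value.fetch_add(7);
    std::vector<AtomicWrapper<int>> copy = counters; // now legal and well-defined
    std::cout << copy[1].value.load() << '\n';       // 7
}
```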
for (int i = 0; i < channels.size(); ++i) { RETURN_IF_ERROR(channels[i]->init_stub(state)); + _wait_channel_timer.push_back(_profile->add_nonzero_counter( + fmt::format("WaitForLocalExchangeBuffer{}", i), TUnit ::TIME_NS, timer_name, 1)); } + _wait_broadcast_buffer_timer = ADD_CHILD_TIMER(_profile, "WaitForBroadcastBuffer", timer_name); return Status::OK(); } @@ -142,8 +145,6 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { _sink_buffer->set_broadcast_dependency(_broadcast_dependency); _broadcast_pb_mem_limiter = vectorized::BroadcastPBlockHolderMemLimiter::create_shared(_broadcast_dependency); - _wait_broadcast_buffer_timer = - ADD_CHILD_TIMER(_profile, "WaitForBroadcastBuffer", timer_name); } else if (local_size > 0) { size_t dep_id = 0; for (auto* channel : channels) { @@ -151,9 +152,6 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { if (auto dep = channel->get_local_channel_dependency()) { _local_channels_dependency.push_back(dep); DCHECK(_local_channels_dependency[dep_id] != nullptr); - _wait_channel_timer.push_back(_profile->add_nonzero_counter( - fmt::format("WaitForLocalExchangeBuffer{}", dep_id), TUnit ::TIME_NS, - timer_name, 1)); dep_id++; } else { LOG(WARNING) << "local recvr is null: query id = " diff --git a/be/src/pipeline/exec/file_scan_operator.cpp b/be/src/pipeline/exec/file_scan_operator.cpp index 98cc91824f64c18..8c1e4d19407ff67 100644 --- a/be/src/pipeline/exec/file_scan_operator.cpp +++ b/be/src/pipeline/exec/file_scan_operator.cpp @@ -44,9 +44,9 @@ Status FileScanLocalState::_init_scanners(std::list* s _kv_cache.reset(new vectorized::ShardedKVCache(shard_num)); for (int i = 0; i < _max_scanners; ++i) { std::unique_ptr scanner = vectorized::VFileScanner::create_unique( - state(), this, p._limit, _split_source, _scanner_profile.get(), _kv_cache.get()); - RETURN_IF_ERROR( - scanner->prepare(_conjuncts, &_colname_to_value_range, &p._colname_to_slot_id)); + state(), this, p._limit, _split_source, _scanner_profile.get(), _kv_cache.get(), + &_colname_to_value_range, &p._colname_to_slot_id); + RETURN_IF_ERROR(scanner->prepare(state(), _conjuncts)); scanners->push_back(std::move(scanner)); } return Status::OK(); @@ -73,11 +73,13 @@ void FileScanLocalState::set_scan_ranges(RuntimeState* state, auto split_source = scan_range.split_source; RuntimeProfile::Counter* get_split_timer = ADD_TIMER(_runtime_profile, "GetSplitTime"); _split_source = std::make_shared( - state, get_split_timer, split_source.split_source_id, split_source.num_splits); + state, get_split_timer, split_source.split_source_id, split_source.num_splits, + _max_scanners); } } if (_split_source == nullptr) { - _split_source = std::make_shared(scan_ranges); + _split_source = + std::make_shared(scan_ranges, _max_scanners); } _max_scanners = std::min(_max_scanners, _split_source->num_scan_ranges()); if (scan_ranges.size() > 0 && @@ -91,6 +93,7 @@ void FileScanLocalState::set_scan_ranges(RuntimeState* state, Status FileScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(ScanLocalState::init(state, info)); + SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); _output_tuple_id = p._output_tuple_id; return Status::OK(); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 3a55fdd9b8698eb..cde42eae1e14df4 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -37,7 +37,7 @@ HashJoinBuildSinkLocalState::HashJoinBuildSinkLocalState(DataSinkOperatorXBase* 
Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(JoinBuildSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); _shared_state->join_op_variants = p._join_op_variants; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 374cf506861431a..d953e80b70150f9 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -21,16 +21,20 @@ #include "common/logging.h" #include "pipeline/exec/operator.h" +#include "runtime/descriptors.h" +#include "vec/common/assert_cast.h" +#include "vec/data_types/data_type_nullable.h" namespace doris::pipeline { HashJoinProbeLocalState::HashJoinProbeLocalState(RuntimeState* state, OperatorXBase* parent) - : JoinProbeLocalState(state, parent) {} + : JoinProbeLocalState(state, parent), + _process_hashtable_ctx_variants(std::make_unique()) {} Status HashJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(JoinProbeLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); _shared_state->probe_ignore_null = p._probe_ignore_null; _probe_expr_ctxs.resize(p._probe_expr_ctxs.size()); @@ -68,7 +72,6 @@ Status HashJoinProbeLocalState::open(RuntimeState* state) { SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(JoinProbeLocalState::open(state)); - _process_hashtable_ctx_variants = std::make_unique(); auto& p = _parent->cast(); std::visit( [&](auto&& join_op_variants, auto have_other_join_conjunct) { @@ -615,6 +618,54 @@ Status HashJoinProbeOperatorX::prepare(RuntimeState* state) { _left_table_data_types = vectorized::VectorizedUtils::get_data_types(_child_x->row_desc()); _right_table_column_names = vectorized::VectorizedUtils::get_column_names(_build_side_child->row_desc()); + + std::vector slots_to_check; + for (const auto& tuple_descriptor : _intermediate_row_desc->tuple_descriptors()) { + for (const auto& slot : tuple_descriptor->slots()) { + slots_to_check.emplace_back(slot); + } + } + + if (_is_mark_join) { + const auto* last_one = slots_to_check.back(); + slots_to_check.pop_back(); + auto data_type = last_one->get_data_type_ptr(); + if (!data_type->is_nullable()) { + return Status::InternalError( + "The last column for mark join should be Nullable(UInt8), not {}", + data_type->get_name()); + } + + const auto& null_data_type = assert_cast(*data_type); + if (null_data_type.get_nested_type()->get_type_id() != vectorized::TypeIndex::UInt8) { + return Status::InternalError( + "The last column for mark join should be Nullable(UInt8), not {}", + data_type->get_name()); + } + } + + const int right_col_idx = + (_is_right_semi_anti && !_have_other_join_conjunct) ? 0 : _left_table_data_types.size(); + size_t idx = 0; + for (const auto* slot : slots_to_check) { + auto data_type = slot->get_data_type_ptr(); + auto target_data_type = idx < right_col_idx ? 
_left_table_data_types[idx] + : _right_table_data_types[idx - right_col_idx]; + ++idx; + if (data_type->equals(*target_data_type)) { + continue; + } + + auto data_type_non_nullable = vectorized::remove_nullable(data_type); + if (data_type_non_nullable->equals(*target_data_type)) { + continue; + } + + return Status::InternalError("intermediate slot({}) data type does not match: '{}' vs '{}'", + slot->id(), data_type->get_name(), + target_data_type->get_name()); + } + _build_side_child.reset(); return Status::OK(); } diff --git a/be/src/pipeline/exec/hive_table_sink_operator.cpp b/be/src/pipeline/exec/hive_table_sink_operator.cpp index b931d48e832fca2..f7cb31eea5ef736 100644 --- a/be/src/pipeline/exec/hive_table_sink_operator.cpp +++ b/be/src/pipeline/exec/hive_table_sink_operator.cpp @@ -24,23 +24,10 @@ namespace doris::pipeline { Status HiveTableSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); RETURN_IF_ERROR(_writer->init_properties(p._pool)); return Status::OK(); } -Status HiveTableSinkLocalState::close(RuntimeState* state, Status exec_status) { - if (Base::_closed) { - return Status::OK(); - } - SCOPED_TIMER(_close_timer); - SCOPED_TIMER(exec_time_counter()); - if (_closed) { - return _close_status; - } - _close_status = Base::close(state, exec_status); - return _close_status; -} - } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/hive_table_sink_operator.h b/be/src/pipeline/exec/hive_table_sink_operator.h index e06338e14279a60..bee90a9f6c60606 100644 --- a/be/src/pipeline/exec/hive_table_sink_operator.h +++ b/be/src/pipeline/exec/hive_table_sink_operator.h @@ -39,11 +39,7 @@ class HiveTableSinkLocalState final return Base::open(state); } - Status close(RuntimeState* state, Status exec_status) override; friend class HiveTableSinkOperatorX; - -private: - Status _close_status = Status::OK(); }; class HiveTableSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/iceberg_table_sink_operator.cpp b/be/src/pipeline/exec/iceberg_table_sink_operator.cpp index f63f23ddec5b274..44bde4e8812fe1e 100644 --- a/be/src/pipeline/exec/iceberg_table_sink_operator.cpp +++ b/be/src/pipeline/exec/iceberg_table_sink_operator.cpp @@ -24,23 +24,10 @@ namespace doris::pipeline { Status IcebergTableSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); RETURN_IF_ERROR(_writer->init_properties(p._pool)); return Status::OK(); } -Status IcebergTableSinkLocalState::close(RuntimeState* state, Status exec_status) { - if (Base::_closed) { - return Status::OK(); - } - SCOPED_TIMER(_close_timer); - SCOPED_TIMER(exec_time_counter()); - if (_closed) { - return _close_status; - } - _close_status = Base::close(state, exec_status); - return _close_status; -} - } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/iceberg_table_sink_operator.h b/be/src/pipeline/exec/iceberg_table_sink_operator.h index 09df1c20b403c0b..dd93d6934e1ac02 100644 --- a/be/src/pipeline/exec/iceberg_table_sink_operator.h +++ b/be/src/pipeline/exec/iceberg_table_sink_operator.h @@ -38,12 +38,7 @@ class IcebergTableSinkLocalState final SCOPED_TIMER(_open_timer); return Base::open(state); } - - Status close(RuntimeState* state, Status
exec_status) override; friend class IcebergTableSinkOperatorX; - -private: - Status _close_status = Status::OK(); }; class IcebergTableSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp index 25bc28b5d432f09..1028bca7ce2ca44 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp @@ -51,7 +51,7 @@ Status MultiCastDataStreamSourceLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(Base::open(state)); - RETURN_IF_ERROR(_acquire_runtime_filter(true)); + RETURN_IF_ERROR(_acquire_runtime_filter()); auto& p = _parent->cast(); _output_expr_contexts.resize(p._output_expr_contexts.size()); for (size_t i = 0; i < p._output_expr_contexts.size(); i++) { diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index e01505b5f793967..9e44a399bd8ad01 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -61,7 +61,7 @@ NestedLoopJoinBuildSinkLocalState::NestedLoopJoinBuildSinkLocalState(DataSinkOpe Status NestedLoopJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(JoinBuildSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); _shared_state->join_op_variants = p._join_op_variants; _runtime_filters.resize(p._runtime_filter_descs.size()); diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index bee550f1db5291d..95bfd3980417b91 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -129,6 +129,8 @@ Status OlapScanLocalState::_init_profile() { _inverted_index_query_cache_miss_counter = ADD_COUNTER(_segment_profile, "InvertedIndexQueryCacheMiss", TUnit::UNIT); _inverted_index_query_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryTime"); + _inverted_index_query_null_bitmap_timer = + ADD_TIMER(_segment_profile, "InvertedIndexQueryNullBitmapTime"); _inverted_index_query_bitmap_copy_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapCopyTime"); _inverted_index_query_bitmap_op_timer = @@ -137,6 +139,10 @@ Status OlapScanLocalState::_init_profile() { ADD_TIMER(_segment_profile, "InvertedIndexSearcherOpenTime"); _inverted_index_searcher_search_timer = ADD_TIMER(_segment_profile, "InvertedIndexSearcherSearchTime"); + _inverted_index_searcher_cache_hit_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexSearcherCacheHit", TUnit::UNIT); + _inverted_index_searcher_cache_miss_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexSearcherCacheMiss", TUnit::UNIT); _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTimer"); diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index ca98b17118999fb..83f838dd0fc47c6 100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -174,6 +174,7 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_filter_counter = nullptr; RuntimeProfile::Counter* _inverted_index_filter_timer = nullptr; + RuntimeProfile::Counter* 
_inverted_index_query_null_bitmap_timer = nullptr; RuntimeProfile::Counter* _inverted_index_query_cache_hit_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_cache_miss_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_timer = nullptr; @@ -181,6 +182,8 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_query_bitmap_op_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_open_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_search_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_cache_hit_counter = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_cache_miss_counter = nullptr; RuntimeProfile::Counter* _output_index_result_column_timer = nullptr; diff --git a/be/src/pipeline/exec/olap_table_sink_operator.h b/be/src/pipeline/exec/olap_table_sink_operator.h index 74decf9c2787292..5eafc2ea25fe2fd 100644 --- a/be/src/pipeline/exec/olap_table_sink_operator.h +++ b/be/src/pipeline/exec/olap_table_sink_operator.h @@ -32,11 +32,7 @@ class OlapTableSinkLocalState final ENABLE_FACTORY_CREATOR(OlapTableSinkLocalState); OlapTableSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) : Base(parent, state) {}; - Status close(RuntimeState* state, Status exec_status) override; friend class OlapTableSinkOperatorX; - -private: - Status _close_status = Status::OK(); }; class OlapTableSinkOperatorX final : public DataSinkOperatorX { public: diff --git a/be/src/pipeline/exec/olap_table_sink_v2_operator.h b/be/src/pipeline/exec/olap_table_sink_v2_operator.h index 2cd82016f9e7251..4ffd062f99eadc0 100644 --- a/be/src/pipeline/exec/olap_table_sink_v2_operator.h +++ b/be/src/pipeline/exec/olap_table_sink_v2_operator.h @@ -32,11 +32,7 @@ class OlapTableSinkV2LocalState final ENABLE_FACTORY_CREATOR(OlapTableSinkV2LocalState); OlapTableSinkV2LocalState(DataSinkOperatorXBase* parent, RuntimeState* state) : Base(parent, state) {}; - Status close(RuntimeState* state, Status exec_status) override; friend class OlapTableSinkV2OperatorX; - -private: - Status _close_status = Status::OK(); }; class OlapTableSinkV2OperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/operator.cpp b/be/src/pipeline/exec/operator.cpp index 8a13fa1f5701b73..5f9a904abf23cff 100644 --- a/be/src/pipeline/exec/operator.cpp +++ b/be/src/pipeline/exec/operator.cpp @@ -261,7 +261,7 @@ Status OperatorXBase::do_projections(RuntimeState* state, vectorized::Block* ori vectorized::Block input_block = *origin_block; std::vector result_column_ids; - for (const auto& projections : _intermediate_projections) { + for (const auto& projections : local_state->_intermediate_projections) { result_column_ids.resize(projections.size()); for (int i = 0; i < projections.size(); i++) { RETURN_IF_ERROR(projections[i]->execute(&input_block, &result_column_ids[i])); @@ -452,7 +452,7 @@ Status PipelineXLocalState::init(RuntimeState* state, LocalState DCHECK(info.le_state_map.find(_parent->operator_id()) != info.le_state_map.end()); _shared_state = info.le_state_map.at(_parent->operator_id()).first.get(); - _dependency = _shared_state->get_dep_by_channel_id(info.task_idx); + _dependency = _shared_state->get_dep_by_channel_id(info.task_idx).front().get(); _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL( _runtime_profile, "WaitForDependency[" + _dependency->name() + "]Time", 1); } else if (info.shared_state) { @@ -538,7 +538,7 @@ Status PipelineXSinkLocalState::init(RuntimeState* state, LocalSink if 
constexpr (std::is_same_v) { DCHECK(info.le_state_map.find(_parent->dests_id().front()) != info.le_state_map.end()); _dependency = info.le_state_map.at(_parent->dests_id().front()).second.get(); - _shared_state = (SharedState*)_dependency->shared_state(); + _shared_state = _dependency->shared_state()->template cast(); } else { _shared_state = info.shared_state->template cast(); _dependency = _shared_state->create_sink_dependency( @@ -621,10 +621,10 @@ template requires(std::is_base_of_v) Status AsyncWriterSink::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - _writer.reset(new Writer(info.tsink, _output_vexpr_ctxs)); - _async_writer_dependency = - AsyncWriterDependency::create_shared(_parent->operator_id(), _parent->node_id()); - _writer->set_dependency(_async_writer_dependency.get(), _finish_dependency.get()); + _async_writer_dependency = Dependency::create_shared(_parent->operator_id(), _parent->node_id(), + "AsyncWriterDependency", true); + _writer.reset(new Writer(info.tsink, _output_vexpr_ctxs, _async_writer_dependency, + _finish_dependency)); _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL( _profile, "WaitForDependency[" + _async_writer_dependency->name() + "]Time", 1); diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index 2db981ba88e80b3..9d5496904617ae9 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -871,8 +871,7 @@ class AsyncWriterSink : public PipelineXSinkLocalState { vectorized::VExprContextSPtrs _output_vexpr_ctxs; std::unique_ptr _writer; - std::shared_ptr _async_writer_dependency; - + std::shared_ptr _async_writer_dependency; std::shared_ptr _finish_dependency; }; diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.cpp b/be/src/pipeline/exec/partition_sort_sink_operator.cpp index 62dafd548492054..0c165350613ac2c 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_sink_operator.cpp @@ -99,7 +99,7 @@ Status PartitionBlocks::do_partition_topn_sort() { Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); RETURN_IF_ERROR(p._vsort_exec_exprs.clone(state, _vsort_exec_exprs)); _partition_expr_ctxs.resize(p._partition_expr_ctxs.size()); @@ -108,13 +108,13 @@ Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo RETURN_IF_ERROR(p._partition_expr_ctxs[i]->clone(state, _partition_expr_ctxs[i])); } _partition_exprs_num = p._partition_exprs_num; - _partitioned_data = std::make_unique(); - _agg_arena_pool = std::make_unique(); _hash_table_size_counter = ADD_COUNTER(_profile, "HashTableSize", TUnit::UNIT); _build_timer = ADD_TIMER(_profile, "HashTableBuildTime"); _selector_block_timer = ADD_TIMER(_profile, "SelectorBlockTime"); _emplace_key_timer = ADD_TIMER(_profile, "EmplaceKeyTime"); _passthrough_rows_counter = ADD_COUNTER(_profile, "PassThroughRowsCounter", TUnit::UNIT); + _sorted_partition_input_rows_counter = + ADD_COUNTER(_profile, "SortedPartitionInputRows", TUnit::UNIT); _partition_sort_info = std::make_shared( &_vsort_exec_exprs, p._limit, 0, p._pool, p._is_asc_order, p._nulls_first, p._child_x->row_desc(), state, _profile, p._has_global_limit, p._partition_inner_limit, @@ -173,7 +173,6 @@ Status PartitionSortSinkOperatorX::sink(RuntimeState* state, 
vectorized::Block* SCOPED_TIMER(local_state.exec_time_counter()); if (current_rows > 0) { COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); - local_state.child_input_rows = local_state.child_input_rows + current_rows; if (UNLIKELY(_partition_exprs_num == 0)) { if (UNLIKELY(local_state._value_places.empty())) { local_state._value_places.push_back(_pool->add(new PartitionBlocks( @@ -185,10 +184,9 @@ Status PartitionSortSinkOperatorX::sink(RuntimeState* state, vectorized::Block* //if is TWO_PHASE_GLOBAL, must be sort all data thought partition num threshold have been exceeded. if (_topn_phase != TPartTopNPhase::TWO_PHASE_GLOBAL && local_state._num_partition > config::partition_topn_partition_threshold && - local_state.child_input_rows < 10000 * local_state._num_partition) { + local_state._sorted_partition_input_rows < 10000 * local_state._num_partition) { { - COUNTER_UPDATE(local_state._passthrough_rows_counter, - (int64_t)input_block->rows()); + COUNTER_UPDATE(local_state._passthrough_rows_counter, (int64_t)current_rows); std::lock_guard lock(local_state._shared_state->buffer_mutex); local_state._shared_state->blocks_buffer.push(std::move(*input_block)); // buffer have data, source could read this. @@ -198,6 +196,8 @@ Status PartitionSortSinkOperatorX::sink(RuntimeState* state, vectorized::Block* RETURN_IF_ERROR(_split_block_by_partition(input_block, local_state, eos)); RETURN_IF_CANCELLED(state); input_block->clear_column_data(); + local_state._sorted_partition_input_rows = + local_state._sorted_partition_input_rows + current_rows; } } } @@ -220,6 +220,8 @@ Status PartitionSortSinkOperatorX::sink(RuntimeState* state, vectorized::Block* } COUNTER_SET(local_state._hash_table_size_counter, int64_t(local_state._num_partition)); + COUNTER_SET(local_state._sorted_partition_input_rows_counter, + local_state._sorted_partition_input_rows); //so all data from child have sink completed { std::unique_lock lc(local_state._shared_state->sink_eos_lock); diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.h b/be/src/pipeline/exec/partition_sort_sink_operator.h index b7e83763f1dd944..5c1484ed3bc7327 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.h +++ b/be/src/pipeline/exec/partition_sort_sink_operator.h @@ -214,7 +214,9 @@ class PartitionSortSinkLocalState : public PipelineXSinkLocalState(parent, state) {} + : PipelineXSinkLocalState(parent, state), + _partitioned_data(std::make_unique()), + _agg_arena_pool(std::make_unique()) {} Status init(RuntimeState* state, LocalSinkStateInfo& info) override; @@ -224,7 +226,7 @@ class PartitionSortSinkLocalState : public PipelineXSinkLocalState _value_places; int _num_partition = 0; std::vector _partition_columns; @@ -238,6 +240,7 @@ class PartitionSortSinkLocalState : public PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); _get_sorted_timer = ADD_TIMER(profile(), "GetSortedTime"); + _sorted_partition_output_rows_counter = + ADD_COUNTER(profile(), "SortedPartitionOutputRows", TUnit::UNIT); return Status::OK(); } @@ -57,7 +59,7 @@ Status PartitionSortSourceOperatorX::get_block(RuntimeState* state, vectorized:: } if (!output_block->empty()) { COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); - COUNTER_UPDATE(local_state.rows_returned_counter(), output_block->rows()); + local_state._num_rows_returned += output_block->rows(); } return Status::OK(); } @@ -79,7 +81,7 @@ Status 
PartitionSortSourceOperatorX::get_block(RuntimeState* state, vectorized:: } if (!output_block->empty()) { COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); - COUNTER_UPDATE(local_state.rows_returned_counter(), output_block->rows()); + local_state._num_rows_returned += output_block->rows(); } return Status::OK(); } @@ -98,7 +100,7 @@ Status PartitionSortSourceOperatorX::get_sorted_block(RuntimeState* state, //current sort have eos, so get next idx auto rows = local_state._shared_state->partition_sorts[local_state._sort_idx] ->get_output_rows(); - local_state._num_rows_returned += rows; + COUNTER_UPDATE(local_state._sorted_partition_output_rows_counter, rows); local_state._shared_state->partition_sorts[local_state._sort_idx].reset(nullptr); local_state._sort_idx++; } diff --git a/be/src/pipeline/exec/partition_sort_source_operator.h b/be/src/pipeline/exec/partition_sort_source_operator.h index 4b5589c0e8f0cd1..1f75e1f49d4cf76 100644 --- a/be/src/pipeline/exec/partition_sort_source_operator.h +++ b/be/src/pipeline/exec/partition_sort_source_operator.h @@ -34,14 +34,14 @@ class PartitionSortSourceLocalState final ENABLE_FACTORY_CREATOR(PartitionSortSourceLocalState); using Base = PipelineXLocalState; PartitionSortSourceLocalState(RuntimeState* state, OperatorXBase* parent) - : PipelineXLocalState(state, parent), - _get_sorted_timer(nullptr) {} + : PipelineXLocalState(state, parent) {} Status init(RuntimeState* state, LocalStateInfo& info) override; private: friend class PartitionSortSourceOperatorX; - RuntimeProfile::Counter* _get_sorted_timer; + RuntimeProfile::Counter* _get_sorted_timer = nullptr; + RuntimeProfile::Counter* _sorted_partition_output_rows_counter = nullptr; std::atomic _sort_idx = 0; }; diff --git a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp index 4399f3c7045c003..980217fe6958911 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp @@ -34,6 +34,7 @@ PartitionedAggSinkLocalState::PartitionedAggSinkLocalState(DataSinkOperatorXBase std::make_shared(parent->operator_id(), parent->node_id(), parent->get_name() + "_SPILL_DEPENDENCY", true); } + Status PartitionedAggSinkLocalState::init(doris::RuntimeState* state, doris::pipeline::LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -66,6 +67,7 @@ Status PartitionedAggSinkLocalState::open(RuntimeState* state) { SCOPED_TIMER(Base::_open_timer); return Base::open(state); } + Status PartitionedAggSinkLocalState::close(RuntimeState* state, Status exec_status) { SCOPED_TIMER(Base::exec_time_counter()); SCOPED_TIMER(Base::_close_timer); @@ -210,7 +212,7 @@ Status PartitionedAggSinkLocalState::setup_in_memory_agg_op(RuntimeState* state) _runtime_state->set_be_number(state->be_number()); _runtime_state->set_desc_tbl(&state->desc_tbl()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + _runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); _runtime_state->set_task_id(state->task_id()); auto& parent = Base::_parent->template cast(); diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp index a8c4e7b0bcc53ef..5e030e7ab49d10f 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp @@ -36,18 +36,20 @@ 
PartitionedAggLocalState::PartitionedAggLocalState(RuntimeState* state, Operator Status PartitionedAggLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); _init_counters(); return Status::OK(); } Status PartitionedAggLocalState::open(RuntimeState* state) { + RETURN_IF_ERROR(Base::open(state)); + SCOPED_TIMER(_open_timer); if (_opened) { return Status::OK(); } _opened = true; RETURN_IF_ERROR(setup_in_memory_agg_op(state)); - return Base::open(state); + return Status::OK(); } void PartitionedAggLocalState::_init_counters() { @@ -172,7 +174,7 @@ Status PartitionedAggLocalState::setup_in_memory_agg_op(RuntimeState* state) { _runtime_state->set_desc_tbl(&state->desc_tbl()); _runtime_state->resize_op_id_to_local_state(state->max_operator_id()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + _runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); auto& parent = Base::_parent->template cast(); diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp index 8118b669ef84bcb..6dc1616e0eb6893 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp @@ -625,8 +625,7 @@ Status PartitionedHashJoinProbeOperatorX::_setup_internal_operators( local_state._runtime_state->set_desc_tbl(&state->desc_tbl()); local_state._runtime_state->resize_op_id_to_local_state(-1); - local_state._runtime_state->set_pipeline_x_runtime_filter_mgr( - state->local_runtime_filter_mgr()); + local_state._runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); local_state._in_mem_shared_state_sptr = _inner_sink_operator->create_shared_state(); diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp index 1aeb9213d83ee73..fc17ef41be62c80 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp @@ -447,7 +447,7 @@ Status PartitionedHashJoinSinkOperatorX::_setup_internal_operator(RuntimeState* local_state._shared_state->inner_runtime_state->set_desc_tbl(&state->desc_tbl()); local_state._shared_state->inner_runtime_state->resize_op_id_to_local_state(-1); - local_state._shared_state->inner_runtime_state->set_pipeline_x_runtime_filter_mgr( + local_state._shared_state->inner_runtime_state->set_runtime_filter_mgr( state->local_runtime_filter_mgr()); local_state._shared_state->inner_shared_state = std::dynamic_pointer_cast( diff --git a/be/src/pipeline/exec/result_file_sink_operator.cpp b/be/src/pipeline/exec/result_file_sink_operator.cpp index 029bea7494ef581..8871a299cbb16ed 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.cpp +++ b/be/src/pipeline/exec/result_file_sink_operator.cpp @@ -85,7 +85,7 @@ Status ResultFileSinkOperatorX::open(RuntimeState* state) { Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); + SCOPED_TIMER(_init_timer); _sender_id = info.sender_id; _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); @@ -105,7 +105,7 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i _writer.reset(new 
(std::nothrow) vectorized::VFileResultWriter( p._file_opts.get(), p._storage_type, state->fragment_instance_id(), _output_vexpr_ctxs, _sender, nullptr, state->return_object_data_as_binary(), - p._output_row_descriptor)); + p._output_row_descriptor, _async_writer_dependency, _finish_dependency)); } else { // init channel _output_block = vectorized::Block::create_unique( @@ -113,7 +113,8 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i _writer.reset(new (std::nothrow) vectorized::VFileResultWriter( p._file_opts.get(), p._storage_type, state->fragment_instance_id(), _output_vexpr_ctxs, nullptr, _output_block.get(), - state->return_object_data_as_binary(), p._output_row_descriptor)); + state->return_object_data_as_binary(), p._output_row_descriptor, + _async_writer_dependency, _finish_dependency)); std::map fragment_id_to_channel_index; for (int i = 0; i < p._dests.size(); ++i) { @@ -129,7 +130,6 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i RETURN_IF_ERROR(_channel->init_stub(state)); } } - _writer->set_dependency(_async_writer_dependency.get(), _finish_dependency.get()); _writer->set_header_info(p._header_type, p._header); return Status::OK(); } diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index bc9e6a4f42931c5..70d45e743cf4dca 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -22,6 +22,7 @@ #include #include +#include "common/config.h" #include "common/object_pool.h" #include "exec/rowid_fetcher.h" #include "pipeline/exec/operator.h" @@ -50,9 +51,10 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) if (state->query_options().enable_parallel_result_sink) { _sender = _parent->cast()._sender; } else { + auto& p = _parent->cast(); RETURN_IF_ERROR(state->exec_env()->result_mgr()->create_sender( - fragment_instance_id, RESULT_SINK_BUFFER_SIZE, &_sender, state->execution_timeout(), - state->batch_size())); + fragment_instance_id, p._result_sink_buffer_size_rows, &_sender, + state->execution_timeout(), state->batch_size())); } _sender->set_dependency(fragment_instance_id, _dependency->shared_from_this()); return Status::OK(); @@ -110,6 +112,11 @@ ResultSinkOperatorX::ResultSinkOperatorX(int operator_id, const RowDescriptor& r } else { _sink_type = sink.type; } + if (_sink_type == TResultSinkType::ARROW_FLIGHT_PROTOCAL) { + _result_sink_buffer_size_rows = config::arrow_flight_result_sink_buffer_size_rows; + } else { + _result_sink_buffer_size_rows = RESULT_SINK_BUFFER_SIZE; + } _fetch_option = sink.fetch_option; _name = "ResultSink"; } @@ -129,8 +136,8 @@ Status ResultSinkOperatorX::prepare(RuntimeState* state) { if (state->query_options().enable_parallel_result_sink) { RETURN_IF_ERROR(state->exec_env()->result_mgr()->create_sender( - state->query_id(), RESULT_SINK_BUFFER_SIZE, &_sender, state->execution_timeout(), - state->batch_size())); + state->query_id(), _result_sink_buffer_size_rows, &_sender, + state->execution_timeout(), state->batch_size())); } return Status::OK(); } diff --git a/be/src/pipeline/exec/result_sink_operator.h b/be/src/pipeline/exec/result_sink_operator.h index 7ec7d43ec2b03a1..06b961b2a316941 100644 --- a/be/src/pipeline/exec/result_sink_operator.h +++ b/be/src/pipeline/exec/result_sink_operator.h @@ -152,6 +152,7 @@ class ResultSinkOperatorX final : public DataSinkOperatorX Status _second_phase_fetch_data(RuntimeState* state, vectorized::Block* 
final_block); TResultSinkType::type _sink_type; + int _result_sink_buffer_size_rows; // set file options when sink type is FILE std::unique_ptr _file_opts = nullptr; diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index fcb50f1f70f18b2..381f18e17281abe 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -102,7 +102,7 @@ Status ScanLocalState::open(RuntimeState* state) { RETURN_IF_ERROR( p._common_expr_ctxs_push_down[i]->clone(state, _common_expr_ctxs_push_down[i])); } - RETURN_IF_ERROR(_acquire_runtime_filter(true)); + RETURN_IF_ERROR(_acquire_runtime_filter()); _stale_expr_ctxs.resize(p._stale_expr_ctxs.size()); for (size_t i = 0; i < _stale_expr_ctxs.size(); i++) { RETURN_IF_ERROR(p._stale_expr_ctxs[i]->clone(state, _stale_expr_ctxs[i])); @@ -1223,8 +1223,9 @@ Status ScanLocalState::_start_scanners( state(), this, p._output_tuple_desc, p.output_row_descriptor(), scanners, p.limit(), state()->scan_queue_mem_limit(), _scan_dependency, // 1. If data distribution is ignored , we use 1 instance to scan. - // 2. Else, file scanner will consume much memory so we use config::doris_scanner_thread_pool_thread_num / query_parallel_instance_num scanners to scan. - p.ignore_data_distribution() && !p.is_file_scan_operator() + // 2. Else if this operator is not file scan operator, we use config::doris_scanner_thread_pool_thread_num scanners to scan. + // 3. Else, file scanner will consume much memory so we use config::doris_scanner_thread_pool_thread_num / query_parallel_instance_num scanners to scan. + p.ignore_data_distribution() || !p.is_file_scan_operator() ? 1 : state()->query_parallel_instance_num()); return Status::OK(); @@ -1392,13 +1393,9 @@ Status ScanOperatorX::init(const TPlanNode& tnode, RuntimeState* const TQueryOptions& query_options = state->query_options(); if (query_options.__isset.max_scan_key_num) { _max_scan_key_num = query_options.max_scan_key_num; - } else { - _max_scan_key_num = config::doris_max_scan_key_num; } if (query_options.__isset.max_pushdown_conditions_per_column) { _max_pushdown_conditions_per_column = query_options.max_pushdown_conditions_per_column; - } else { - _max_pushdown_conditions_per_column = config::max_pushdown_conditions_per_column; } // tnode.olap_scan_node.push_down_agg_type_opt field is deprecated // Introduced a new field : tnode.push_down_agg_type_opt @@ -1478,12 +1475,7 @@ Status ScanOperatorX::get_block(RuntimeState* state, vectorized: // remove them when query leave scan node to avoid other nodes use block->columns() to make a wrong decision Defer drop_block_temp_column {[&]() { std::unique_lock l(local_state._block_lock); - auto all_column_names = block->get_names(); - for (auto& name : all_column_names) { - if (name.rfind(BeConsts::BLOCK_TEMP_COLUMN_PREFIX, 0) == 0) { - block->erase(name); - } - } + block->erase_tmp_columns(); }}; if (state->is_cancelled()) { diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index 45b9a382e9b7d94..cbbeb75998d6b79 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -451,8 +451,8 @@ class ScanOperatorX : public OperatorX { std::unordered_map _colname_to_slot_id; // These two values are from query_options - int _max_scan_key_num; - int _max_pushdown_conditions_per_column; + int _max_scan_key_num = 48; + int _max_pushdown_conditions_per_column = 1024; // If the query like select * from table limit 10; then the query should run in // single 
scanner to avoid too many scanners which will cause lots of useless read. diff --git a/be/src/pipeline/exec/schema_scan_operator.cpp b/be/src/pipeline/exec/schema_scan_operator.cpp index f46589880958ee8..8ff0a570e3393e5 100644 --- a/be/src/pipeline/exec/schema_scan_operator.cpp +++ b/be/src/pipeline/exec/schema_scan_operator.cpp @@ -48,6 +48,7 @@ Status SchemaScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { // new one scanner _schema_scanner = SchemaScanner::create(schema_table->schema_table_type()); if (nullptr == _schema_scanner) { return Status::InternalError("schema scanner get nullptr pointer."); } + _schema_scanner->set_dependency(_data_dependency, _finish_dependency); @@ -59,7 +60,7 @@ Status SchemaScanLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(PipelineXLocalState<>::open(state)); - return _schema_scanner->start(state); + return _schema_scanner->get_next_block_async(state); } SchemaScanOperatorX::SchemaScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, @@ -119,6 +120,17 @@ Status SchemaScanOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { _common_scanner_param->catalog = state->obj_pool()->add(new std::string(tnode.schema_scan_node.catalog)); } + + if (tnode.schema_scan_node.__isset.fe_addr_list) { + for (const auto& fe_addr : tnode.schema_scan_node.fe_addr_list) { + _common_scanner_param->fe_addr_list.insert(fe_addr); + } + } else if (tnode.schema_scan_node.__isset.ip && tnode.schema_scan_node.__isset.port) { + TNetworkAddress fe_addr; + fe_addr.hostname = tnode.schema_scan_node.ip; + fe_addr.port = tnode.schema_scan_node.port; + _common_scanner_param->fe_addr_list.insert(fe_addr); + } return Status::OK(); } @@ -226,8 +238,12 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, vectorized::Block* bl while (true) { RETURN_IF_CANCELLED(state); + if (local_state._data_dependency->is_blocked_by() != nullptr) { + break; + }
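The early break above is one half of the asynchronous protocol this patch introduces for schema scans: a scanner thread fills a block queue and flips `_data_dependency`, while `get_block` drains whatever is queued and yields once the dependency blocks again instead of spinning. A minimal sketch of that handshake, with a hypothetical `SimpleDependency` standing in for Doris's real `Dependency` class:

    #include <condition_variable>
    #include <deque>
    #include <iostream>
    #include <mutex>
    #include <thread>

    // Hypothetical stand-in for the data dependency: the async scanner marks it
    // ready after queueing a block; the operator blocks it when the queue is empty.
    struct SimpleDependency {
        std::mutex m;
        std::condition_variable cv;
        bool ready = false;
        void set_ready() {
            std::lock_guard<std::mutex> l(m);
            ready = true;
            cv.notify_one();
        }
        void wait_ready() {
            std::unique_lock<std::mutex> l(m);
            cv.wait(l, [&] { return ready; });
        }
        void block() {
            std::lock_guard<std::mutex> l(m);
            ready = false;
        }
    };

    int main() {
        std::mutex queue_mutex;
        std::deque<int> blocks; // stands in for the scanner's block queue
        bool scanner_eos = false;
        SimpleDependency data_dep;

        // Producer: plays the role of the get_next_block_async() worker.
        std::thread scanner([&] {
            for (int i = 0; i < 3; ++i) {
                {
                    std::lock_guard<std::mutex> l(queue_mutex);
                    blocks.push_back(i);
                }
                data_dep.set_ready();
            }
            {
                std::lock_guard<std::mutex> l(queue_mutex);
                scanner_eos = true;
            }
            data_dep.set_ready();
        });

        // Consumer: plays the role of get_block() on the operator side.
        while (true) {
            data_dep.wait_ready();
            std::lock_guard<std::mutex> l(queue_mutex);
            while (!blocks.empty()) {
                std::cout << "got block " << blocks.front() << '\n';
                blocks.pop_front();
            }
            if (scanner_eos) {
                break;
            }
            data_dep.block(); // queue drained: yield until the scanner signals again
        }
        scanner.join();
        return 0;
    }

The same pairing appears in the hunks around here: `set_dependency()` hands both latches to the scanner, and `_finish_dependency` keeps the pipeline task alive until the producer is done.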
// get all slots from schema table. - RETURN_IF_ERROR(local_state._schema_scanner->get_next_block(&src_block, &schema_eos)); + RETURN_IF_ERROR( + local_state._schema_scanner->get_next_block(state, &src_block, &schema_eos)); if (schema_eos) { *eos = true; diff --git a/be/src/pipeline/exec/schema_scan_operator.h b/be/src/pipeline/exec/schema_scan_operator.h index 8f2b73f5123f0df..aa2bff7e6440a24 100644 --- a/be/src/pipeline/exec/schema_scan_operator.h +++ b/be/src/pipeline/exec/schema_scan_operator.h @@ -35,18 +35,30 @@ class SchemaScanLocalState final : public PipelineXLocalState<> { ENABLE_FACTORY_CREATOR(SchemaScanLocalState); SchemaScanLocalState(RuntimeState* state, OperatorXBase* parent) - : PipelineXLocalState<>(state, parent) {} + : PipelineXLocalState<>(state, parent) { + _finish_dependency = + std::make_shared<Dependency>(parent->operator_id(), parent->node_id(), + parent->get_name() + "_FINISH_DEPENDENCY", true); + _data_dependency = std::make_shared<Dependency>(parent->operator_id(), parent->node_id(), + parent->get_name() + "_DEPENDENCY", true); + } ~SchemaScanLocalState() override = default; Status init(RuntimeState* state, LocalStateInfo& info) override; Status open(RuntimeState* state) override; + Dependency* finishdependency() override { return _finish_dependency.get(); } + std::vector<Dependency*> dependencies() const override { return {_data_dependency.get()}; } + private: friend class SchemaScanOperatorX; SchemaScannerParam _scanner_param; std::unique_ptr<SchemaScanner> _schema_scanner; + + std::shared_ptr<Dependency> _finish_dependency; + std::shared_ptr<Dependency> _data_dependency; }; class SchemaScanOperatorX final : public OperatorX { diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 5fc38f3ca706acf..6c76f9a57a3ee27 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -130,9 +130,13 @@ Status SetSinkOperatorX::_extract_build_column( block.get_by_position(result_col_id).column = block.get_by_position(result_col_id).column->convert_to_full_column_if_const(); } + // Making a column nullable must not modify the original column and type in the + // origin block; doing so in place can cause a coredump. if (local_state._shared_state->build_not_ignore_null[i]) { - block.get_by_position(result_col_id).column = - make_nullable(block.get_by_position(result_col_id).column); + auto column_ptr = make_nullable(block.get_by_position(result_col_id).column, false); + block.insert( + {column_ptr, make_nullable(block.get_by_position(result_col_id).type), ""}); + result_col_id = block.columns() - 1; } raw_ptrs[i] = block.get_by_position(result_col_id).column.get(); diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp b/be/src/pipeline/exec/sort_sink_operator.cpp index 7230116a1a070a0..5a29b0bfe614fb1 100644 --- a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -100,7 +100,7 @@ Status SortSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { auto* query_ctx = state->get_query_ctx(); // init runtime predicate - if (query_ctx->has_runtime_predicate(_node_id)) { + if (query_ctx->has_runtime_predicate(_node_id) && _algorithm == TSortAlgorithm::HEAP_SORT) { query_ctx->get_runtime_predicate(_node_id).set_detected_source(); } return Status::OK(); } diff --git a/be/src/pipeline/exec/spill_sort_sink_operator.cpp b/be/src/pipeline/exec/spill_sort_sink_operator.cpp index 4c6eb290ef11f30..94196a0354e5cfd 100644 --- a/be/src/pipeline/exec/spill_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_sink_operator.cpp @@ -84,7 +84,7 @@ Status
SpillSortSinkLocalState::setup_in_memory_sort_op(RuntimeState* state) { _runtime_state->set_be_number(state->be_number()); _runtime_state->set_desc_tbl(&state->desc_tbl()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + _runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); auto& parent = Base::_parent->template cast(); Base::_shared_state->in_mem_shared_state_sptr = diff --git a/be/src/pipeline/exec/spill_sort_source_operator.cpp b/be/src/pipeline/exec/spill_sort_source_operator.cpp index 72304291f6dcfe2..967e13d1fa527b1 100644 --- a/be/src/pipeline/exec/spill_sort_source_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_source_operator.cpp @@ -219,7 +219,7 @@ Status SpillSortLocalState::setup_in_memory_sort_op(RuntimeState* state) { _runtime_state->set_desc_tbl(&state->desc_tbl()); _runtime_state->resize_op_id_to_local_state(state->max_operator_id()); - _runtime_state->set_pipeline_x_runtime_filter_mgr(state->local_runtime_filter_mgr()); + _runtime_state->set_runtime_filter_mgr(state->local_runtime_filter_mgr()); DCHECK(_shared_state->in_mem_shared_state); LocalStateInfo state_info { diff --git a/be/src/pipeline/exec/spill_utils.h b/be/src/pipeline/exec/spill_utils.h index 635a6a6bbbcf8a8..925e7df44e607e5 100644 --- a/be/src/pipeline/exec/spill_utils.h +++ b/be/src/pipeline/exec/spill_utils.h @@ -40,17 +40,18 @@ class SpillRunnable : public Runnable { ~SpillRunnable() override = default; void run() override { + // Lock the task context before attaching the task scope, because _state may + // already be destroyed by the time run() is called. + auto task_context_holder = _task_context_holder.lock(); + if (!task_context_holder) { + return; + } SCOPED_ATTACH_TASK(_state); Defer defer([&] { std::function<void()> tmp; std::swap(tmp, _func); }); - auto task_context_holder = _task_context_holder.lock(); - if (!task_context_holder) { - return; - } - auto shared_state_holder = _shared_state_holder.lock(); if (!shared_state_holder) { return; diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp index ba51a2da39be70a..91a2c630418194a 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp @@ -27,8 +27,9 @@ LocalExchangeSinkLocalState::~LocalExchangeSinkLocalState() = default; std::vector<Dependency*> LocalExchangeSinkLocalState::dependencies() const { auto deps = Base::dependencies(); - auto exchanger_deps = _exchanger->local_sink_state_dependency(_channel_id); - for (auto* dep : exchanger_deps) { + + auto dep = _shared_state->get_sink_dep_by_channel_id(_channel_id); + if (dep != nullptr) { deps.push_back(dep); } return deps; } @@ -39,7 +40,7 @@ Status LocalExchangeSinkOperatorX::init(ExchangeType type, const int num_buckets const std::map<int, int>& shuffle_idx_to_instance_idx) { _name = "LOCAL_EXCHANGE_SINK_OPERATOR (" + get_exchange_type_name(type) + ")"; _type = type; - if (_type == ExchangeType::HASH_SHUFFLE) { + if (_type == ExchangeType::HASH_SHUFFLE || _type == ExchangeType::BUCKET_HASH_SHUFFLE) { // For shuffle join, if data distribution has been broken by previous operator, we // should use a HASH_SHUFFLE local exchanger to shuffle data again.
To be mentioned, // we should use map shuffle idx to instance idx because all instances will be @@ -56,17 +57,17 @@ Status LocalExchangeSinkOperatorX::init(ExchangeType type, const int num_buckets _shuffle_idx_to_instance_idx[i] = {i, i}; } } - _partitioner.reset(new vectorized::Crc32HashPartitioner( - _num_partitions)); - RETURN_IF_ERROR(_partitioner->init(_texprs)); - } else if (_type == ExchangeType::BUCKET_HASH_SHUFFLE) { _partitioner.reset( - new vectorized::Crc32HashPartitioner(num_buckets)); + _type == ExchangeType::HASH_SHUFFLE + ? new vectorized::Crc32HashPartitioner( + _num_partitions) + : new vectorized::Crc32HashPartitioner( + num_buckets)); RETURN_IF_ERROR(_partitioner->init(_texprs)); } - return Status::OK(); } + Status LocalExchangeSinkOperatorX::prepare(RuntimeState* state) { if (_type == ExchangeType::HASH_SHUFFLE || _type == ExchangeType::BUCKET_HASH_SHUFFLE) { RETURN_IF_ERROR(_partitioner->prepare(state, _child_x->row_desc())); diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h index 0ff1df260012b71..faa48d209f4b1e6 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h @@ -56,6 +56,8 @@ class LocalExchangeSinkLocalState final : public PipelineXSinkLocalState + friend class Exchanger; ExchangerBase* _exchanger = nullptr; diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp index 56f0a157cdee8de..2d20b8f365cd7dd 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp @@ -27,14 +27,6 @@ Status LocalExchangeSourceLocalState::init(RuntimeState* state, LocalStateInfo& SCOPED_TIMER(_init_timer); _channel_id = info.task_idx; _shared_state->mem_trackers[_channel_id] = _mem_tracker.get(); - return Status::OK(); -} - -Status LocalExchangeSourceLocalState::open(RuntimeState* state) { - SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); - RETURN_IF_ERROR(Base::open(state)); - _exchanger = _shared_state->exchanger.get(); DCHECK(_exchanger != nullptr); _get_block_failed_counter = @@ -44,6 +36,18 @@ Status LocalExchangeSourceLocalState::open(RuntimeState* state) { _copy_data_timer = ADD_TIMER(profile(), "CopyDataTime"); } + if (_exchanger->get_type() == ExchangeType::LOCAL_MERGE_SORT && _channel_id == 0) { + _local_merge_deps = _shared_state->get_dep_by_channel_id(_channel_id); + DCHECK_GT(_local_merge_deps.size(), 1); + _deps_counter.resize(_local_merge_deps.size()); + static const std::string timer_name = "WaitForDependencyTime"; + _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, timer_name, 1); + for (size_t i = 0; i < _deps_counter.size(); i++) { + _deps_counter[i] = _runtime_profile->add_nonzero_counter( + fmt::format("WaitForData{}", i), TUnit ::TIME_NS, timer_name, 1); + } + } + return Status::OK(); } @@ -52,34 +56,53 @@ Status LocalExchangeSourceLocalState::close(RuntimeState* state) { return Status::OK(); } + for (size_t i = 0; i < _local_merge_deps.size(); i++) { + COUNTER_SET(_deps_counter[i], _local_merge_deps[i]->watcher_elapse_time()); + } + if (_exchanger) { _exchanger->close(*this); } if (_shared_state) { - _shared_state->sub_running_source_operators(); + _shared_state->sub_running_source_operators(*this); } + std::vector {}.swap(_local_merge_deps); return Base::close(state); } 
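Before the dependencies() override that follows, it helps to spell out the scheduling rule the patch encodes for LOCAL_MERGE_SORT: only channel 0 runs the k-way merger, so it alone must wait on every sink's queue dependency, while the remaining channels produce nothing. A hedged sketch of that fan-in selection, using illustrative stand-ins rather than the actual Doris classes:

    #include <memory>
    #include <vector>

    // Illustrative stand-ins; the real Doris Dependency/Exchanger types differ.
    struct Dependency {};
    using DependencySPtr = std::shared_ptr<Dependency>;

    enum class ExchangeType { LOCAL_MERGE_SORT, OTHER };

    // Channel 0 drives the k-way merger, so it must wait on every sink queue's
    // dependency; the other channels produce nothing and report no dependencies.
    std::vector<Dependency*> source_dependencies(ExchangeType type, int channel_id,
                                                 const std::vector<DependencySPtr>& queue_deps,
                                                 Dependency* default_dep) {
        if (type != ExchangeType::LOCAL_MERGE_SORT) {
            return {default_dep};
        }
        if (channel_id != 0) {
            return {};  // non-merging channels just finish with an empty result
        }
        std::vector<Dependency*> deps;
        deps.reserve(queue_deps.size());
        for (const auto& dep : queue_deps) {
            deps.push_back(dep.get());
        }
        return deps;
    }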
std::vector<Dependency*> LocalExchangeSourceLocalState::dependencies() const { - auto deps = Base::dependencies(); - auto exchanger_deps = _exchanger->local_state_dependency(_channel_id); - for (auto* dep : exchanger_deps) { - deps.push_back(dep); + if (_exchanger->get_type() == ExchangeType::LOCAL_MERGE_SORT && _channel_id == 0) { + // If this is a local merge exchange, the source operator is runnable only after all + // sink operators have set their dependencies ready. + std::vector<Dependency*> deps; + auto le_deps = _shared_state->get_dep_by_channel_id(_channel_id); + DCHECK_GT(_local_merge_deps.size(), 1); + // If this is a local merge exchange, we should use all dependencies here. + for (auto& dep : _local_merge_deps) { + deps.push_back(dep.get()); + } + return deps; + } else if (_exchanger->get_type() == ExchangeType::LOCAL_MERGE_SORT && _channel_id != 0) { + // If this is a local merge exchange and this is not the first task, the source operator + // always returns an empty result, so there are no dependencies here. + return {}; + } else { + return Base::dependencies(); } - return deps; } std::string LocalExchangeSourceLocalState::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, "{}, _channel_id: {}, _num_partitions: {}, _num_senders: {}, _num_sources: {}, " - "_running_sink_operators: {}, _running_source_operators: {}, mem_usage: {}", + "_running_sink_operators: {}, _running_source_operators: {}, mem_usage: {}, " + "data queue info: {}", Base::debug_string(indentation_level), _channel_id, _exchanger->_num_partitions, _exchanger->_num_senders, _exchanger->_num_sources, _exchanger->_running_sink_operators, _exchanger->_running_source_operators, - _shared_state->mem_usage.load()); + _shared_state->mem_usage.load(), + _exchanger->data_queue_debug_string(_channel_id)); size_t i = 0; fmt::format_to(debug_string_buffer, ", MemTrackers: "); for (auto* mem_tracker : _shared_state->mem_trackers) { diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.h b/be/src/pipeline/local_exchange/local_exchange_source_operator.h index f9fa4cfa4edfe3c..7bf92add63d7028 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.h @@ -36,7 +36,6 @@ class LocalExchangeSourceLocalState final : public PipelineXLocalState + friend class Exchanger; ExchangerBase* _exchanger = nullptr; int _channel_id; RuntimeProfile::Counter* _get_block_failed_counter = nullptr; RuntimeProfile::Counter* _copy_data_timer = nullptr; + std::vector<RuntimeProfile::Counter*> _deps_counter; + std::vector<std::shared_ptr<Dependency>> _local_merge_deps; }; class LocalExchangeSourceOperatorX final : public OperatorX { diff --git a/be/src/pipeline/local_exchange/local_exchanger.cpp b/be/src/pipeline/local_exchange/local_exchanger.cpp index 27b7fc7e7fd3f7d..be4071cef9a7ff9 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.cpp +++ b/be/src/pipeline/local_exchange/local_exchanger.cpp @@ -26,6 +26,87 @@ namespace doris::pipeline { +template <typename BlockType> +void Exchanger<BlockType>::_enqueue_data_and_set_ready(int channel_id, + LocalExchangeSinkLocalState& local_state, + BlockType&& block) { + size_t allocated_bytes = 0; + // PartitionedBlock is used by the shuffle exchanger. + // A PartitionedBlock is pushed into multiple queues with different row ranges, so it is + // referenced multiple times. Otherwise, we ref the block only once because it is pushed into + // one queue.
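Put differently, a shuffled block's lifetime is governed by one reference per destination queue, and the last consumer to unref it returns the buffer to the free-block pool. A reduced sketch of that contract with a toy `SharedBlock` type (not the patch's `BlockWrapper`):

    #include <atomic>
    #include <cassert>
    #include <deque>
    #include <utility>
    #include <vector>

    // Toy ref-counted block: the last unref() recycles the buffer into a pool.
    struct SharedBlock {
        std::vector<int> rows;
        std::atomic<int> ref_count {0};

        void ref(int delta) { ref_count += delta; }
        void unref(std::deque<std::vector<int>>& free_pool) {
            if (ref_count.fetch_sub(1) == 1) {  // this caller held the last reference
                rows.clear();                   // drop data, keep allocated capacity
                free_pool.push_back(std::move(rows));
            }
        }
    };

    int main() {
        std::deque<std::vector<int>> free_pool;
        SharedBlock blk;
        blk.rows = {1, 2, 3, 4};

        const int num_queues = 3;
        blk.ref(num_queues);       // one reference per destination queue
        for (int q = 0; q < num_queues; ++q) {
            // ... each source would copy its own row range here ...
            blk.unref(free_pool);  // only the last call recycles the buffer
        }
        assert(free_pool.size() == 1);
        return 0;
    }

The `fetch_sub(1) == 1` test is the same idea the patch's `unref()` relies on: whoever observes the count drop from 1 to 0 owns the recycling step.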
+ if constexpr (std::is_same_v || + std::is_same_v) { + allocated_bytes = block.first->data_block.allocated_bytes(); + } else { + block->ref(1); + allocated_bytes = block->data_block.allocated_bytes(); + } + std::unique_lock l(_m); + local_state._shared_state->add_mem_usage(channel_id, allocated_bytes, + !std::is_same_v); + if (_data_queue[channel_id].enqueue(std::move(block))) { + local_state._shared_state->set_ready_to_read(channel_id); + } else { + local_state._shared_state->sub_mem_usage(channel_id, allocated_bytes); + // `enqueue(block)` return false iff this queue's source operator is already closed so we + // just unref the block. + if constexpr (std::is_same_v || + std::is_same_v) { + block.first->unref(local_state._shared_state, allocated_bytes); + } else { + block->unref(local_state._shared_state, allocated_bytes); + } + } +} + +template +bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, + BlockType& block, bool* eos, + vectorized::Block* data_block) { + return _dequeue_data(local_state, block, eos, data_block, local_state._channel_id); +} + +template +bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, + BlockType& block, bool* eos, vectorized::Block* data_block, + int channel_id) { + bool all_finished = _running_sink_operators == 0; + if (_data_queue[channel_id].try_dequeue(block)) { + if constexpr (std::is_same_v || + std::is_same_v) { + local_state._shared_state->sub_mem_usage(channel_id, + block.first->data_block.allocated_bytes()); + } else { + local_state._shared_state->sub_mem_usage(channel_id, + block->data_block.allocated_bytes()); + data_block->swap(block->data_block); + block->unref(local_state._shared_state, data_block->allocated_bytes()); + } + return true; + } else if (all_finished) { + *eos = true; + } else { + std::unique_lock l(_m); + if (_data_queue[channel_id].try_dequeue(block)) { + if constexpr (std::is_same_v || + std::is_same_v) { + local_state._shared_state->sub_mem_usage(channel_id, + block.first->data_block.allocated_bytes()); + } else { + local_state._shared_state->sub_mem_usage(channel_id, + block->data_block.allocated_bytes()); + data_block->swap(block->data_block); + block->unref(local_state._shared_state, data_block->allocated_bytes()); + } + return true; + } + COUNTER_UPDATE(local_state._get_block_failed_counter, 1); + local_state._dependency->block(); + } + return false; +} + Status ShuffleExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, LocalExchangeSinkLocalState& local_state) { { @@ -45,12 +126,11 @@ Status ShuffleExchanger::sink(RuntimeState* state, vectorized::Block* in_block, void ShuffleExchanger::close(LocalExchangeSourceLocalState& local_state) { PartitionedBlock partitioned_block; + bool eos; + vectorized::Block block; _data_queue[local_state._channel_id].set_eos(); - while (_data_queue[local_state._channel_id].try_dequeue(partitioned_block)) { - auto block_wrapper = partitioned_block.first; - local_state._shared_state->sub_mem_usage( - local_state._channel_id, block_wrapper->data_block.allocated_bytes(), false); - block_wrapper->unref(local_state._shared_state); + while (_dequeue_data(local_state, partitioned_block, &eos, &block)) { + partitioned_block.first->unref(local_state._shared_state); } } @@ -59,32 +139,24 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block PartitionedBlock partitioned_block; vectorized::MutableBlock mutable_block; - auto get_data = [&](vectorized::Block* result_block) -> Status { + auto get_data = [&]() -> 
Status { do { const auto* offset_start = partitioned_block.second.row_idxs->data() + partitioned_block.second.offset_start; auto block_wrapper = partitioned_block.first; - local_state._shared_state->sub_mem_usage( - local_state._channel_id, block_wrapper->data_block.allocated_bytes(), false); RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->data_block, offset_start, offset_start + partitioned_block.second.length)); block_wrapper->unref(local_state._shared_state); - } while (mutable_block.rows() < state->batch_size() && - _data_queue[local_state._channel_id].try_dequeue(partitioned_block)); + } while (mutable_block.rows() < state->batch_size() && !*eos && + _dequeue_data(local_state, partitioned_block, eos, block)); return Status::OK(); }; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(partitioned_block)) { + if (_dequeue_data(local_state, partitioned_block, eos, block)) { SCOPED_TIMER(local_state._copy_data_timer); mutable_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->data_block); - RETURN_IF_ERROR(get_data(block)); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); + RETURN_IF_ERROR(get_data()); } return Status::OK(); } @@ -92,7 +164,6 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, bool eos, LocalExchangeSinkLocalState& local_state) { - auto& data_queue = _data_queue; const auto rows = block->rows(); auto row_idx = std::make_shared>(rows); { @@ -111,11 +182,11 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest } vectorized::Block data_block; - std::shared_ptr<ShuffleBlockWrapper> new_block_wrapper; + std::shared_ptr<BlockWrapper> new_block_wrapper; if (_free_blocks.try_dequeue(data_block)) { - new_block_wrapper = ShuffleBlockWrapper::create_shared(std::move(data_block)); + new_block_wrapper = BlockWrapper::create_shared(std::move(data_block)); } else { - new_block_wrapper = ShuffleBlockWrapper::create_shared(block->clone_empty()); + new_block_wrapper = BlockWrapper::create_shared(block->clone_empty()); } new_block_wrapper->data_block.swap(*block); @@ -123,7 +194,15 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest return Status::OK(); } local_state._shared_state->add_total_mem_usage(new_block_wrapper->data_block.allocated_bytes()); + auto bucket_seq_to_instance_idx = + local_state._parent->cast<LocalExchangeSinkOperatorX>()._bucket_seq_to_instance_idx; if (get_type() == ExchangeType::HASH_SHUFFLE) { + /** + * If type is `HASH_SHUFFLE`, data are hash-shuffled and distributed to all instances of + * all BEs. So we need a shuffleId-to-instanceId mapping. + * For example, row 1 gets hash value 1, which means it should be distributed to instance 1 + * on BE 1, and row 2 gets hash value 2, which means it should be distributed to instance 1 + * on BE 3. + */
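The loop that follows implements exactly this mapping: it walks `_shuffle_idx_to_instance_idx` and forwards each non-empty row range to the mapped instance's queue. A compact sketch of the fan-out under those assumptions (an illustrative free function; the patch does this inline in `_split_rows`):

    #include <cstdint>
    #include <functional>
    #include <map>
    #include <vector>

    // Sketch of the shuffle-idx -> instance-idx fan-out: forward each non-empty
    // row range to the mapped instance's queue. The enqueue callback stands in
    // for _enqueue_data_and_set_ready(); names are illustrative.
    void fan_out(const std::map<int, int>& shuffle_idx_to_instance_idx,
                 const std::vector<uint32_t>& partition_rows_histogram,
                 const std::function<void(int, uint32_t, uint32_t)>& enqueue) {
        for (const auto& [shuffle_idx, instance_idx] : shuffle_idx_to_instance_idx) {
            uint32_t start = partition_rows_histogram[shuffle_idx];
            uint32_t size = partition_rows_histogram[shuffle_idx + 1] - start;
            if (size > 0) {
                enqueue(instance_idx, start, size);
            }
        }
    }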
const auto& map = local_state._parent->cast<LocalExchangeSinkOperatorX>() ._shuffle_idx_to_instance_idx; new_block_wrapper->ref(map.size()); for (const auto& it : map) { DCHECK(it.second >= 0 && it.second < _num_partitions) << it.first << " : " << it.second << " " << _num_partitions; uint32_t start = local_state._partition_rows_histogram[it.first]; uint32_t size = local_state._partition_rows_histogram[it.first + 1] - start; if (size > 0) { - local_state._shared_state->add_mem_usage( - it.second, new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[it.second].enqueue({new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(it.second); - } else { - local_state._shared_state->sub_mem_usage( - it.second, new_block_wrapper->data_block.allocated_bytes(), false); - new_block_wrapper->unref(local_state._shared_state); - } + _enqueue_data_and_set_ready(it.second, local_state, + {new_block_wrapper, {row_idx, start, size}}); } else { new_block_wrapper->unref(local_state._shared_state); } } } else if (_num_senders != _num_sources || _ignore_source_data_distribution) { + // In this branch, data should simply be distributed equally across all instances. new_block_wrapper->ref(_num_partitions); for (size_t i = 0; i < _num_partitions; i++) { uint32_t start = local_state._partition_rows_histogram[i]; uint32_t size = local_state._partition_rows_histogram[i + 1] - start; if (size > 0) { - local_state._shared_state->add_mem_usage( - i % _num_sources, new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[i % _num_sources].enqueue( - {new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(i % _num_sources); - } else { - local_state._shared_state->sub_mem_usage( - i % _num_sources, new_block_wrapper->data_block.allocated_bytes(), - false); - new_block_wrapper->unref(local_state._shared_state); - } + _enqueue_data_and_set_ready(i % _num_sources, local_state, + {new_block_wrapper, {row_idx, start, size}}); + } else { + new_block_wrapper->unref(local_state._shared_state); + } + } + } else if (bucket_seq_to_instance_idx.empty()) { + /** + * If type is `BUCKET_HASH_SHUFFLE` and `_bucket_seq_to_instance_idx` is empty, which + * means no scan operator is included in this fragment, so we also need a `HASH_SHUFFLE` here.
+ */ + const auto& map = local_state._parent->cast() + ._shuffle_idx_to_instance_idx; + DCHECK(!map.empty()); + new_block_wrapper->ref(map.size()); + for (const auto& it : map) { + DCHECK(it.second >= 0 && it.second < _num_partitions) + << it.first << " : " << it.second << " " << _num_partitions; + uint32_t start = local_state._partition_rows_histogram[it.first]; + uint32_t size = local_state._partition_rows_histogram[it.first + 1] - start; + if (size > 0) { + _enqueue_data_and_set_ready(it.second, local_state, + {new_block_wrapper, {row_idx, start, size}}); } else { new_block_wrapper->unref(local_state._shared_state); } } } else { new_block_wrapper->ref(_num_partitions); - auto map = - local_state._parent->cast()._bucket_seq_to_instance_idx; for (size_t i = 0; i < _num_partitions; i++) { uint32_t start = local_state._partition_rows_histogram[i]; uint32_t size = local_state._partition_rows_histogram[i + 1] - start; if (size > 0) { - local_state._shared_state->add_mem_usage( - map[i], new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[map[i]].enqueue({new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(map[i]); - } else { - local_state._shared_state->sub_mem_usage( - map[i], new_block_wrapper->data_block.allocated_bytes(), false); - new_block_wrapper->unref(local_state._shared_state); - } + _enqueue_data_and_set_ready(bucket_seq_to_instance_idx[i], local_state, + {new_block_wrapper, {row_idx, start, size}}); } else { new_block_wrapper->unref(local_state._shared_state); } @@ -196,48 +272,42 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest Status PassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, LocalExchangeSinkLocalState& local_state) { vectorized::Block new_block; + BlockWrapperSPtr wrapper; if (!_free_blocks.try_dequeue(new_block)) { new_block = {in_block->clone_empty()}; } new_block.swap(*in_block); + wrapper = BlockWrapper::create_shared(std::move(new_block)); auto channel_id = (local_state._channel_id++) % _num_partitions; - size_t memory_usage = new_block.allocated_bytes(); - local_state._shared_state->add_mem_usage(channel_id, memory_usage); - if (_data_queue[channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(channel_id); - } else { - local_state._shared_state->sub_mem_usage(channel_id, memory_usage); - } + _enqueue_data_and_set_ready(channel_id, local_state, std::move(wrapper)); return Status::OK(); } void PassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; + BlockWrapperSPtr wrapper; + bool eos; _data_queue[local_state._channel_id].set_eos(); - while (_data_queue[local_state._channel_id].try_dequeue(next_block)) { - local_state._shared_state->sub_mem_usage(local_state._channel_id, - next_block.allocated_bytes()); + while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + // do nothing } } -Status PassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { +void PassToOneExchanger::close(LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { - block->swap(next_block); - local_state._shared_state->sub_mem_usage(local_state._channel_id, block->allocated_bytes()); - if (_free_block_limit == 0 || - _free_blocks.size_approx() < 
_free_block_limit * _num_sources) { - _free_blocks.enqueue(std::move(next_block)); - } - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); + BlockWrapperSPtr wrapper; + bool eos; + _data_queue[local_state._channel_id].set_eos(); + while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + // do nothing } +} + +Status PassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, + LocalExchangeSourceLocalState& local_state) { + BlockWrapperSPtr next_block; + _dequeue_data(local_state, next_block, eos, block); return Status::OK(); } @@ -245,9 +315,9 @@ Status PassToOneExchanger::sink(RuntimeState* state, vectorized::Block* in_block LocalExchangeSinkLocalState& local_state) { vectorized::Block new_block(in_block->clone_empty()); new_block.swap(*in_block); - if (_data_queue[0].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(0); - } + + BlockWrapperSPtr wrapper = BlockWrapper::create_shared(std::move(new_block)); + _enqueue_data_and_set_ready(0, local_state, std::move(wrapper)); return Status::OK(); } @@ -258,16 +328,8 @@ Status PassToOneExchanger::get_block(RuntimeState* state, vectorized::Block* blo *eos = true; return Status::OK(); } - vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[0].try_dequeue(next_block)) { - *block = std::move(next_block); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); - } + BlockWrapperSPtr next_block; + _dequeue_data(local_state, next_block, eos, block); return Status::OK(); } @@ -277,23 +339,39 @@ Status LocalMergeSortExchanger::sink(RuntimeState* state, vectorized::Block* in_ if (!_free_blocks.try_dequeue(new_block)) { new_block = {in_block->clone_empty()}; } - new_block.swap(*in_block); DCHECK_LE(local_state._channel_id, _data_queue.size()); - size_t memory_usage = new_block.allocated_bytes(); - add_mem_usage(local_state, memory_usage); - - if (_data_queue[local_state._channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(0); - } else { - sub_mem_usage(local_state, memory_usage); - } + new_block.swap(*in_block); + _enqueue_data_and_set_ready(local_state._channel_id, local_state, + BlockWrapper::create_shared(std::move(new_block))); if (eos) { - _queue_deps[local_state._channel_id]->set_always_ready(); + local_state._shared_state->source_deps[local_state._channel_id]->set_always_ready(); } return Status::OK(); } +void ExchangerBase::finalize(LocalExchangeSourceLocalState& local_state) { + DCHECK(_running_source_operators == 0); + vectorized::Block block; + while (_free_blocks.try_dequeue(block)) { + // do nothing + } +} +void LocalMergeSortExchanger::finalize(LocalExchangeSourceLocalState& local_state) { + BlockWrapperSPtr next_block; + vectorized::Block block; + bool eos; + int id = 0; + for (auto& data_queue : _data_queue) { + data_queue.set_eos(); + while (_dequeue_data(local_state, next_block, &eos, &block, id)) { + block = vectorized::Block(); + } + id++; + } + ExchangerBase::finalize(local_state); +} + Status LocalMergeSortExchanger::build_merger(RuntimeState* state, LocalExchangeSourceLocalState& local_state) { RETURN_IF_ERROR(_sort_source->build_merger(state, _merger, local_state.profile())); @@ -301,18 +379,8 @@ Status LocalMergeSortExchanger::build_merger(RuntimeState* state, for (int channel_id = 0; 
channel_id < _num_partitions; channel_id++) { vectorized::BlockSupplier block_supplier = [&, id = channel_id](vectorized::Block* block, bool* eos) { - vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[id].try_dequeue(next_block)) { - block->swap(next_block); - if (_free_block_limit == 0 || - _free_blocks.size_approx() < _free_block_limit * _num_sources) { - _free_blocks.enqueue(std::move(next_block)); - } - sub_mem_usage(local_state, id, block->allocated_bytes()); - } else if (all_finished) { - *eos = true; - } + BlockWrapperSPtr next_block; + _dequeue_data(local_state, next_block, eos, block, id); return Status::OK(); }; child_block_suppliers.push_back(block_supplier); @@ -346,90 +414,52 @@ Status LocalMergeSortExchanger::get_block(RuntimeState* state, vectorized::Block return Status::OK(); } -void LocalMergeSortExchanger::sub_mem_usage(LocalExchangeSinkLocalState& local_state, - int64_t delta) { - const auto channel_id = local_state._channel_id; - local_state._shared_state->mem_trackers[channel_id]->release(delta); - if (_queues_mem_usege[channel_id].fetch_sub(delta) > _each_queue_limit) { - _sink_deps[channel_id]->set_ready(); - } - // if queue empty , block this queue - if (_queues_mem_usege[channel_id] == 0) { - _queue_deps[channel_id]->block(); - } -} - -void LocalMergeSortExchanger::add_mem_usage(LocalExchangeSinkLocalState& local_state, - int64_t delta) { - const auto channel_id = local_state._channel_id; - local_state._shared_state->mem_trackers[channel_id]->consume(delta); - if (_queues_mem_usege[channel_id].fetch_add(delta) > _each_queue_limit) { - _sink_deps[channel_id]->block(); - } - _queue_deps[channel_id]->set_ready(); -} - -void LocalMergeSortExchanger::sub_mem_usage(LocalExchangeSourceLocalState& local_state, - int channel_id, int64_t delta) { - local_state._shared_state->mem_trackers[channel_id]->release(delta); - if (_queues_mem_usege[channel_id].fetch_sub(delta) <= _each_queue_limit) { - _sink_deps[channel_id]->set_ready(); - } - // if queue empty , block this queue - if (_queues_mem_usege[channel_id] == 0) { - _queue_deps[channel_id]->block(); - } -} - -std::vector LocalMergeSortExchanger::local_sink_state_dependency(int channel_id) { - DCHECK(_sink_deps[channel_id]); - return {_sink_deps[channel_id].get()}; -} - -std::vector LocalMergeSortExchanger::local_state_dependency(int channel_id) { - if (channel_id != 0) { - return {}; - } - std::vector deps; - for (auto depSptr : _queue_deps) { - deps.push_back(depSptr.get()); - } - return deps; -} - Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, LocalExchangeSinkLocalState& local_state) { + if (in_block->empty()) { + return Status::OK(); + } + vectorized::Block new_block; + if (!_free_blocks.try_dequeue(new_block)) { + new_block = {in_block->clone_empty()}; + } + new_block.swap(*in_block); + auto wrapper = BlockWrapper::create_shared(std::move(new_block)); + local_state._shared_state->add_total_mem_usage(wrapper->data_block.allocated_bytes()); + wrapper->ref(_num_partitions); for (size_t i = 0; i < _num_partitions; i++) { - auto mutable_block = vectorized::MutableBlock::create_unique(in_block->clone_empty()); - RETURN_IF_ERROR(mutable_block->add_rows(in_block, 0, in_block->rows())); - if (_data_queue[i].enqueue(mutable_block->to_block())) { - local_state._shared_state->set_ready_to_read(i); - } + _enqueue_data_and_set_ready(i, local_state, {wrapper, {0, wrapper->data_block.rows()}}); } return Status::OK(); } void 
BroadcastExchanger::close(LocalExchangeSourceLocalState& local_state) { - vectorized::Block next_block; + BroadcastBlock partitioned_block; + bool eos; + vectorized::Block block; _data_queue[local_state._channel_id].set_eos(); - while (_data_queue[local_state._channel_id].try_dequeue(next_block)) { - // do nothing + while (_dequeue_data(local_state, partitioned_block, &eos, &block)) { + partitioned_block.first->unref(local_state._shared_state); } } Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, LocalExchangeSourceLocalState& local_state) { - vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { - *block = std::move(next_block); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); + BroadcastBlock partitioned_block; + + if (_dequeue_data(local_state, partitioned_block, eos, block)) { + SCOPED_TIMER(local_state._copy_data_timer); + vectorized::MutableBlock mutable_block = + vectorized::VectorizedUtils::build_mutable_mem_reuse_block( + block, partitioned_block.first->data_block); + auto block_wrapper = partitioned_block.first; + RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->data_block, + partitioned_block.second.offset_start, + partitioned_block.second.length)); + block_wrapper->unref(local_state._shared_state); } + return Status::OK(); } @@ -442,13 +472,8 @@ Status AdaptivePassthroughExchanger::_passthrough_sink(RuntimeState* state, } new_block.swap(*in_block); auto channel_id = (local_state._channel_id++) % _num_partitions; - size_t memory_usage = new_block.allocated_bytes(); - local_state._shared_state->add_mem_usage(channel_id, memory_usage); - if (_data_queue[channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(channel_id); - } else { - local_state._shared_state->sub_mem_usage(channel_id, memory_usage); - } + _enqueue_data_and_set_ready(channel_id, local_state, + BlockWrapper::create_shared(std::move(new_block))); return Status::OK(); } @@ -477,7 +502,6 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, bool eos, LocalExchangeSinkLocalState& local_state) { - auto& data_queue = _data_queue; const auto rows = block->rows(); auto row_idx = std::make_shared>(rows); { @@ -504,13 +528,8 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, RETURN_IF_ERROR(mutable_block->add_rows(block, start, size)); auto new_block = mutable_block->to_block(); - size_t memory_usage = new_block.allocated_bytes(); - local_state._shared_state->add_mem_usage(i, memory_usage); - if (data_queue[i].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(i); - } else { - local_state._shared_state->sub_mem_usage(i, memory_usage); - } + _enqueue_data_and_set_ready(i, local_state, + BlockWrapper::create_shared(std::move(new_block))); } } return Status::OK(); @@ -531,22 +550,19 @@ Status AdaptivePassthroughExchanger::sink(RuntimeState* state, vectorized::Block Status AdaptivePassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, LocalExchangeSourceLocalState& local_state) { + BlockWrapperSPtr next_block; + _dequeue_data(local_state, next_block, eos, block); + return Status::OK(); +} + +void AdaptivePassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { 
vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { - block->swap(next_block); - if (_free_block_limit == 0 || - _free_blocks.size_approx() < _free_block_limit * _num_sources) { - _free_blocks.enqueue(std::move(next_block)); - } - local_state._shared_state->sub_mem_usage(local_state._channel_id, block->allocated_bytes()); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); + bool eos; + BlockWrapperSPtr wrapper; + _data_queue[local_state._channel_id].set_eos(); + while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + // do nothing } - return Status::OK(); } } // namespace doris::pipeline diff --git a/be/src/pipeline/local_exchange/local_exchanger.h b/be/src/pipeline/local_exchange/local_exchanger.h index 2c4f8f5b78509e4..72c0a0ed3a6c703 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.h +++ b/be/src/pipeline/local_exchange/local_exchanger.h @@ -24,9 +24,22 @@ namespace doris::pipeline { class LocalExchangeSourceLocalState; class LocalExchangeSinkLocalState; -struct ShuffleBlockWrapper; +struct BlockWrapper; class SortSourceOperatorX; +/** + * One exchanger is held by one `LocalExchangeSharedState`, and one `LocalExchangeSharedState` is + * shared by all local exchange sink operators and source operators with the same id. + * + * An exchanger maintains two kinds of block queues: data block queues and one free block queue. + * + * In detail, there are as many data block queues as source operators. Each source operator gets + * data blocks from its corresponding queue, and data blocks are pushed into the queues by sink + * operators. One sink operator may push blocks into one or more queues. + * + * Free blocks are used to reuse allocated memory. To bound memory usage, we also use a config + * value to limit the size of the free block queue. + */ class ExchangerBase { public: ExchangerBase(int running_sink_operators, int num_partitions, int free_block_limit) @@ -50,14 +63,17 @@ class ExchangerBase { virtual Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, LocalExchangeSinkLocalState& local_state) = 0; virtual ExchangeType get_type() const = 0; + // Called when a local exchange source operator is closed. Frees the unused data blocks in data_queue. virtual void close(LocalExchangeSourceLocalState& local_state) = 0;
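The free-queue cap this header describes shows up in both the exchanger and `BlockWrapper`: a drained block is recycled only while the pool holds fewer than `_free_block_limit * _num_sources` entries, with a limit of 0 meaning no cap. A minimal sketch of that admission test (a hypothetical helper, not part of the patch):

    #include <cstddef>

    // Sketch of the free-queue admission test: a block is recycled only while the
    // pool holds fewer than free_block_limit * num_sources entries; a limit of 0
    // means "no cap". Illustrative helper, not the patch's code.
    bool should_recycle(size_t pool_size_approx, int free_block_limit, int num_sources) {
        return free_block_limit == 0 ||
               pool_size_approx < static_cast<size_t>(free_block_limit) * num_sources;
    }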
+ // Called after all local exchange source operators are closed. We free the memory in + // `_free_blocks` here. + virtual void finalize(LocalExchangeSourceLocalState& local_state); - virtual std::vector<Dependency*> local_sink_state_dependency(int channel_id) { return {}; } - virtual std::vector<Dependency*> local_state_dependency(int channel_id) { return {}; } + virtual std::string data_queue_debug_string(int i) = 0; protected: friend struct LocalExchangeSharedState; - friend struct ShuffleBlockWrapper; + friend struct BlockWrapper; friend class LocalExchangeSourceLocalState; friend class LocalExchangeSinkOperatorX; friend class LocalExchangeSinkLocalState; @@ -76,7 +92,13 @@ struct PartitionedRowIdxs { uint32_t length; }; -using PartitionedBlock = std::pair<std::shared_ptr<ShuffleBlockWrapper>, PartitionedRowIdxs>; +using PartitionedBlock = std::pair<std::shared_ptr<BlockWrapper>, PartitionedRowIdxs>; + +struct RowRange { + uint32_t offset_start; + size_t length; +}; +using BroadcastBlock = std::pair<std::shared_ptr<BlockWrapper>, RowRange>; template <typename BlockType> struct BlockQueue { @@ -106,6 +128,8 @@ struct BlockQueue { void set_eos() { eos = true; } }; +using BlockWrapperSPtr = std::shared_ptr<BlockWrapper>; + template <typename BlockType> class Exchanger : public ExchangerBase { public: @@ -115,18 +139,55 @@ class Exchanger : public ExchangerBase { : ExchangerBase(running_sink_operators, num_sources, num_partitions, free_block_limit) { } ~Exchanger() override = default; + std::string data_queue_debug_string(int i) override { + return fmt::format("Data Queue {}: [size approx = {}, eos = {}]", i, + _data_queue[i].data_queue.size_approx(), _data_queue[i].eos); + } protected: + // Enqueue a data block and mark the downstream source operator ready to read. + void _enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState& local_state, + BlockType&& block); + bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos, + vectorized::Block* data_block); + bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos, + vectorized::Block* data_block, int channel_id); std::vector<BlockQueue<BlockType>> _data_queue; + +private: + std::mutex _m; }; class LocalExchangeSourceLocalState; class LocalExchangeSinkLocalState; -struct ShuffleBlockWrapper { - ENABLE_FACTORY_CREATOR(ShuffleBlockWrapper); - ShuffleBlockWrapper(vectorized::Block&& data_block_) : data_block(std::move(data_block_)) {} +/** + * `BlockWrapper` is used to wrap a data block with a reference count. + * + * In `unref()`, if `ref_count` drops to 0, the block is no longer needed by any operator, so we + * put it into `_free_blocks` to reuse its memory if needed and refresh the memory usage of the + * current queue. + * + * Note: `ref_count` will be larger than 1 only if this block is shared between multiple queues in + * the shuffle exchanger.
+ */ +struct BlockWrapper { + ENABLE_FACTORY_CREATOR(BlockWrapper); + BlockWrapper(vectorized::Block&& data_block_) : data_block(std::move(data_block_)) {} + ~BlockWrapper() { DCHECK_EQ(ref_count.load(), 0); } void ref(int delta) { ref_count += delta; } + void unref(LocalExchangeSharedState* shared_state, size_t allocated_bytes) { + if (ref_count.fetch_sub(1) == 1) { + shared_state->sub_total_mem_usage(allocated_bytes); + if (shared_state->exchanger->_free_block_limit == 0 || + shared_state->exchanger->_free_blocks.size_approx() < + shared_state->exchanger->_free_block_limit * + shared_state->exchanger->_num_sources) { + data_block.clear_column_data(); + shared_state->exchanger->_free_blocks.enqueue(std::move(data_block)); + } + } + } void unref(LocalExchangeSharedState* shared_state) { if (ref_count.fetch_sub(1) == 1) { shared_state->sub_total_mem_usage(data_block.allocated_bytes()); @@ -185,12 +246,12 @@ class BucketShuffleExchanger final : public ShuffleExchanger { ExchangeType get_type() const override { return ExchangeType::BUCKET_HASH_SHUFFLE; } }; -class PassthroughExchanger final : public Exchanger { +class PassthroughExchanger final : public Exchanger { public: ENABLE_FACTORY_CREATOR(PassthroughExchanger); PassthroughExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit) { + : Exchanger(running_sink_operators, num_partitions, + free_block_limit) { _data_queue.resize(num_partitions); } ~PassthroughExchanger() override = default; @@ -203,12 +264,12 @@ class PassthroughExchanger final : public Exchanger { void close(LocalExchangeSourceLocalState& local_state) override; }; -class PassToOneExchanger final : public Exchanger { +class PassToOneExchanger final : public Exchanger { public: ENABLE_FACTORY_CREATOR(PassToOneExchanger); PassToOneExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit) { + : Exchanger(running_sink_operators, num_partitions, + free_block_limit) { _data_queue.resize(num_partitions); } ~PassToOneExchanger() override = default; @@ -218,28 +279,17 @@ class PassToOneExchanger final : public Exchanger { Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, LocalExchangeSourceLocalState& local_state) override; ExchangeType get_type() const override { return ExchangeType::PASS_TO_ONE; } - void close(LocalExchangeSourceLocalState& local_state) override {} + void close(LocalExchangeSourceLocalState& local_state) override; }; -class LocalMergeSortExchanger final : public Exchanger { +class LocalMergeSortExchanger final : public Exchanger { public: ENABLE_FACTORY_CREATOR(LocalMergeSortExchanger); LocalMergeSortExchanger(std::shared_ptr sort_source, int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit), - _sort_source(std::move(sort_source)), - _queues_mem_usege(num_partitions), - _each_queue_limit(config::local_exchange_buffer_mem_limit / num_partitions) { + : Exchanger(running_sink_operators, num_partitions, free_block_limit), + _sort_source(std::move(sort_source)) { _data_queue.resize(num_partitions); - for (size_t i = 0; i < num_partitions; i++) { - _queues_mem_usege[i] = 0; - _sink_deps.push_back( - std::make_shared(0, 0, "LOCAL_MERGE_SORT_SINK_DEPENDENCY", true)); - _queue_deps.push_back( - std::make_shared(0, 0, "LOCAL_MERGE_SORT_QUEUE_DEPENDENCY")); - 
_queue_deps.back()->block(); - } } ~LocalMergeSortExchanger() override = default; Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, @@ -251,33 +301,20 @@ class LocalMergeSortExchanger final : public Exchanger { Status build_merger(RuntimeState* statem, LocalExchangeSourceLocalState& local_state); - std::vector local_sink_state_dependency(int channel_id) override; - - std::vector local_state_dependency(int channel_id) override; - - void add_mem_usage(LocalExchangeSinkLocalState& local_state, int64_t delta); - void sub_mem_usage(LocalExchangeSinkLocalState& local_state, int64_t delta); - void sub_mem_usage(LocalExchangeSourceLocalState& local_state, int channel_id, int64_t delta); void close(LocalExchangeSourceLocalState& local_state) override {} + void finalize(LocalExchangeSourceLocalState& local_state) override; private: - // only channel_id = 0 , build _merger and use it - std::unique_ptr _merger; std::shared_ptr _sort_source; - std::vector _sink_deps; std::vector _queues_mem_usege; - // if cur queue is empty, block this queue - std::vector _queue_deps; - const int64_t _each_queue_limit; }; -class BroadcastExchanger final : public Exchanger { +class BroadcastExchanger final : public Exchanger { public: ENABLE_FACTORY_CREATOR(BroadcastExchanger); BroadcastExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit) { + : Exchanger(running_sink_operators, num_partitions, free_block_limit) { _data_queue.resize(num_partitions); } ~BroadcastExchanger() override = default; @@ -292,13 +329,13 @@ class BroadcastExchanger final : public Exchanger { //The code in AdaptivePassthroughExchanger is essentially // a copy of ShuffleExchanger and PassthroughExchanger. 
-class AdaptivePassthroughExchanger : public Exchanger { +class AdaptivePassthroughExchanger : public Exchanger { public: ENABLE_FACTORY_CREATOR(AdaptivePassthroughExchanger); AdaptivePassthroughExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit) { + : Exchanger(running_sink_operators, num_partitions, + free_block_limit) { _data_queue.resize(num_partitions); } Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, @@ -308,7 +345,7 @@ class AdaptivePassthroughExchanger : public Exchanger { LocalExchangeSourceLocalState& local_state) override; ExchangeType get_type() const override { return ExchangeType::ADAPTIVE_PASSTHROUGH; } - void close(LocalExchangeSourceLocalState& local_state) override {} + void close(LocalExchangeSourceLocalState& local_state) override; private: Status _passthrough_sink(RuntimeState* state, vectorized::Block* in_block, bool eos, diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index 45f49fc09b94b34..c3942e8286e8e31 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -176,6 +176,12 @@ void PipelineFragmentContext::cancel(const Status reason) { if (reason.is()) { LOG(WARNING) << "PipelineFragmentContext is cancelled due to timeout : " << debug_string(); } + + if (reason.is()) { + LOG_WARNING("PipelineFragmentContext is cancelled due to illegal state : {}", + this->debug_string()); + } + _query_ctx->cancel(reason, _fragment_id); if (reason.is()) { _is_report_on_cancel = false; @@ -392,7 +398,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks( runtime_state->set_total_load_streams(request.total_load_streams); runtime_state->set_num_local_sink(request.num_local_sink); DCHECK(runtime_filter_mgr); - runtime_state->set_pipeline_x_runtime_filter_mgr(runtime_filter_mgr.get()); + runtime_state->set_runtime_filter_mgr(runtime_filter_mgr.get()); }; auto filterparams = std::make_unique(); @@ -697,7 +703,10 @@ Status PipelineFragmentContext::_add_local_exchange_impl( is_shuffled_hash_join, shuffle_idx_to_instance_idx)); // 2. Create and initialize LocalExchangeSharedState. - auto shared_state = LocalExchangeSharedState::create_shared(_num_instances); + std::shared_ptr shared_state = + data_distribution.distribution_type == ExchangeType::LOCAL_MERGE_SORT + ? LocalMergeExchangeSharedState::create_shared(_num_instances) + : LocalExchangeSharedState::create_shared(_num_instances); switch (data_distribution.distribution_type) { case ExchangeType::HASH_SHUFFLE: shared_state->exchanger = ShuffleExchanger::create_unique( @@ -730,11 +739,20 @@ Status PipelineFragmentContext::_add_local_exchange_impl( : 0); break; case ExchangeType::PASS_TO_ONE: - shared_state->exchanger = BroadcastExchanger::create_unique( - cur_pipe->num_tasks(), _num_instances, - _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit - : 0); + if (_runtime_state->enable_share_hash_table_for_broadcast_join()) { + // If shared hash table is enabled for BJ, hash table will be built by only one task + shared_state->exchanger = PassToOneExchanger::create_unique( + cur_pipe->num_tasks(), _num_instances, + _runtime_state->query_options().__isset.local_exchange_free_blocks_limit + ? 
_runtime_state->query_options().local_exchange_free_blocks_limit + : 0); + } else { + shared_state->exchanger = BroadcastExchanger::create_unique( + cur_pipe->num_tasks(), _num_instances, + _runtime_state->query_options().__isset.local_exchange_free_blocks_limit + ? _runtime_state->query_options().local_exchange_free_blocks_limit + : 0); + } break; case ExchangeType::LOCAL_MERGE_SORT: { auto child_op = cur_pipe->sink_x()->child_x(); @@ -788,7 +806,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( } operator_xs.insert(operator_xs.begin(), source_op); - shared_state->create_source_dependencies(source_op->operator_id(), source_op->node_id()); + shared_state->create_dependencies(local_exchange_id); // 5. Set children for two pipelines separately. std::vector> new_children; @@ -1470,7 +1488,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo break; } default: - return Status::InternalError("Unsupported exec type in pipelineX: {}", + return Status::InternalError("Unsupported exec type in pipeline: {}", print_plan_node_type(tnode.node_type)); } diff --git a/be/src/pipeline/pipeline_task.cpp b/be/src/pipeline/pipeline_task.cpp index b8a52575b976820..8692075622a9066 100644 --- a/be/src/pipeline/pipeline_task.cpp +++ b/be/src/pipeline/pipeline_task.cpp @@ -153,8 +153,6 @@ Status PipelineTask::_extract_dependencies() { { auto* local_state = _state->get_sink_local_state(); write_dependencies = local_state->dependencies(); - DCHECK(std::all_of(write_dependencies.begin(), write_dependencies.end(), - [](auto* dep) { return dep->is_write_dependency(); })); auto* fin_dep = local_state->finishdependency(); if (fin_dep) { finish_dependencies.push_back(fin_dep); @@ -304,6 +302,7 @@ Status PipelineTask::execute(bool* eos) { if (cpu_qs) { cpu_qs->add_cpu_nanos(delta_cpu_time); } + query_context()->update_wg_cpu_adder(delta_cpu_time); }}; if (_wait_to_start()) { return Status::OK(); @@ -407,10 +406,9 @@ bool PipelineTask::should_revoke_memory(RuntimeState* state, int64_t revocable_m } return false; } else if (is_wg_mem_low_water_mark) { - int64_t query_weighted_limit = 0; - int64_t query_weighted_consumption = 0; - query_ctx->get_weighted_memory(query_weighted_limit, query_weighted_consumption); - if (query_weighted_limit == 0 || query_weighted_consumption < query_weighted_limit) { + int64_t spill_threshold = query_ctx->spill_threshold(); + int64_t memory_usage = query_ctx->query_mem_tracker->consumption(); + if (spill_threshold == 0 || memory_usage < spill_threshold) { return false; } auto big_memory_operator_num = query_ctx->get_running_big_mem_op_num(); @@ -419,7 +417,7 @@ bool PipelineTask::should_revoke_memory(RuntimeState* state, int64_t revocable_m if (0 == big_memory_operator_num) { return false; } else { - mem_limit_of_op = query_weighted_limit / big_memory_operator_num; + mem_limit_of_op = spill_threshold / big_memory_operator_num; } LOG_EVERY_T(INFO, 1) << "query " << print_id(state->query_id()) @@ -428,10 +426,8 @@ bool PipelineTask::should_revoke_memory(RuntimeState* state, int64_t revocable_m << ", mem_limit_of_op: " << PrettyPrinter::print_bytes(mem_limit_of_op) << ", min_revocable_mem_bytes: " << PrettyPrinter::print_bytes(min_revocable_mem_bytes) - << ", query_weighted_consumption: " - << PrettyPrinter::print_bytes(query_weighted_consumption) - << ", query_weighted_limit: " - << PrettyPrinter::print_bytes(query_weighted_limit) + << ", memory_usage: " << PrettyPrinter::print_bytes(memory_usage) + << ", spill_threshold: " << 
PrettyPrinter::print_bytes(spill_threshold) << ", big_memory_operator_num: " << big_memory_operator_num; return (revocable_mem_bytes > mem_limit_of_op || revocable_mem_bytes > min_revocable_mem_bytes); diff --git a/be/src/pipeline/pipeline_task.h b/be/src/pipeline/pipeline_task.h index 63f464c03ad36c8..8fb4b4eb7992f5c 100644 --- a/be/src/pipeline/pipeline_task.h +++ b/be/src/pipeline/pipeline_task.h @@ -136,6 +136,7 @@ class PipelineTask { bool is_finalized() const { return _finalized; } void clear_blocking_state() { + _state->get_query_ctx()->get_execution_dependency()->set_always_ready(); // We use a lock to assure all dependencies are not deconstructed here. std::unique_lock lc(_dependency_lock); if (!_finalized) { diff --git a/be/src/pipeline/task_queue.cpp b/be/src/pipeline/task_queue.cpp index 24d711442404597..ea9fb09e260c0b2 100644 --- a/be/src/pipeline/task_queue.cpp +++ b/be/src/pipeline/task_queue.cpp @@ -131,37 +131,46 @@ Status PriorityTaskQueue::push(PipelineTask* task) { return Status::OK(); } -int PriorityTaskQueue::task_size() { - std::unique_lock lock(_work_size_mutex); - return _total_task_size; -} - MultiCoreTaskQueue::~MultiCoreTaskQueue() = default; -MultiCoreTaskQueue::MultiCoreTaskQueue(size_t core_size) : TaskQueue(core_size), _closed(false) { - _prio_task_queue_list = std::make_unique(core_size); +MultiCoreTaskQueue::MultiCoreTaskQueue(int core_size) : TaskQueue(core_size), _closed(false) { + _prio_task_queue_list = + std::make_shared>>(core_size); + for (int i = 0; i < core_size; i++) { + (*_prio_task_queue_list)[i] = std::make_unique(); + } } void MultiCoreTaskQueue::close() { + if (_closed) { + return; + } _closed = true; for (int i = 0; i < _core_size; ++i) { - _prio_task_queue_list[i].close(); + (*_prio_task_queue_list)[i]->close(); } + std::atomic_store(&_prio_task_queue_list, + std::shared_ptr>>(nullptr)); } -PipelineTask* MultiCoreTaskQueue::take(size_t core_id) { +PipelineTask* MultiCoreTaskQueue::take(int core_id) { PipelineTask* task = nullptr; + auto prio_task_queue_list = + std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); while (!_closed) { - task = _prio_task_queue_list[core_id].try_take(false); + DCHECK(prio_task_queue_list->size() > core_id) + << " list size: " << prio_task_queue_list->size() << " core_id: " << core_id + << " _core_size: " << _core_size << " _next_core: " << _next_core.load(); + task = (*prio_task_queue_list)[core_id]->try_take(false); if (task) { task->set_core_id(core_id); break; } - task = _steal_take(core_id); + task = _steal_take(core_id, *prio_task_queue_list); if (task) { break; } - task = _prio_task_queue_list[core_id].take(WAIT_CORE_TASK_TIMEOUT_MS /* timeout_ms */); + task = (*prio_task_queue_list)[core_id]->take(WAIT_CORE_TASK_TIMEOUT_MS /* timeout_ms */); if (task) { task->set_core_id(core_id); break; @@ -173,16 +182,17 @@ PipelineTask* MultiCoreTaskQueue::take(size_t core_id) { return task; } -PipelineTask* MultiCoreTaskQueue::_steal_take(size_t core_id) { +PipelineTask* MultiCoreTaskQueue::_steal_take( + int core_id, std::vector>& prio_task_queue_list) { DCHECK(core_id < _core_size); - size_t next_id = core_id; - for (size_t i = 1; i < _core_size; ++i) { + int next_id = core_id; + for (int i = 1; i < _core_size; ++i) { ++next_id; if (next_id == _core_size) { next_id = 0; } DCHECK(next_id < _core_size); - auto task = _prio_task_queue_list[next_id].try_take(true); + auto task = prio_task_queue_list[next_id]->try_take(true); if (task) { task->set_core_id(next_id); return task; @@ -199,16 
+209,20 @@ Status MultiCoreTaskQueue::push_back(PipelineTask* task) { return push_back(task, core_id); } -Status MultiCoreTaskQueue::push_back(PipelineTask* task, size_t core_id) { +Status MultiCoreTaskQueue::push_back(PipelineTask* task, int core_id) { DCHECK(core_id < _core_size); task->put_in_runnable_queue(); - return _prio_task_queue_list[core_id].push(task); + auto prio_task_queue_list = + std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); + return (*prio_task_queue_list)[core_id]->push(task); } void MultiCoreTaskQueue::update_statistics(PipelineTask* task, int64_t time_spent) { task->inc_runtime_ns(time_spent); - _prio_task_queue_list[task->get_core_id()].inc_sub_queue_runtime(task->get_queue_level(), - time_spent); + auto prio_task_queue_list = + std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); + (*prio_task_queue_list)[task->get_core_id()]->inc_sub_queue_runtime(task->get_queue_level(), + time_spent); } } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/task_queue.h b/be/src/pipeline/task_queue.h index 74ed91875673c7c..e48deb517575db3 100644 --- a/be/src/pipeline/task_queue.h +++ b/be/src/pipeline/task_queue.h @@ -35,25 +35,25 @@ namespace doris::pipeline { class TaskQueue { public: - TaskQueue(size_t core_size) : _core_size(core_size) {} + TaskQueue(int core_size) : _core_size(core_size) {} virtual ~TaskQueue(); virtual void close() = 0; // Get the task by core id. // TODO: To think the logic is useful? - virtual PipelineTask* take(size_t core_id) = 0; + virtual PipelineTask* take(int core_id) = 0; // push from scheduler virtual Status push_back(PipelineTask* task) = 0; // push from worker - virtual Status push_back(PipelineTask* task, size_t core_id) = 0; + virtual Status push_back(PipelineTask* task, int core_id) = 0; virtual void update_statistics(PipelineTask* task, int64_t time_spent) {} int cores() const { return _core_size; } protected: - size_t _core_size; + int _core_size; static constexpr auto WAIT_CORE_TASK_TIMEOUT_MS = 100; }; @@ -103,8 +103,6 @@ class PriorityTaskQueue { _sub_queues[level].inc_runtime(runtime); } - int task_size(); - private: PipelineTask* _try_take_unprotected(bool is_steal); static constexpr auto LEVEL_QUEUE_TIME_FACTOR = 2; @@ -128,27 +126,28 @@ class PriorityTaskQueue { // Need consider NUMA architecture class MultiCoreTaskQueue : public TaskQueue { public: - explicit MultiCoreTaskQueue(size_t core_size); + explicit MultiCoreTaskQueue(int core_size); ~MultiCoreTaskQueue() override; void close() override; // Get the task by core id. 
- PipelineTask* take(size_t core_id) override; + PipelineTask* take(int core_id) override; // TODO combine these methods to `push_back(task, core_id = -1)` Status push_back(PipelineTask* task) override; - Status push_back(PipelineTask* task, size_t core_id) override; + Status push_back(PipelineTask* task, int core_id) override; void update_statistics(PipelineTask* task, int64_t time_spent) override; private: - PipelineTask* _steal_take(size_t core_id); + PipelineTask* _steal_take( + int core_id, std::vector>& prio_task_queue_list); - std::unique_ptr _prio_task_queue_list; - std::atomic _next_core = 0; + std::shared_ptr>> _prio_task_queue_list; + std::atomic _next_core = 0; std::atomic _closed; }; diff --git a/be/src/pipeline/task_scheduler.cpp b/be/src/pipeline/task_scheduler.cpp index 7a860440228ab4a..8be30773ee11f1a 100644 --- a/be/src/pipeline/task_scheduler.cpp +++ b/be/src/pipeline/task_scheduler.cpp @@ -52,13 +52,13 @@ TaskScheduler::~TaskScheduler() { Status TaskScheduler::start() { int cores = _task_queue->cores(); - // Must be mutil number of cpu cores RETURN_IF_ERROR(ThreadPoolBuilder(_name) .set_min_threads(cores) .set_max_threads(cores) .set_max_queue_size(0) .set_cgroup_cpu_ctl(_cgroup_cpu_ctl) .build(&_fix_thread_pool)); + LOG_INFO("TaskScheduler set cores").tag("size", cores); _markers.resize(cores, true); for (size_t i = 0; i < cores; ++i) { RETURN_IF_ERROR(_fix_thread_pool->submit_func([this, i] { _do_work(i); })); diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 3434d01a59e80cc..f751aeb5d822743 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -36,6 +36,13 @@ #include "runtime/frontend_info.h" // TODO(zhiqiang): find a way to remove this include header #include "util/threadpool.h" +namespace orc { +class MemoryPool; +} +namespace arrow { +class MemoryPool; +} + namespace doris { namespace vectorized { class VDataStreamMgr; @@ -173,6 +180,9 @@ class ExecEnv { std::shared_ptr segcompaction_mem_tracker() { return _segcompaction_mem_tracker; } + std::shared_ptr stream_load_pipe_tracker() { + return _stream_load_pipe_tracker; + } std::shared_ptr point_query_executor_mem_tracker() { return _point_query_executor_mem_tracker; } @@ -193,13 +203,12 @@ class ExecEnv { } ThreadPool* send_table_stats_thread_pool() { return _send_table_stats_thread_pool.get(); } ThreadPool* s3_file_upload_thread_pool() { return _s3_file_upload_thread_pool.get(); } - ThreadPool* send_report_thread_pool() { return _send_report_thread_pool.get(); } - ThreadPool* join_node_thread_pool() { return _join_node_thread_pool.get(); } ThreadPool* lazy_release_obj_pool() { return _lazy_release_obj_pool.get(); } ThreadPool* non_block_close_thread_pool(); + ThreadPool* s3_file_system_thread_pool() { return _s3_file_system_thread_pool.get(); } Status init_pipeline_task_scheduler(); - void init_file_cache_factory(); + void init_file_cache_factory(std::vector& cache_paths); io::FileCacheFactory* file_cache_factory() { return _file_cache_factory; } UserFunctionCache* user_function_cache() { return _user_function_cache; } FragmentMgr* fragment_mgr() { return _fragment_mgr; } @@ -211,6 +220,9 @@ class ExecEnv { BrpcClientCache* brpc_internal_client_cache() const { return _internal_client_cache; } + BrpcClientCache* brpc_streaming_client_cache() const { + return _streaming_client_cache; + } BrpcClientCache* brpc_function_client_cache() const { return _function_client_cache; } @@ -305,6 +317,9 @@ class ExecEnv { segment_v2::TmpFileDirs* get_tmp_file_dirs() { return 
_tmp_file_dirs.get(); } io::FDCache* file_cache_open_fd_cache() const { return _file_cache_open_fd_cache.get(); } + orc::MemoryPool* orc_memory_pool() { return _orc_memory_pool; } + arrow::MemoryPool* arrow_memory_pool() { return _arrow_memory_pool; } + private: ExecEnv(); @@ -347,6 +362,7 @@ class ExecEnv { std::shared_ptr _brpc_iobuf_block_memory_tracker; // Count the memory consumption of segment compaction tasks. std::shared_ptr _segcompaction_mem_tracker; + std::shared_ptr _stream_load_pipe_tracker; // Tracking memory may be shared between multiple queries. std::shared_ptr _point_query_executor_mem_tracker; @@ -364,13 +380,11 @@ class ExecEnv { std::unique_ptr _send_table_stats_thread_pool; // Threadpool used to upload local file to s3 std::unique_ptr _s3_file_upload_thread_pool; - // Pool used by fragment manager to send profile or status to FE coordinator - std::unique_ptr _send_report_thread_pool; // Pool used by join node to build hash table - std::unique_ptr _join_node_thread_pool; // Pool to use a new thread to release object std::unique_ptr _lazy_release_obj_pool; std::unique_ptr _non_block_close_thread_pool; + std::unique_ptr _s3_file_system_thread_pool; FragmentMgr* _fragment_mgr = nullptr; pipeline::TaskScheduler* _without_group_task_scheduler = nullptr; @@ -387,6 +401,7 @@ class ExecEnv { // TODO(zhiqiang): Do not use shared_ptr in exec_env, we can not control its life cycle. std::shared_ptr _new_load_stream_mgr; BrpcClientCache* _internal_client_cache = nullptr; + BrpcClientCache* _streaming_client_cache = nullptr; BrpcClientCache* _function_client_cache = nullptr; std::shared_ptr _stream_load_executor; @@ -435,6 +450,9 @@ class ExecEnv { std::unique_ptr _pipeline_tracer_ctx; std::unique_ptr _tmp_file_dirs; doris::vectorized::SpillStreamManager* _spill_stream_mgr = nullptr; + + orc::MemoryPool* _orc_memory_pool = nullptr; + arrow::MemoryPool* _arrow_memory_pool = nullptr; }; template <> diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 32fbc4e0af415c4..53fe1993139cb0a 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -44,6 +44,7 @@ #include "io/cache/block_file_cache_factory.h" #include "io/cache/fs_file_cache_storage.h" #include "io/fs/file_meta_cache.h" +#include "io/fs/local_file_reader.h" #include "olap/memtable_memory_limiter.h" #include "olap/olap_define.h" #include "olap/options.h" @@ -100,6 +101,8 @@ #include "util/threadpool.h" #include "util/thrift_rpc_helper.h" #include "util/timezone_utils.h" +#include "vec/exec/format/orc/orc_memory_pool.h" +#include "vec/exec/format/parquet/arrow_memory_pool.h" #include "vec/exec/scan/scanner_scheduler.h" #include "vec/runtime/vdata_stream_mgr.h" #include "vec/sink/delta_writer_v2_pool.h" @@ -243,17 +246,6 @@ Status ExecEnv::_init(const std::vector& store_paths, // min num equal to fragment pool's min num // max num is useless because it will start as many as requested in the past // queue size is useless because the max thread num is very large - static_cast(ThreadPoolBuilder("SendReportThreadPool") - .set_min_threads(config::fragment_pool_thread_num_min) - .set_max_threads(std::numeric_limits::max()) - .set_max_queue_size(config::fragment_pool_queue_size) - .build(&_send_report_thread_pool)); - - static_cast(ThreadPoolBuilder("JoinNodeThreadPool") - .set_min_threads(config::fragment_pool_thread_num_min) - .set_max_threads(std::numeric_limits::max()) - .set_max_queue_size(config::fragment_pool_queue_size) - .build(&_join_node_thread_pool)); 
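Note on the task_queue.cpp hunk earlier in this patch: `_prio_task_queue_list` becomes a `shared_ptr` that readers snapshot with `std::atomic_load_explicit` and that `close()` overwrites with `std::atomic_store`, so a worker racing with shutdown still holds a live reference to the queue list. A minimal standalone sketch of that pattern (names and values are illustrative, not Doris's):

```cpp
#include <atomic>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

// Readers atomically load a shared_ptr snapshot; close() atomically publishes
// nullptr, so the vector is destroyed only after the last reader drops its copy.
struct Queues {
    std::shared_ptr<std::vector<int>> list =
            std::make_shared<std::vector<int>>(std::vector<int>{1, 2, 3});

    int take(size_t i) {
        // The snapshot keeps the vector alive even if close() runs concurrently.
        auto snapshot = std::atomic_load_explicit(&list, std::memory_order_relaxed);
        return snapshot ? (*snapshot)[i] : -1;
    }

    void close() {
        std::atomic_store(&list, std::shared_ptr<std::vector<int>>(nullptr));
    }
};

int main() {
    Queues q;
    std::cout << q.take(0) << '\n'; // 1
    q.close();
    std::cout << q.take(0) << '\n'; // -1: snapshot is null after close()
}
```

The patched `take()` loads its snapshot once before the loop, which is why it can keep using the list safely even after `close()` has published the null pointer.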
static_cast(ThreadPoolBuilder("LazyReleaseMemoryThreadPool") .set_min_threads(1) .set_max_threads(1) @@ -263,12 +255,21 @@ Status ExecEnv::_init(const std::vector& store_paths, .set_min_threads(config::min_nonblock_close_thread_num) .set_max_threads(config::max_nonblock_close_thread_num) .build(&_non_block_close_thread_pool)); + static_cast(ThreadPoolBuilder("S3FileSystemThreadPool") + .set_min_threads(config::min_s3_file_system_thread_num) + .set_max_threads(config::max_s3_file_system_thread_num) + .build(&_s3_file_system_thread_pool)); // NOTE: runtime query statistics mgr could be visited by query and daemon thread // so it should be created before all query begin and deleted after all query and daemon thread stoppped _runtime_query_statistics_mgr = new RuntimeQueryStatisticsMgr(); + CgroupCpuCtl::init_doris_cgroup_path(); _file_cache_factory = new io::FileCacheFactory(); - init_file_cache_factory(); + std::vector cache_paths; + init_file_cache_factory(cache_paths); + doris::io::BeConfDataDirReader::init_be_conf_data_dir(store_paths, spill_store_paths, + cache_paths); + _pipeline_tracer_ctx = std::make_unique(); // before query RETURN_IF_ERROR(init_pipeline_task_scheduler()); _workload_group_manager = new WorkloadGroupMgr(); @@ -287,7 +288,10 @@ Status ExecEnv::_init(const std::vector& store_paths, _load_stream_mgr = std::make_unique(num_flush_threads); _new_load_stream_mgr = NewLoadStreamMgr::create_shared(); _internal_client_cache = new BrpcClientCache(); - _function_client_cache = new BrpcClientCache(); + _streaming_client_cache = + new BrpcClientCache("baidu_std", "single", "streaming"); + _function_client_cache = + new BrpcClientCache(config::function_service_protocol); if (config::is_cloud_mode()) { _stream_load_executor = std::make_shared(this); } else { @@ -339,7 +343,8 @@ Status ExecEnv::_init(const std::vector& store_paths, options.broken_paths = broken_paths; options.backend_uid = doris::UniqueId::gen_uid(); if (config::is_cloud_mode()) { - std::cout << "start BE in cloud mode" << std::endl; + std::cout << "start BE in cloud mode, cloud_unique_id: " << config::cloud_unique_id + << ", meta_service_endpoint: " << config::meta_service_endpoint << std::endl; _storage_engine = std::make_unique(options.backend_uid); } else { std::cout << "start BE in local mode" << std::endl; @@ -384,7 +389,7 @@ Status ExecEnv::init_pipeline_task_scheduler() { return Status::OK(); } -void ExecEnv::init_file_cache_factory() { +void ExecEnv::init_file_cache_factory(std::vector& cache_paths) { // Load file cache before starting up daemon threads to make sure StorageEngine is read. if (doris::config::enable_file_cache) { if (config::file_cache_each_block_size > config::s3_write_buffer_size || @@ -397,7 +402,6 @@ void ExecEnv::init_file_cache_factory() { exit(-1); } std::unordered_set cache_path_set; - std::vector cache_paths; Status rest = doris::parse_conf_cache_paths(doris::config::file_cache_path, cache_paths); if (!rest) { LOG(FATAL) << "parse config file cache path failed, path=" @@ -514,21 +518,18 @@ Status ExecEnv::_init_mem_env() { // SegmentLoader caches segments in rowset granularity. So the size of // opened files will greater than segment_cache_capacity. 
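The `_init_mem_env()` hunk that follows replaces the fixed `fd_number / 5` cap with two independent budgets: an fd-based capacity cap and a memory cap taken as a percentage of the process limit. A worked example of the new arithmetic; the percentages and limits here are assumed, illustrative values, not Doris defaults:

```cpp
#include <cstdint>
#include <iostream>

int main() {
    const int64_t fd_number = 100000;                    // process fd limit, assumed
    const int64_t segment_cache_fd_percentage = 20;      // assumed config value
    const int64_t mem_limit = 64LL * 1024 * 1024 * 1024; // 64 GiB, assumed
    const int64_t segment_cache_memory_percentage = 2;   // assumed config value

    int64_t capacity = -1; // config::segment_cache_capacity, negative means auto
    const int64_t fd_cap = fd_number / 100 * segment_cache_fd_percentage;
    if (capacity < 0 || capacity > fd_cap) {
        capacity = fd_cap; // at most 20000 cached segments
    }
    const int64_t mem_cap = mem_limit / 100 * segment_cache_memory_percentage;

    // capacity=20000, mem_cap ~ 1.28 GiB with these inputs
    std::cout << "capacity=" << capacity << " mem_cap=" << mem_cap << '\n';
}
```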
     int64_t segment_cache_capacity = config::segment_cache_capacity;
-    if (segment_cache_capacity < 0 || segment_cache_capacity > fd_number * 1 / 5) {
-        segment_cache_capacity = fd_number * 1 / 5;
+    int64_t segment_cache_fd_limit = fd_number / 100 * config::segment_cache_fd_percentage;
+    if (segment_cache_capacity < 0 || segment_cache_capacity > segment_cache_fd_limit) {
+        segment_cache_capacity = segment_cache_fd_limit;
     }
     int64_t segment_cache_mem_limit =
             MemInfo::mem_limit() / 100 * config::segment_cache_memory_percentage;
-    // config::segment_cache_memory_percentage;
-    int64_t min_segment_cache_mem_limit =
-            min(segment_cache_mem_limit, segment_cache_capacity *
-                                                 config::estimated_num_columns_per_segment *
-                                                 config::estimated_mem_per_column_reader);
-    _segment_loader = new SegmentLoader(min_segment_cache_mem_limit, segment_cache_capacity);
+
+    _segment_loader = new SegmentLoader(segment_cache_mem_limit, segment_cache_capacity);
-    LOG(INFO) << "segment_cache_capacity <= fd_number * 1 / 5, fd_number: " << fd_number
+    LOG(INFO) << "segment_cache_fd_limit: " << segment_cache_fd_limit
+              << ", fd_number: " << fd_number
               << " segment_cache_capacity: " << segment_cache_capacity
-              << " min_segment_cache_mem_limit " << min_segment_cache_mem_limit;
+              << " segment_cache_mem_limit: " << segment_cache_mem_limit;
 
     _schema_cache = new SchemaCache(config::schema_cache_capacity);
@@ -573,6 +574,10 @@ Status ExecEnv::_init_mem_env() {
               << PrettyPrinter::print(inverted_index_cache_limit, TUnit::BYTES)
               << ", origin config value: " << config::inverted_index_query_cache_limit;
 
+    // init orc memory pool
+    _orc_memory_pool = new doris::vectorized::ORCMemoryPool();
+    _arrow_memory_pool = new doris::vectorized::ArrowMemoryPool();
+
     return Status::OK();
 }
@@ -601,6 +606,8 @@ void ExecEnv::init_mem_tracker() {
             MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "SubcolumnsTree");
     _s3_file_buffer_tracker =
             MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "S3FileBuffer");
+    _stream_load_pipe_tracker =
+            MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "StreamLoadPipe");
 }
 
 void ExecEnv::_register_metrics() {
@@ -666,10 +673,9 @@ void ExecEnv::destroy() {
     }
     SAFE_SHUTDOWN(_buffered_reader_prefetch_thread_pool);
     SAFE_SHUTDOWN(_s3_file_upload_thread_pool);
-    SAFE_SHUTDOWN(_join_node_thread_pool);
     SAFE_SHUTDOWN(_lazy_release_obj_pool);
     SAFE_SHUTDOWN(_non_block_close_thread_pool);
-    SAFE_SHUTDOWN(_send_report_thread_pool);
+    SAFE_SHUTDOWN(_s3_file_system_thread_pool);
     SAFE_SHUTDOWN(_send_batch_thread_pool);
     _deregister_metrics();
@@ -701,6 +707,7 @@ void ExecEnv::destroy() {
     SAFE_DELETE(_routine_load_task_executor);
     // _stream_load_executor
     SAFE_DELETE(_function_client_cache);
+    SAFE_DELETE(_streaming_client_cache);
     SAFE_DELETE(_internal_client_cache);
     SAFE_DELETE(_bfd_parser);
@@ -711,10 +718,9 @@ void ExecEnv::destroy() {
     SAFE_DELETE(_file_cache_factory);
     SAFE_DELETE(_runtime_filter_timer_queue);
     // TODO(zhiqiang): Maybe we should call shutdown before release thread pool?
-    _join_node_thread_pool.reset(nullptr);
     _lazy_release_obj_pool.reset(nullptr);
     _non_block_close_thread_pool.reset(nullptr);
-    _send_report_thread_pool.reset(nullptr);
+    _s3_file_system_thread_pool.reset(nullptr);
     _send_table_stats_thread_pool.reset(nullptr);
     _buffered_reader_prefetch_thread_pool.reset(nullptr);
     _s3_file_upload_thread_pool.reset(nullptr);
@@ -751,6 +757,9 @@ void ExecEnv::destroy() {
     // We should free task scheduler finally because task queue / scheduler maybe used by pipelineX.
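The `destroy()` sequence continues below with `SAFE_DELETE` calls for the new client cache and memory pools. Those macros are presumably the usual delete-then-null idiom, sketched here under that assumption (the real definitions live elsewhere in the Doris tree):

```cpp
#include <cstdio>

// Assumed shape of SAFE_DELETE: delete-then-null makes a repeated call harmless.
#define SAFE_DELETE(p)   \
    do {                 \
        delete (p);      \
        (p) = nullptr;   \
    } while (false)

struct Pool {
    ~Pool() { std::puts("pool destroyed"); }
};

int main() {
    Pool* orc_pool = new Pool();
    SAFE_DELETE(orc_pool); // destroys once
    SAFE_DELETE(orc_pool); // now a harmless delete of nullptr
}
```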
SAFE_DELETE(_without_group_task_scheduler); + SAFE_DELETE(_arrow_memory_pool); + SAFE_DELETE(_orc_memory_pool); + // dns cache is a global instance and need to be released at last SAFE_DELETE(_dns_cache); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 057dca4a2ee18bb..58bd2681ea7a621 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -17,6 +17,7 @@ #include "runtime/fragment_mgr.h" +#include #include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -45,11 +47,13 @@ #include "common/status.h" // IWYU pragma: no_include #include // IWYU pragma: keep +#include #include #include #include #include #include +#include #include #include "cloud/config.h" @@ -133,6 +137,103 @@ std::string to_load_error_http_path(const std::string& file_name) { using apache::thrift::TException; using apache::thrift::transport::TTransportException; +static Status _do_fetch_running_queries_rpc(const FrontendInfo& fe_info, + std::unordered_set& query_set) { + TFetchRunningQueriesResult rpc_result; + TFetchRunningQueriesRequest rpc_request; + + Status client_status; + const int32 timeout_ms = 3 * 1000; + FrontendServiceConnection rpc_client(ExecEnv::GetInstance()->frontend_client_cache(), + fe_info.info.coordinator_address, timeout_ms, + &client_status); + // Abort this fe. + if (!client_status.ok()) { + LOG_WARNING("Failed to get client for {}, reason is {}", + PrintThriftNetworkAddress(fe_info.info.coordinator_address), + client_status.to_string()); + return Status::InternalError("Failed to get client for {}, reason is {}", + PrintThriftNetworkAddress(fe_info.info.coordinator_address), + client_status.to_string()); + } + + // do rpc + try { + try { + rpc_client->fetchRunningQueries(rpc_result, rpc_request); + } catch (const apache::thrift::transport::TTransportException& e) { + LOG_WARNING("Transport exception reason: {}, reopening", e.what()); + client_status = rpc_client.reopen(config::thrift_rpc_timeout_ms); + if (!client_status.ok()) { + LOG_WARNING("Reopen failed, reason: {}", client_status.to_string_no_stack()); + return Status::InternalError("Reopen failed, reason: {}", + client_status.to_string_no_stack()); + } + + rpc_client->fetchRunningQueries(rpc_result, rpc_request); + } + } catch (apache::thrift::TException& e) { + // During upgrading cluster or meet any other network error. + LOG_WARNING("Failed to fetch running queries from {}, reason: {}", + PrintThriftNetworkAddress(fe_info.info.coordinator_address), e.what()); + return Status::InternalError("Failed to fetch running queries from {}, reason: {}", + PrintThriftNetworkAddress(fe_info.info.coordinator_address), + e.what()); + } + + // Avoid logic error in frontend. 
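`_do_fetch_running_queries_rpc` above follows the usual thrift-client shape: call, catch the transport exception, reopen the cached connection, retry exactly once, and let a second failure surface to the caller. A self-contained model of that control flow; `TransportError` and `reopen` are stand-ins for `apache::thrift::transport::TTransportException` and `FrontendServiceConnection::reopen()`:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

struct TransportError : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Call once; on a transport error, re-establish the connection and retry once.
// A TransportError thrown by the retry propagates to the caller.
template <typename Call, typename Reopen>
std::string call_with_one_retry(Call call, Reopen reopen) {
    try {
        return call();
    } catch (const TransportError&) {
        reopen();
        return call();
    }
}

int main() {
    int attempts = 0;
    auto flaky = [&]() -> std::string {
        if (++attempts == 1) throw TransportError("stale connection");
        return "ok";
    };
    std::cout << call_with_one_retry(flaky, [] { std::cout << "reopen\n"; }) << '\n';
}
```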
+ if (rpc_result.__isset.status == false || rpc_result.status.status_code != TStatusCode::OK) { + LOG_WARNING("Failed to fetch running queries from {}, reason: {}", + PrintThriftNetworkAddress(fe_info.info.coordinator_address), + doris::to_string(rpc_result.status.status_code)); + return Status::InternalError("Failed to fetch running queries from {}, reason: {}", + PrintThriftNetworkAddress(fe_info.info.coordinator_address), + doris::to_string(rpc_result.status.status_code)); + } + + if (rpc_result.__isset.running_queries == false) { + return Status::InternalError("Failed to fetch running queries from {}, reason: {}", + PrintThriftNetworkAddress(fe_info.info.coordinator_address), + "running_queries is not set"); + } + + query_set = std::unordered_set(rpc_result.running_queries.begin(), + rpc_result.running_queries.end()); + return Status::OK(); +}; + +static std::map> _get_all_running_queries_from_fe() { + const std::map& running_fes = + ExecEnv::GetInstance()->get_running_frontends(); + + std::map> result; + std::vector qualified_fes; + + for (const auto& fe : running_fes) { + // Only consider normal frontend. + if (fe.first.port != 0 && fe.second.info.process_uuid != 0) { + qualified_fes.push_back(fe.second); + } else { + return {}; + } + } + + for (const auto& fe_addr : qualified_fes) { + const int64_t process_uuid = fe_addr.info.process_uuid; + std::unordered_set query_set; + Status st = _do_fetch_running_queries_rpc(fe_addr, query_set); + if (!st.ok()) { + // Empty result, cancel worker will not do anything + return {}; + } + + // frontend_info and process_uuid has been checked in rpc threads. + result[process_uuid] = query_set; + } + + return result; +} + FragmentMgr::FragmentMgr(ExecEnv* exec_env) : _exec_env(exec_env), _stop_background_threads_latch(1) { _entity = DorisMetrics::instance()->metric_registry()->register_entity("FragmentMgr"); @@ -143,24 +244,15 @@ FragmentMgr::FragmentMgr(ExecEnv* exec_env) &_cancel_thread); CHECK(s.ok()) << s.to_string(); - // TODO(zc): we need a better thread-pool - // now one user can use all the thread pool, others have no resource. - s = ThreadPoolBuilder("FragmentMgrThreadPool") - .set_min_threads(config::fragment_pool_thread_num_min) - .set_max_threads(config::fragment_pool_thread_num_max) - .set_max_queue_size(config::fragment_pool_queue_size) + s = ThreadPoolBuilder("FragmentMgrAsyncWorkThreadPool") + .set_min_threads(config::fragment_mgr_asynic_work_pool_thread_num_min) + .set_max_threads(config::fragment_mgr_asynic_work_pool_thread_num_max) + .set_max_queue_size(config::fragment_mgr_asynic_work_pool_queue_size) .build(&_thread_pool); REGISTER_HOOK_METRIC(fragment_thread_pool_queue_size, [this]() { return _thread_pool->get_queue_size(); }); CHECK(s.ok()) << s.to_string(); - - s = ThreadPoolBuilder("FragmentInstanceReportThreadPool") - .set_min_threads(48) - .set_max_threads(512) - .set_max_queue_size(102400) - .build(&_async_report_thread_pool); - CHECK(s.ok()) << s.to_string(); } FragmentMgr::~FragmentMgr() = default; @@ -172,9 +264,6 @@ void FragmentMgr::stop() { if (_cancel_thread) { _cancel_thread->join(); } - // Stop all the worker, should wait for a while? 
- // _thread_pool->wait_for(); - _thread_pool->shutdown(); // Only me can delete { @@ -182,7 +271,7 @@ void FragmentMgr::stop() { _query_ctx_map.clear(); _pipeline_map.clear(); } - _async_report_thread_pool->shutdown(); + _thread_pool->shutdown(); } std::string FragmentMgr::to_http_path(const std::string& file_name) { @@ -195,7 +284,7 @@ std::string FragmentMgr::to_http_path(const std::string& file_name) { Status FragmentMgr::trigger_pipeline_context_report( const ReportStatusRequest req, std::shared_ptr&& ctx) { - return _async_report_thread_pool->submit_func([this, req, ctx]() { + return _thread_pool->submit_func([this, req, ctx]() { SCOPED_ATTACH_TASK(ctx->get_query_ctx()->query_mem_tracker); coordinator_callback(req); if (!req.done) { @@ -479,11 +568,13 @@ void FragmentMgr::coordinator_callback(const ReportStatusRequest& req) { static void empty_function(RuntimeState*, Status*) {} -Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params) { +Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params, + const QuerySource query_source) { return Status::InternalError("Non-pipeline is disabled!"); } -Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params) { +Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, + const QuerySource query_source) { if (params.txn_conf.need_txn) { std::shared_ptr stream_load_ctx = std::make_shared(_exec_env); @@ -515,7 +606,7 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params) { RETURN_IF_ERROR(_exec_env->stream_load_executor()->execute_plan_fragment(stream_load_ctx)); return Status::OK(); } else { - return exec_plan_fragment(params, empty_function); + return exec_plan_fragment(params, query_source, empty_function); } } @@ -578,7 +669,10 @@ std::shared_ptr FragmentMgr::get_or_erase_query_ctx_with_lock( template Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, bool pipeline, + QuerySource query_source, std::shared_ptr& query_ctx) { + DBUG_EXECUTE_IF("FragmentMgr._get_query_ctx.failed", + { return Status::InternalError("FragmentMgr._get_query_ctx.failed"); }); if (params.is_simplified_param) { // Get common components from _query_ctx_map std::lock_guard lock(_lock); @@ -607,9 +701,9 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo // This may be a first fragment request of the query. // Create the query fragments context. 
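`_get_query_ctx` implements get-or-create over a map of weak pointers: if an earlier fragment of the same query left a live `QueryContext`, it is reused; otherwise the code below creates one. A simplified standalone model of that pattern (the real map is keyed by `TUniqueId` and guarded by `_lock`):

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct QueryContext {
    std::string id;
};

// weak_ptr entries let a context die with its last fragment; a later request
// with the same id transparently re-creates it.
std::map<std::string, std::weak_ptr<QueryContext>> g_query_ctx_map;

std::shared_ptr<QueryContext> get_or_create(const std::string& id) {
    if (auto it = g_query_ctx_map.find(id); it != g_query_ctx_map.end()) {
        if (auto ctx = it->second.lock()) {
            return ctx; // still alive: reuse
        }
    }
    auto ctx = std::make_shared<QueryContext>(QueryContext{id});
    g_query_ctx_map[id] = ctx;
    return ctx;
}

int main() {
    auto a = get_or_create("q1");
    auto b = get_or_create("q1");
    std::cout << (a == b) << '\n'; // 1: same context reused
}
```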
- query_ctx = - QueryContext::create_shared(query_id, _exec_env, params.query_options, params.coord, - pipeline, params.is_nereids, params.current_connect_fe); + query_ctx = QueryContext::create_shared(query_id, _exec_env, params.query_options, + params.coord, pipeline, params.is_nereids, + params.current_connect_fe, query_source); SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(query_ctx->query_mem_tracker); RETURN_IF_ERROR(DescriptorTbl::create(&(query_ctx->obj_pool), params.desc_tbl, &(query_ctx->desc_tbl))); @@ -658,7 +752,7 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo } Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params, - const FinishCallback& cb) { + QuerySource query_source, const FinishCallback& cb) { return Status::InternalError("Non-pipeline is disabled!"); } @@ -700,7 +794,7 @@ std::string FragmentMgr::dump_pipeline_tasks(TUniqueId& query_id) { } Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, - const FinishCallback& cb) { + QuerySource query_source, const FinishCallback& cb) { VLOG_ROW << "query: " << print_id(params.query_id) << " exec_plan_fragment params is " << apache::thrift::ThriftDebugString(params).c_str(); // sometimes TExecPlanFragmentParams debug string is too long and glog @@ -709,7 +803,7 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, << apache::thrift::ThriftDebugString(params.query_options).c_str(); std::shared_ptr query_ctx; - RETURN_IF_ERROR(_get_query_ctx(params, params.query_id, true, query_ctx)); + RETURN_IF_ERROR(_get_query_ctx(params, params.query_id, true, query_source, query_ctx)); SCOPED_ATTACH_TASK(query_ctx.get()); int64_t duration_ns = 0; std::shared_ptr context = @@ -840,12 +934,30 @@ void FragmentMgr::cancel_instance(const TUniqueId instance_id, const Status reas void FragmentMgr::cancel_worker() { LOG(INFO) << "FragmentMgr cancel worker start working."; + + timespec check_invalid_query_last_timestamp; + clock_gettime(CLOCK_MONOTONIC, &check_invalid_query_last_timestamp); + do { std::vector queries_lost_coordinator; std::vector queries_timeout; + std::vector queries_pipeline_task_leak; + // Fe process uuid -> set + std::map> running_queries_on_all_fes; + const std::map& running_fes = + ExecEnv::GetInstance()->get_running_frontends(); timespec now; clock_gettime(CLOCK_MONOTONIC, &now); + + if (now.tv_sec - check_invalid_query_last_timestamp.tv_sec > + config::pipeline_task_leakage_detect_period_secs) { + check_invalid_query_last_timestamp = now; + running_queries_on_all_fes = _get_all_running_queries_from_fe(); + } else { + running_queries_on_all_fes.clear(); + } + { std::lock_guard lock(_lock); for (auto& pipeline_itr : _pipeline_map) { @@ -865,8 +977,6 @@ void FragmentMgr::cancel_worker() { } } - const auto& running_fes = ExecEnv::GetInstance()->get_running_frontends(); - // We use a very conservative cancel strategy. // 0. If there are no running frontends, do not cancel any queries. // 1. If query's process uuid is zero, do not cancel @@ -880,15 +990,37 @@ void FragmentMgr::cancel_worker() { } else { for (const auto& it : _query_ctx_map) { if (auto q_ctx = it.second.lock()) { - if (q_ctx->get_fe_process_uuid() == 0) { + const int64_t fe_process_uuid = q_ctx->get_fe_process_uuid(); + + if (fe_process_uuid == 0) { // zero means this query is from a older version fe or // this fe is starting continue; } + // If the query is not running on the any frontends, cancel it. 
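The leak check that follows is deliberately conservative: a query is only flagged when a fresh snapshot exists for its frontend, the snapshot does not list the query, the query arrived before the snapshot was taken (so it cannot simply be newer than the snapshot), and it originated from an internal frontend. A standalone sketch of that predicate:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

bool is_leaked(const std::unordered_map<int64_t, std::unordered_set<std::string>>& snapshot,
               int64_t fe_uuid, const std::string& query_id,
               int64_t query_arrival_sec, int64_t snapshot_sec,
               bool from_internal_frontend) {
    auto it = snapshot.find(fe_uuid);
    if (it == snapshot.end()) return false;      // no data for this FE: be conservative
    return it->second.count(query_id) == 0 &&    // the FE does not know the query
           query_arrival_sec < snapshot_sec &&   // and it predates the snapshot
           from_internal_frontend;               // and a frontend started it
}

int main() {
    std::unordered_map<int64_t, std::unordered_set<std::string>> snap{{42, {"q1"}}};
    std::cout << is_leaked(snap, 42, "q2", 100, 200, true) << '\n'; // 1: leaked
    std::cout << is_leaked(snap, 42, "q2", 300, 200, true) << '\n'; // 0: newer than snapshot
}
```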
+ if (auto itr = running_queries_on_all_fes.find(fe_process_uuid); + itr != running_queries_on_all_fes.end()) { + // Query not found on this frontend, and the query arrives before the last check + if (itr->second.find(it.first) == itr->second.end() && + // tv_nsec represents the number of nanoseconds that have elapsed since the time point stored in tv_sec. + // tv_sec is enough, we do not need to check tv_nsec. + q_ctx->get_query_arrival_timestamp().tv_sec < + check_invalid_query_last_timestamp.tv_sec && + q_ctx->get_query_source() == QuerySource::INTERNAL_FRONTEND) { + queries_pipeline_task_leak.push_back(q_ctx->query_id()); + LOG_INFO( + "Query {}, type {} is not found on any frontends, maybe it " + "is leaked.", + print_id(q_ctx->query_id()), + toString(q_ctx->get_query_source())); + continue; + } + } + auto itr = running_fes.find(q_ctx->coord_addr); if (itr != running_fes.end()) { - if (q_ctx->get_fe_process_uuid() == itr->second.info.process_uuid || + if (fe_process_uuid == itr->second.info.process_uuid || itr->second.info.process_uuid == 0) { continue; } else { @@ -942,9 +1074,18 @@ void FragmentMgr::cancel_worker() { "FragmentMgr cancel worker going to cancel timeout instance ")); } + for (const auto& qid : queries_pipeline_task_leak) { + // Cancel the query, and maybe try to report debug info to fe so that we can + // collect debug info by sql or http api instead of search log. + cancel_query(qid, Status::Error( + "Potential pipeline task leakage")); + } + for (const auto& qid : queries_lost_coordinator) { - cancel_query(qid, Status::InternalError("Coordinator dead.")); + cancel_query(qid, Status::Error( + "Source frontend is not running or restarted")); } + } while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(1))); LOG(INFO) << "FragmentMgr cancel worker is going to exit."; } @@ -1042,7 +1183,7 @@ Status FragmentMgr::exec_external_plan_fragment(const TScanOpenParams& params, exec_fragment_params.__set_query_options(query_options); VLOG_ROW << "external exec_plan_fragment params is " << apache::thrift::ThriftDebugString(exec_fragment_params).c_str(); - return exec_plan_fragment(exec_fragment_params); + return exec_plan_fragment(exec_fragment_params, QuerySource::EXTERNAL_CONNECTOR); } Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, @@ -1148,8 +1289,6 @@ Status FragmentMgr::sync_filter_size(const PSyncFilterSizeRequest* request) { Status FragmentMgr::merge_filter(const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data) { UniqueId queryid = request->query_id(); - std::shared_ptr filter_controller; - RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); std::shared_ptr query_ctx; { @@ -1165,6 +1304,8 @@ Status FragmentMgr::merge_filter(const PMergeFilterRequest* request, } } SCOPED_ATTACH_TASK(query_ctx.get()); + std::shared_ptr filter_controller; + RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); auto merge_status = filter_controller->merge(request, attach_data); return merge_status; } diff --git a/be/src/runtime/fragment_mgr.h b/be/src/runtime/fragment_mgr.h index ad39c56cc1180a9..bc066066f7b6a6c 100644 --- a/be/src/runtime/fragment_mgr.h +++ b/be/src/runtime/fragment_mgr.h @@ -81,17 +81,19 @@ class FragmentMgr : public RestMonitorIface { void stop(); // execute one plan fragment - Status exec_plan_fragment(const TExecPlanFragmentParams& params); + Status exec_plan_fragment(const TExecPlanFragmentParams& params, const QuerySource query_type); - Status 
exec_plan_fragment(const TPipelineFragmentParams& params); + Status exec_plan_fragment(const TPipelineFragmentParams& params, const QuerySource query_type); void remove_pipeline_context( std::shared_ptr pipeline_context); // TODO(zc): report this is over - Status exec_plan_fragment(const TExecPlanFragmentParams& params, const FinishCallback& cb); + Status exec_plan_fragment(const TExecPlanFragmentParams& params, const QuerySource query_type, + const FinishCallback& cb); - Status exec_plan_fragment(const TPipelineFragmentParams& params, const FinishCallback& cb); + Status exec_plan_fragment(const TPipelineFragmentParams& params, const QuerySource query_type, + const FinishCallback& cb); Status start_query_execution(const PExecPlanFragmentStartRequest* request); @@ -155,7 +157,7 @@ class FragmentMgr : public RestMonitorIface { template Status _get_query_ctx(const Params& params, TUniqueId query_id, bool pipeline, - std::shared_ptr& query_ctx); + QuerySource query_type, std::shared_ptr& query_ctx); // This is input params ExecEnv* _exec_env = nullptr; @@ -182,8 +184,6 @@ class FragmentMgr : public RestMonitorIface { UIntGauge* timeout_canceled_fragment_count = nullptr; RuntimeFilterMergeController _runtimefilter_controller; - std::unique_ptr _async_report_thread_pool = - nullptr; // used for pipeliine context report }; uint64_t get_fragment_executing_count(); diff --git a/be/src/runtime/frontend_info.h b/be/src/runtime/frontend_info.h index a7e4b3f999b29e3..c4d3d710b3ca369 100644 --- a/be/src/runtime/frontend_info.h +++ b/be/src/runtime/frontend_info.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -28,4 +29,9 @@ struct FrontendInfo { std::time_t last_reveiving_time_ms; }; +struct FrontendAddrAndRunningQueries { + TNetworkAddress frontend_addr; + std::set running_queries; +}; + } // namespace doris diff --git a/be/src/runtime/group_commit_mgr.cpp b/be/src/runtime/group_commit_mgr.cpp index 30885fa1ac92184..3250379cf859240 100644 --- a/be/src/runtime/group_commit_mgr.cpp +++ b/be/src/runtime/group_commit_mgr.cpp @@ -38,12 +38,10 @@ namespace doris { Status LoadBlockQueue::add_block(RuntimeState* runtime_state, std::shared_ptr block, bool write_wal, UniqueId& load_id) { + DBUG_EXECUTE_IF("LoadBlockQueue.add_block.failed", + { return Status::InternalError("LoadBlockQueue.add_block.failed"); }); std::unique_lock l(mutex); RETURN_IF_ERROR(status); - auto start = std::chrono::steady_clock::now(); - DBUG_EXECUTE_IF("LoadBlockQueue.add_block.back_pressure_time_out", { - start = std::chrono::steady_clock::now() - std::chrono::milliseconds(120000); - }); if (UNLIKELY(runtime_state->is_cancelled())) { return runtime_state->cancel_reason(); } @@ -435,13 +433,18 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ Status result_status; DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.err_status", { status = Status::InternalError(""); }); + DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.load_error", + { status = Status::InternalError("load_error"); }); if (status.ok()) { + DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.commit_error", + { status = Status::InternalError(""); }); // commit txn TLoadTxnCommitRequest request; request.__set_auth_code(0); // this is a fake, fe not check it now request.__set_db_id(db_id); request.__set_table_id(table_id); request.__set_txnId(txn_id); + request.__set_thrift_rpc_timeout_ms(config::txn_commit_rpc_timeout_ms); request.__set_groupCommit(true); 
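The group-commit hunks above and below inject failures through `DBUG_EXECUTE_IF`. The real macro consults debug points that can be toggled at runtime; this stand-in only models the control flow so the example is self-contained:

```cpp
#include <iostream>
#include <set>
#include <string>

// Stand-in for Doris's runtime-togglable debug points.
static std::set<std::string> g_active_debug_points = {"Load.commit_error"};

// Execute `body` only when the named debug point is active.
#define DBUG_EXECUTE_IF(name, body)          \
    if (g_active_debug_points.count(name)) { \
        body                                 \
    }

int main() {
    bool ok = true;
    DBUG_EXECUTE_IF("Load.commit_error", { ok = false; });
    std::cout << (ok ? "commit" : "injected failure") << '\n'; // injected failure
}
```

The value of the pattern is visible in `_finish_group_commit_load`: error paths such as commit failure and "commit succeeded but the RPC reply was lost" can be exercised by regression tests without touching the real transaction service.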
request.__set_receiveBytes(state->num_bytes_load_total()); if (_exec_env->master_info()->__isset.backend_id) { @@ -461,7 +464,7 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ [&request, &result](FrontendServiceConnection& client) { client->loadTxnCommit(result, request); }, - 10000L); + config::txn_commit_rpc_timeout_ms); result_status = Status::create(result.status); // DELETE_BITMAP_LOCK_ERROR will be retried if (result_status.ok() || !result_status.is()) { @@ -474,6 +477,8 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ .error(result_status); retry_times++; } + DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.commit_success_and_rpc_error", + { result_status = Status::InternalError("commit_success_and_rpc_error"); }); } else { // abort txn TLoadTxnRollbackRequest request; @@ -487,8 +492,7 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ master_addr.hostname, master_addr.port, [&request, &result](FrontendServiceConnection& client) { client->loadTxnRollback(result, request); - }, - 10000L); + }); result_status = Status::create(result.status); DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.err_status", { std ::string msg = "abort txn"; @@ -578,7 +582,8 @@ Status GroupCommitTable::_exec_plan_fragment(int64_t db_id, int64_t table_id, << ", st=" << finish_st.to_string(); } }; - return _exec_env->fragment_mgr()->exec_plan_fragment(pipeline_params, finish_cb); + return _exec_env->fragment_mgr()->exec_plan_fragment(pipeline_params, + QuerySource::GROUP_COMMIT_LOAD, finish_cb); } Status GroupCommitTable::get_load_block_queue(const TUniqueId& instance_id, diff --git a/be/src/runtime/load_stream.cpp b/be/src/runtime/load_stream.cpp index c818c4664a06899..aa1749caace3e64 100644 --- a/be/src/runtime/load_stream.cpp +++ b/be/src/runtime/load_stream.cpp @@ -62,7 +62,7 @@ TabletStream::TabletStream(PUniqueId load_id, int64_t id, int64_t txn_id, _txn_id(txn_id), _load_stream_mgr(load_stream_mgr) { load_stream_mgr->create_tokens(_flush_tokens); - _failed_st = std::make_shared(); + _status = Status::OK(); _profile = profile->create_child(fmt::format("TabletStream {}", id), true, true); _append_data_timer = ADD_TIMER(_profile, "AppendDataTime"); _add_segment_timer = ADD_TIMER(_profile, "AddSegmentTime"); @@ -71,7 +71,7 @@ TabletStream::TabletStream(PUniqueId load_id, int64_t id, int64_t txn_id, inline std::ostream& operator<<(std::ostream& ostr, const TabletStream& tablet_stream) { ostr << "load_id=" << tablet_stream._load_id << ", txn_id=" << tablet_stream._txn_id - << ", tablet_id=" << tablet_stream._id << ", status=" << *tablet_stream._failed_st; + << ", tablet_id=" << tablet_stream._id << ", status=" << tablet_stream._status; return ostr; } @@ -89,17 +89,20 @@ Status TabletStream::init(std::shared_ptr schema, int64_t }; _load_stream_writer = std::make_shared(&req, _profile); - auto st = _load_stream_writer->init(); - if (!st.ok()) { - _failed_st = std::make_shared(st); + DBUG_EXECUTE_IF("TabletStream.init.uninited_writer", { + _status = Status::Uninitialized("fault injection"); + return _status; + }); + _status = _load_stream_writer->init(); + if (!_status.ok()) { LOG(INFO) << "failed to init rowset builder due to " << *this; } - return st; + return _status; } Status TabletStream::append_data(const PStreamHeader& header, butil::IOBuf* data) { - if (!_failed_st->ok()) { - return *_failed_st; + if (!_status.ok()) { + return _status; } // dispatch add_segment request @@ -156,9 
+159,9 @@ Status TabletStream::append_data(const PStreamHeader& header, butil::IOBuf* data file_type, new_segid); } } - if (!st.ok() && _failed_st->ok()) { - _failed_st = std::make_shared(st); - LOG(INFO) << "write data failed " << *this; + if (!st.ok() && _status.ok()) { + _status = st; + LOG(WARNING) << "write data failed " << st << ", " << *this; } }; auto& flush_token = _flush_tokens[new_segid % _flush_tokens.size()]; @@ -173,10 +176,11 @@ Status TabletStream::append_data(const PStreamHeader& header, butil::IOBuf* data timer.start(); while (flush_token->num_tasks() >= load_stream_flush_token_max_tasks) { if (timer.elapsed_time() / 1000 / 1000 >= load_stream_max_wait_flush_token_time_ms) { - return Status::Error( + _status = Status::Error( "wait flush token back pressure time is more than " "load_stream_max_wait_flush_token_time {}", load_stream_max_wait_flush_token_time_ms); + return _status; } bthread_usleep(2 * 1000); // 2ms } @@ -184,10 +188,18 @@ Status TabletStream::append_data(const PStreamHeader& header, butil::IOBuf* data int64_t time_ms = timer.elapsed_time() / 1000 / 1000; g_load_stream_flush_wait_ms << time_ms; g_load_stream_flush_running_threads << 1; - return flush_token->submit_func(flush_func); + auto st = flush_token->submit_func(flush_func); + if (!st.ok()) { + _status = st; + } + return _status; } Status TabletStream::add_segment(const PStreamHeader& header, butil::IOBuf* data) { + if (!_status.ok()) { + return _status; + } + SCOPED_TIMER(_add_segment_timer); DCHECK(header.has_segment_statistics()); SegmentStatistics stat(header.segment_statistics()); @@ -204,15 +216,17 @@ Status TabletStream::add_segment(const PStreamHeader& header, butil::IOBuf* data { std::lock_guard lock_guard(_lock); if (!_segids_mapping.contains(src_id)) { - return Status::InternalError( + _status = Status::InternalError( "add segment failed, no segment written by this src be yet, src_id={}, " "segment_id={}", src_id, segid); + return _status; } if (segid >= _segids_mapping[src_id]->size()) { - return Status::InternalError( + _status = Status::InternalError( "add segment failed, segment is never written, src_id={}, segment_id={}", src_id, segid); + return _status; } new_segid = _segids_mapping[src_id]->at(segid); } @@ -221,16 +235,24 @@ Status TabletStream::add_segment(const PStreamHeader& header, butil::IOBuf* data auto add_segment_func = [this, new_segid, stat, flush_schema]() { signal::set_signal_task_id(_load_id); auto st = _load_stream_writer->add_segment(new_segid, stat, flush_schema); - if (!st.ok() && _failed_st->ok()) { - _failed_st = std::make_shared(st); + if (!st.ok() && _status.ok()) { + _status = st; LOG(INFO) << "add segment failed " << *this; } }; auto& flush_token = _flush_tokens[new_segid % _flush_tokens.size()]; - return flush_token->submit_func(add_segment_func); + auto st = flush_token->submit_func(add_segment_func); + if (!st.ok()) { + _status = st; + } + return _status; } Status TabletStream::close() { + if (!_status.ok()) { + return _status; + } + SCOPED_TIMER(_close_wait_timer); bthread::Mutex mu; std::unique_lock lock(mu); @@ -247,23 +269,24 @@ Status TabletStream::close() { if (ret) { cv.wait(lock); } else { - return Status::Error( + _status = Status::Error( "there is not enough thread resource for close load"); + return _status; } - if (!_failed_st->ok()) { - return *_failed_st; - } if (_next_segid.load() != _num_segments) { - return Status::Corruption( + _status = Status::Corruption( "segment num mismatch in tablet {}, expected: {}, actual: {}, load_id: {}", _id, 
_num_segments, _next_segid.load(), print_id(_load_id)); + return _status; } - Status st = Status::OK(); - auto close_func = [this, &mu, &cv, &st]() { + auto close_func = [this, &mu, &cv]() { signal::set_signal_task_id(_load_id); - st = _load_stream_writer->close(); + auto st = _load_stream_writer->close(); + if (!st.ok() && _status.ok()) { + _status = st; + } std::lock_guard lock(mu); cv.notify_one(); }; @@ -271,10 +294,10 @@ Status TabletStream::close() { if (ret) { cv.wait(lock); } else { - return Status::Error( + _status = Status::Error( "there is not enough thread resource for close load"); } - return st; + return _status; } IndexStream::IndexStream(PUniqueId load_id, int64_t id, int64_t txn_id, @@ -298,7 +321,7 @@ Status IndexStream::append_data(const PStreamHeader& header, butil::IOBuf* data) std::lock_guard lock_guard(_lock); auto it = _tablet_streams_map.find(tablet_id); if (it == _tablet_streams_map.end()) { - RETURN_IF_ERROR(_init_tablet_stream(tablet_stream, tablet_id, header.partition_id())); + _init_tablet_stream(tablet_stream, tablet_id, header.partition_id()); } else { tablet_stream = it->second; } @@ -307,17 +330,19 @@ Status IndexStream::append_data(const PStreamHeader& header, butil::IOBuf* data) return tablet_stream->append_data(header, data); } -Status IndexStream::_init_tablet_stream(TabletStreamSharedPtr& tablet_stream, int64_t tablet_id, - int64_t partition_id) { +void IndexStream::_init_tablet_stream(TabletStreamSharedPtr& tablet_stream, int64_t tablet_id, + int64_t partition_id) { tablet_stream = std::make_shared(_load_id, tablet_id, _txn_id, _load_stream_mgr, _profile); _tablet_streams_map[tablet_id] = tablet_stream; - RETURN_IF_ERROR(tablet_stream->init(_schema, _id, partition_id)); - return Status::OK(); + auto st = tablet_stream->init(_schema, _id, partition_id); + if (!st.ok()) { + LOG(WARNING) << "tablet stream init failed " << *tablet_stream; + } } -Status IndexStream::close(const std::vector& tablets_to_commit, - std::vector* success_tablet_ids, FailedTablets* failed_tablets) { +void IndexStream::close(const std::vector& tablets_to_commit, + std::vector* success_tablet_ids, FailedTablets* failed_tablets) { std::lock_guard lock_guard(_lock); SCOPED_TIMER(_close_wait_timer); // open all need commit tablets @@ -328,8 +353,7 @@ Status IndexStream::close(const std::vector& tablets_to_commit, TabletStreamSharedPtr tablet_stream; auto it = _tablet_streams_map.find(tablet.tablet_id()); if (it == _tablet_streams_map.end()) { - RETURN_IF_ERROR( - _init_tablet_stream(tablet_stream, tablet.tablet_id(), tablet.partition_id())); + _init_tablet_stream(tablet_stream, tablet.tablet_id(), tablet.partition_id()); tablet_stream->add_num_segments(tablet.num_segments()); } else { it->second->add_num_segments(tablet.num_segments()); @@ -345,7 +369,6 @@ Status IndexStream::close(const std::vector& tablets_to_commit, failed_tablets->emplace_back(tablet_stream->id(), st); } } - return Status::OK(); } // TODO: Profile is temporary disabled, because: @@ -398,8 +421,8 @@ Status LoadStream::init(const POpenLoadStreamRequest* request) { return Status::OK(); } -Status LoadStream::close(int64_t src_id, const std::vector& tablets_to_commit, - std::vector* success_tablet_ids, FailedTablets* failed_tablets) { +void LoadStream::close(int64_t src_id, const std::vector& tablets_to_commit, + std::vector* success_tablet_ids, FailedTablets* failed_tablets) { std::lock_guard lock_guard(_lock); SCOPED_TIMER(_close_wait_timer); @@ -417,16 +440,14 @@ Status LoadStream::close(int64_t src_id, const 
std::vector& tablets_t if (_close_load_cnt < _total_streams) { // do not return commit info if there is remaining streams. - return Status::OK(); + return; } for (auto& [_, index_stream] : _index_streams_map) { - RETURN_IF_ERROR( - index_stream->close(_tablets_to_commit, success_tablet_ids, failed_tablets)); + index_stream->close(_tablets_to_commit, success_tablet_ids, failed_tablets); } LOG(INFO) << "close load " << *this << ", success_tablet_num=" << success_tablet_ids->size() << ", failed_tablet_num=" << failed_tablets->size(); - return Status::OK(); } void LoadStream::_report_result(StreamId stream, const Status& status, @@ -612,8 +633,8 @@ void LoadStream::_dispatch(StreamId id, const PStreamHeader& hdr, butil::IOBuf* std::vector success_tablet_ids; FailedTablets failed_tablets; std::vector tablets_to_commit(hdr.tablets().begin(), hdr.tablets().end()); - auto st = close(hdr.src_id(), tablets_to_commit, &success_tablet_ids, &failed_tablets); - _report_result(id, st, success_tablet_ids, failed_tablets, true); + close(hdr.src_id(), tablets_to_commit, &success_tablet_ids, &failed_tablets); + _report_result(id, Status::OK(), success_tablet_ids, failed_tablets, true); brpc::StreamClose(id); } break; case PStreamHeader::GET_SCHEMA: { diff --git a/be/src/runtime/load_stream.h b/be/src/runtime/load_stream.h index 80e69c784ad789c..427bc2dbb62cc83 100644 --- a/be/src/runtime/load_stream.h +++ b/be/src/runtime/load_stream.h @@ -66,7 +66,7 @@ class TabletStream { std::atomic _next_segid; int64_t _num_segments = 0; bthread::Mutex _lock; - std::shared_ptr _failed_st; + Status _status; PUniqueId _load_id; int64_t _txn_id; RuntimeProfile* _profile = nullptr; @@ -86,12 +86,12 @@ class IndexStream { Status append_data(const PStreamHeader& header, butil::IOBuf* data); - Status close(const std::vector& tablets_to_commit, - std::vector* success_tablet_ids, FailedTablets* failed_tablet_ids); + void close(const std::vector& tablets_to_commit, + std::vector* success_tablet_ids, FailedTablets* failed_tablet_ids); private: - Status _init_tablet_stream(TabletStreamSharedPtr& tablet_stream, int64_t tablet_id, - int64_t partition_id); + void _init_tablet_stream(TabletStreamSharedPtr& tablet_stream, int64_t tablet_id, + int64_t partition_id); private: int64_t _id; @@ -124,8 +124,8 @@ class LoadStream : public brpc::StreamInputHandler { } } - Status close(int64_t src_id, const std::vector& tablets_to_commit, - std::vector* success_tablet_ids, FailedTablets* failed_tablet_ids); + void close(int64_t src_id, const std::vector& tablets_to_commit, + std::vector* success_tablet_ids, FailedTablets* failed_tablet_ids); // callbacks called by brpc int on_received_messages(StreamId id, butil::IOBuf* const messages[], size_t size) override; diff --git a/be/src/runtime/load_stream_writer.cpp b/be/src/runtime/load_stream_writer.cpp index 3e66787a9bd372a..ca78311b8ea2508 100644 --- a/be/src/runtime/load_stream_writer.cpp +++ b/be/src/runtime/load_stream_writer.cpp @@ -84,6 +84,8 @@ LoadStreamWriter::LoadStreamWriter(WriteRequest* context, RuntimeProfile* profil } LoadStreamWriter::~LoadStreamWriter() { + g_load_stream_file_writer_cnt << -_segment_file_writers.size(); + g_load_stream_file_writer_cnt << -_inverted_file_writers.size(); g_load_stream_writer_cnt << -1; } @@ -140,7 +142,6 @@ Status LoadStreamWriter::close_writer(uint32_t segid, FileType file_type) { file_type == FileType::SEGMENT_FILE ? 
_segment_file_writers : _inverted_file_writers; { std::lock_guard lock_guard(_lock); - DBUG_EXECUTE_IF("LoadStreamWriter.close_writer.uninited_writer", { _is_init = false; }); if (!_is_init) { return Status::Corruption("close_writer failed, LoadStreamWriter is not inited"); } @@ -165,7 +166,6 @@ Status LoadStreamWriter::close_writer(uint32_t segid, FileType file_type) { _is_canceled = true; return st; } - g_load_stream_file_writer_cnt << -1; LOG(INFO) << "file " << segid << " path " << file_writer->path().native() << "closed, written " << file_writer->bytes_appended() << " bytes" << ", file type is " << file_type; @@ -183,7 +183,6 @@ Status LoadStreamWriter::add_segment(uint32_t segid, const SegmentStatistics& st size_t inverted_file_size = 0; { std::lock_guard lock_guard(_lock); - DBUG_EXECUTE_IF("LoadStreamWriter.add_segment.uninited_writer", { _is_init = false; }); if (!_is_init) { return Status::Corruption("add_segment failed, LoadStreamWriter is not inited"); } diff --git a/be/src/runtime/memory/cache_manager.cpp b/be/src/runtime/memory/cache_manager.cpp index 9bf3d1e12d0c8cc..a6516c40a35770a 100644 --- a/be/src/runtime/memory/cache_manager.cpp +++ b/be/src/runtime/memory/cache_manager.cpp @@ -48,24 +48,22 @@ int64_t CacheManager::for_each_cache_prune_stale(RuntimeProfile* profile) { return 0; } -int64_t CacheManager::for_each_cache_prune_all(RuntimeProfile* profile) { - if (need_prune(&_last_prune_all_timestamp, "all")) { +int64_t CacheManager::for_each_cache_prune_all(RuntimeProfile* profile, bool force) { + if (force || need_prune(&_last_prune_all_timestamp, "all")) { return for_each_cache_prune_stale_wrap( - [](CachePolicy* cache_policy) { cache_policy->prune_all(false); }, profile); + [force](CachePolicy* cache_policy) { cache_policy->prune_all(force); }, profile); } return 0; } -void CacheManager::clear_once() { +int64_t CacheManager::cache_prune_all(CachePolicy::CacheType type, bool force) { std::lock_guard l(_caches_lock); - for (const auto& pair : _caches) { - pair.second->prune_all(true); + auto* cache_policy = _caches[type]; + if (!cache_policy->enable_prune()) { + return -1; } -} - -void CacheManager::clear_once(CachePolicy::CacheType type) { - std::lock_guard l(_caches_lock); - _caches[type]->prune_all(true); // will print log + cache_policy->prune_all(force); + return cache_policy->profile()->get_counter("FreedMemory")->value(); } } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.h b/be/src/runtime/memory/cache_manager.h index 20372366aa1a7d4..d94dca501670bf1 100644 --- a/be/src/runtime/memory/cache_manager.h +++ b/be/src/runtime/memory/cache_manager.h @@ -64,10 +64,9 @@ class CacheManager { int64_t for_each_cache_prune_stale(RuntimeProfile* profile = nullptr); - int64_t for_each_cache_prune_all(RuntimeProfile* profile = nullptr); - - void clear_once(); - void clear_once(CachePolicy::CacheType type); + // if force is true, regardless of the two prune interval and cache size, cache will be pruned this time. 
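cache_policy.h below adds a string-to-`CacheType` map with a `NONE` sentinel for unknown names, so an externally supplied cache name can be validated cheaply. A compact model of that lookup; note it uses `find()` on a const map, whereas the patch calls `contains()` and then `operator[]` on a mutable inline static map, which works but performs two lookups and non-const access:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

enum class CacheType { SEGMENT_CACHE, DATA_PAGE_CACHE, NONE };

static const std::unordered_map<std::string, CacheType> kStringToType = {
        {"SegmentCache", CacheType::SEGMENT_CACHE},
        {"DataPageCache", CacheType::DATA_PAGE_CACHE},
};

// Unknown names map to the NONE sentinel instead of throwing.
CacheType string_to_type(const std::string& name) {
    auto it = kStringToType.find(name);
    return it == kStringToType.end() ? CacheType::NONE : it->second;
}

int main() {
    std::cout << (string_to_type("SegmentCache") == CacheType::SEGMENT_CACHE) << '\n'; // 1
    std::cout << (string_to_type("NoSuchCache") == CacheType::NONE) << '\n';           // 1
}
```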
+ int64_t for_each_cache_prune_all(RuntimeProfile* profile = nullptr, bool force = false); + int64_t cache_prune_all(CachePolicy::CacheType type, bool force = false); bool need_prune(int64_t* last_timestamp, const std::string& type) { int64_t now = UnixSeconds(); diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index e59c5c7ac3e9787..c457afd86898f27 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -47,6 +47,7 @@ class CachePolicy { CREATE_TABLET_RR_IDX_CACHE = 15, CLOUD_TABLET_CACHE = 16, CLOUD_TXN_DELETE_BITMAP_CACHE = 17, + NONE = 18, // not be used }; static std::string type_string(CacheType type) { @@ -94,6 +95,34 @@ class CachePolicy { __builtin_unreachable(); } + inline static std::unordered_map StringToType = { + {"DataPageCache", CacheType::DATA_PAGE_CACHE}, + {"IndexPageCache", CacheType::INDEXPAGE_CACHE}, + {"PKIndexPageCache", CacheType::PK_INDEX_PAGE_CACHE}, + {"SchemaCache", CacheType::SCHEMA_CACHE}, + {"SegmentCache", CacheType::SEGMENT_CACHE}, + {"InvertedIndexSearcherCache", CacheType::INVERTEDINDEX_SEARCHER_CACHE}, + {"InvertedIndexQueryCache", CacheType::INVERTEDINDEX_QUERY_CACHE}, + {"PointQueryLookupConnectionCache", CacheType::LOOKUP_CONNECTION_CACHE}, + {"PointQueryRowCache", CacheType::POINT_QUERY_ROW_CACHE}, + {"MowDeleteBitmapAggCache", CacheType::DELETE_BITMAP_AGG_CACHE}, + {"MowTabletVersionCache", CacheType::TABLET_VERSION_CACHE}, + {"LastSuccessChannelCache", CacheType::LAST_SUCCESS_CHANNEL_CACHE}, + {"CommonObjLRUCache", CacheType::COMMON_OBJ_LRU_CACHE}, + {"ForUT", CacheType::FOR_UT}, + {"TabletSchemaCache", CacheType::TABLET_SCHEMA_CACHE}, + {"CreateTabletRRIdxCache", CacheType::CREATE_TABLET_RR_IDX_CACHE}, + {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, + {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}}; + + static CacheType string_to_type(std::string type) { + if (StringToType.contains(type)) { + return StringToType[type]; + } else { + return CacheType::NONE; + } + } + CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune); virtual ~CachePolicy(); diff --git a/be/src/runtime/memory/global_memory_arbitrator.cpp b/be/src/runtime/memory/global_memory_arbitrator.cpp index 35fa350987f34f5..344bcbc59846d9c 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.cpp +++ b/be/src/runtime/memory/global_memory_arbitrator.cpp @@ -40,7 +40,7 @@ std::atomic GlobalMemoryArbitrator::_s_process_reserved_memory = 0; std::atomic GlobalMemoryArbitrator::refresh_interval_memory_growth = 0; bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { - if (sys_mem_available() - bytes < MemInfo::sys_mem_available_low_water_mark()) { + if (sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark()) { return false; } int64_t old_reserved_mem = _s_process_reserved_memory.load(std::memory_order_relaxed); @@ -50,7 +50,7 @@ bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { if (UNLIKELY(vm_rss_sub_allocator_cache() + refresh_interval_memory_growth.load(std::memory_order_relaxed) + new_reserved_mem >= - MemInfo::mem_limit())) { + MemInfo::soft_mem_limit())) { return false; } } while (!_s_process_reserved_memory.compare_exchange_weak(old_reserved_mem, new_reserved_mem, diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index 46eddfa9810704b..cc695a6fdd51e1c 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ 
b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -43,7 +43,17 @@ namespace doris { -bvar::Adder g_memtrackerlimiter_cnt("memtrackerlimiter_cnt"); +static bvar::Adder memory_memtrackerlimiter_cnt("memory_memtrackerlimiter_cnt"); +static bvar::Adder memory_all_trackers_sum_bytes("memory_all_trackers_sum_bytes"); +static bvar::Adder memory_global_trackers_sum_bytes("memory_global_trackers_sum_bytes"); +static bvar::Adder memory_query_trackers_sum_bytes("memory_query_trackers_sum_bytes"); +static bvar::Adder memory_load_trackers_sum_bytes("memory_load_trackers_sum_bytes"); +static bvar::Adder memory_compaction_trackers_sum_bytes( + "memory_compaction_trackers_sum_bytes"); +static bvar::Adder memory_schema_change_trackers_sum_bytes( + "memory_schema_change_trackers_sum_bytes"); +static bvar::Adder memory_other_trackers_sum_bytes("memory_other_trackers_sum_bytes"); + constexpr auto GC_MAX_SEEK_TRACKER = 1000; std::atomic MemTrackerLimiter::_enable_print_log_process_usage {true}; @@ -80,7 +90,7 @@ MemTrackerLimiter::MemTrackerLimiter(Type type, const std::string& label, int64_ if (_type == Type::LOAD || _type == Type::QUERY) { _query_statistics = std::make_shared(); } - g_memtrackerlimiter_cnt << 1; + memory_memtrackerlimiter_cnt << 1; } std::shared_ptr MemTrackerLimiter::create_shared(MemTrackerLimiter::Type type, @@ -137,7 +147,7 @@ MemTrackerLimiter::~MemTrackerLimiter() { << print_address_sanitizers(); #endif } - g_memtrackerlimiter_cnt << -1; + memory_memtrackerlimiter_cnt << -1; } #ifndef NDEBUG @@ -223,9 +233,40 @@ void MemTrackerLimiter::refresh_global_counter() { } } } + int64_t all_trackers_mem_sum = 0; for (auto it : type_mem_sum) { MemTrackerLimiter::TypeMemSum[it.first]->set(it.second); + all_trackers_mem_sum += it.second; + switch (it.first) { + case Type::GLOBAL: + memory_global_trackers_sum_bytes + << it.second - memory_global_trackers_sum_bytes.get_value(); + break; + case Type::QUERY: + memory_query_trackers_sum_bytes + << it.second - memory_query_trackers_sum_bytes.get_value(); + break; + case Type::LOAD: + memory_load_trackers_sum_bytes + << it.second - memory_load_trackers_sum_bytes.get_value(); + break; + case Type::COMPACTION: + memory_compaction_trackers_sum_bytes + << it.second - memory_compaction_trackers_sum_bytes.get_value(); + break; + case Type::SCHEMA_CHANGE: + memory_schema_change_trackers_sum_bytes + << it.second - memory_schema_change_trackers_sum_bytes.get_value(); + break; + case Type::OTHER: + memory_other_trackers_sum_bytes + << it.second - memory_other_trackers_sum_bytes.get_value(); + } } + all_trackers_mem_sum += MemInfo::allocator_cache_mem(); + all_trackers_mem_sum += MemInfo::allocator_metadata_mem(); + memory_all_trackers_sum_bytes << all_trackers_mem_sum - + memory_all_trackers_sum_bytes.get_value(); } void MemTrackerLimiter::clean_tracker_limiter_group() { @@ -248,7 +289,7 @@ void MemTrackerLimiter::clean_tracker_limiter_group() { void MemTrackerLimiter::make_process_snapshots(std::vector* snapshots) { MemTrackerLimiter::refresh_global_counter(); - int64_t all_tracker_mem_sum = 0; + int64_t all_trackers_mem_sum = 0; Snapshot snapshot; for (auto it : MemTrackerLimiter::TypeMemSum) { snapshot.type = "overview"; @@ -257,7 +298,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = it.second->current_value(); snapshot.peak_consumption = it.second->peak_value(); (*snapshots).emplace_back(snapshot); - all_tracker_mem_sum += it.second->current_value(); + all_trackers_mem_sum += it.second->current_value(); } 
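The `refresh_global_counter()` hunk above turns monotonic `bvar::Adder` counters into absolute gauges by pushing `new_value - get_value()`. A standalone model of that trick; `Adder` here is a toy stand-in for `bvar::Adder<int64_t>`, which only exposes `operator<<` (add) and `get_value()`:

```cpp
#include <iostream>

struct Adder {
    long long v = 0;
    Adder& operator<<(long long delta) {
        v += delta;
        return *this;
    }
    long long get_value() const { return v; }
};

// Pushing the delta pins an add-only counter to an absolute reading.
void set_absolute(Adder& a, long long absolute) {
    a << (absolute - a.get_value());
}

int main() {
    Adder mem_sum;
    set_absolute(mem_sum, 1024);
    set_absolute(mem_sum, 512); // works even when the new reading is smaller
    std::cout << mem_sum.get_value() << '\n'; // 512
}
```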
snapshot.type = "overview"; @@ -266,7 +307,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = MemInfo::allocator_cache_mem(); snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); - all_tracker_mem_sum += MemInfo::allocator_cache_mem(); + all_trackers_mem_sum += MemInfo::allocator_cache_mem(); snapshot.type = "overview"; snapshot.label = "tc/jemalloc_metadata"; @@ -274,20 +315,28 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = MemInfo::allocator_metadata_mem(); snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); - all_tracker_mem_sum += MemInfo::allocator_metadata_mem(); + all_trackers_mem_sum += MemInfo::allocator_metadata_mem(); + + snapshot.type = "overview"; + snapshot.label = "reserved_memory"; + snapshot.limit = -1; + snapshot.cur_consumption = GlobalMemoryArbitrator::process_reserved_memory(); + snapshot.peak_consumption = -1; + (*snapshots).emplace_back(snapshot); + all_trackers_mem_sum += GlobalMemoryArbitrator::process_reserved_memory(); snapshot.type = "overview"; - snapshot.label = "sum of all trackers"; // is virtual memory + snapshot.label = "sum_of_all_trackers"; // is virtual memory snapshot.limit = -1; - snapshot.cur_consumption = all_tracker_mem_sum; + snapshot.cur_consumption = all_trackers_mem_sum; snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); snapshot.type = "overview"; #ifdef ADDRESS_SANITIZER - snapshot.label = "[ASAN]process resident memory"; // from /proc VmRSS VmHWM + snapshot.label = "[ASAN]VmRSS(process resident memory)"; // from /proc VmRSS VmHWM #else - snapshot.label = "process resident memory"; // from /proc VmRSS VmHWM + snapshot.label = "VmRSS(process resident memory)"; // from /proc VmRSS VmHWM #endif snapshot.limit = -1; snapshot.cur_consumption = PerfCounters::get_vm_rss(); @@ -295,14 +344,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector (*snapshots).emplace_back(snapshot); snapshot.type = "overview"; - snapshot.label = "reserve_memory"; - snapshot.limit = -1; - snapshot.cur_consumption = GlobalMemoryArbitrator::process_reserved_memory(); - snapshot.peak_consumption = -1; - (*snapshots).emplace_back(snapshot); - - snapshot.type = "overview"; - snapshot.label = "process virtual memory"; // from /proc VmSize VmPeak + snapshot.label = "VmSize(process virtual memory)"; // from /proc VmSize VmPeak snapshot.limit = -1; snapshot.cur_consumption = PerfCounters::get_vm_size(); snapshot.peak_consumption = PerfCounters::get_vm_peak(); diff --git a/be/src/runtime/memory/memory_reclamation.cpp b/be/src/runtime/memory/memory_reclamation.cpp index 536c4658c8c515d..3adf1d1ac75718c 100644 --- a/be/src/runtime/memory/memory_reclamation.cpp +++ b/be/src/runtime/memory/memory_reclamation.cpp @@ -47,7 +47,6 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { }}; freed_mem += CacheManager::instance()->for_each_cache_prune_stale(profile.get()); - MemInfo::notify_je_purge_dirty_pages(); if (freed_mem > MemInfo::process_minor_gc_size()) { return true; } @@ -98,7 +97,6 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { }}; freed_mem += CacheManager::instance()->for_each_cache_prune_all(profile.get()); - MemInfo::notify_je_purge_dirty_pages(); if (freed_mem > MemInfo::process_full_gc_size()) { return true; } diff --git a/be/src/runtime/memory/thread_mem_tracker_mgr.h b/be/src/runtime/memory/thread_mem_tracker_mgr.h index 141c54382cfd901..73cdd3243da1dc8 100644 --- 
a/be/src/runtime/memory/thread_mem_tracker_mgr.h +++ b/be/src/runtime/memory/thread_mem_tracker_mgr.h @@ -242,17 +242,29 @@ inline void ThreadMemTrackerMgr::consume(int64_t size, int skip_large_memory_che flush_untracked_mem(); } - if (skip_large_memory_check == 0 && doris::config::large_memory_check_bytes > 0 && - size > doris::config::large_memory_check_bytes) { - _stop_consume = true; - LOG(WARNING) << fmt::format( - "malloc or new large memory: {}, {}, this is just a warning, not prevent memory " - "alloc, stacktrace:\n{}", - size, - is_attach_query() ? "in query or load: " + print_id(_query_id) - : "not in query or load", - get_stack_trace()); - _stop_consume = false; + if (skip_large_memory_check == 0) { + if (doris::config::stacktrace_in_alloc_large_memory_bytes > 0 && + size > doris::config::stacktrace_in_alloc_large_memory_bytes) { + _stop_consume = true; + LOG(WARNING) << fmt::format( + "alloc large memory: {}, {}, this is just a warning, not prevent memory alloc, " + "stacktrace:\n{}", + size, + is_attach_query() ? "in query or load: " + print_id(_query_id) + : "not in query or load", + get_stack_trace()); + _stop_consume = false; + } + if (doris::config::crash_in_alloc_large_memory_bytes > 0 && + size > doris::config::crash_in_alloc_large_memory_bytes) { + LOG(FATAL) << fmt::format( + "alloc large memory: {}, {}, crash to generate a core dump to help analyze, " + "stacktrace:\n{}", + size, + is_attach_query() ? "in query or load: " + print_id(_query_id) + : "not in query or load", + get_stack_trace()); + } } } diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index dd7cf4f55b87063..f10a0a5edca90c4 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -55,15 +55,34 @@ class DelayReleaseToken : public Runnable { std::unique_ptr<ThreadPoolToken> token_; }; +const std::string toString(QuerySource queryType) { + switch (queryType) { + case QuerySource::INTERNAL_FRONTEND: + return "INTERNAL_FRONTEND"; + case QuerySource::STREAM_LOAD: + return "STREAM_LOAD"; + case QuerySource::GROUP_COMMIT_LOAD: + return "GROUP_COMMIT_LOAD"; + case QuerySource::ROUTINE_LOAD: + return "ROUTINE_LOAD"; + case QuerySource::EXTERNAL_CONNECTOR: + return "EXTERNAL_CONNECTOR"; + default: + return "UNKNOWN"; + } +} + QueryContext::QueryContext(TUniqueId query_id, ExecEnv* exec_env, const TQueryOptions& query_options, TNetworkAddress coord_addr, - bool is_pipeline, bool is_nereids, TNetworkAddress current_connect_fe) + bool is_pipeline, bool is_nereids, TNetworkAddress current_connect_fe, + QuerySource query_source) : _timeout_second(-1), _query_id(query_id), _exec_env(exec_env), _is_pipeline(is_pipeline), _is_nereids(is_nereids), - _query_options(query_options) { + _query_options(query_options), + _query_source(query_source) { _init_query_mem_tracker(); SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(query_mem_tracker); _query_watcher.start(); @@ -89,7 +108,7 @@ QueryContext::QueryContext(TUniqueId query_id, ExecEnv* exec_env, !this->current_connect_fe.hostname.empty() && this->current_connect_fe.port != 0; DCHECK_EQ(is_report_fe_addr_valid, true); } - + clock_gettime(CLOCK_MONOTONIC, &this->_query_arrival_timestamp); register_memory_statistics(); register_cpu_statistics(); } @@ -178,6 +197,7 @@ QueryContext::~QueryContext() { _runtime_predicates.clear(); file_scan_range_params_map.clear(); obj_pool.clear(); + _merge_controller_handler.reset(); _exec_env->spill_stream_mgr()->async_cleanup_query(_query_id); diff --git a/be/src/runtime/query_context.h 
b/be/src/runtime/query_context.h index 0d7870a0e1d36a3..006305bf5996ac5 100644 --- a/be/src/runtime/query_context.h +++ b/be/src/runtime/query_context.h @@ -63,6 +63,16 @@ struct ReportStatusRequest { std::function cancel_fn; }; +enum class QuerySource { + INTERNAL_FRONTEND, + STREAM_LOAD, + GROUP_COMMIT_LOAD, + ROUTINE_LOAD, + EXTERNAL_CONNECTOR +}; + +const std::string toString(QuerySource query_source); + // Save the common components of fragments in a query. // Some components like DescriptorTbl may be very large // that will slow down each execution of fragments when DeSer them every time. @@ -73,7 +83,7 @@ class QueryContext { public: QueryContext(TUniqueId query_id, ExecEnv* exec_env, const TQueryOptions& query_options, TNetworkAddress coord_addr, bool is_pipeline, bool is_nereids, - TNetworkAddress current_connect_fe); + TNetworkAddress current_connect_fe, QuerySource query_type); ~QueryContext(); @@ -230,18 +240,8 @@ class QueryContext { return _running_big_mem_op_num.load(std::memory_order_relaxed); } - void set_weighted_memory(int64_t weighted_limit, double weighted_ratio) { - std::lock_guard l(_weighted_mem_lock); - _weighted_limit = weighted_limit; - _weighted_ratio = weighted_ratio; - } - - void get_weighted_memory(int64_t& weighted_limit, int64_t& weighted_consumption) { - std::lock_guard l(_weighted_mem_lock); - weighted_limit = _weighted_limit; - weighted_consumption = int64_t(query_mem_tracker->consumption() * _weighted_ratio); - } - + void set_spill_threshold(int64_t spill_threshold) { _spill_threshold = spill_threshold; } + int64_t spill_threshold() { return _spill_threshold; } DescriptorTbl* desc_tbl = nullptr; bool set_rsc_info = false; std::string user; @@ -260,6 +260,12 @@ class QueryContext { // only for file scan node std::map file_scan_range_params_map; + void update_wg_cpu_adder(int64_t delta_cpu_time) { + if (_workload_group != nullptr) { + _workload_group->update_cpu_adder(delta_cpu_time); + } + } + private: int _timeout_second; TUniqueId _query_id; @@ -311,11 +317,13 @@ class QueryContext { std::map> _fragment_id_to_pipeline_ctx; std::mutex _pipeline_map_write_lock; - std::mutex _weighted_mem_lock; - double _weighted_ratio = 0; - int64_t _weighted_limit = 0; + std::atomic _spill_threshold {0}; std::mutex _profile_mutex; + timespec _query_arrival_timestamp; + // Distinguish the query source, for query that comes from fe, we will have some memory structure on FE to + // help us manage the query. 
+ QuerySource _query_source; // when fragment of pipeline is closed, it will register its profile to this map by using add_fragment_profile // flatten profile of one fragment: @@ -354,6 +362,9 @@ class QueryContext { bool enable_profile() const { return _query_options.__isset.enable_profile && _query_options.enable_profile; } + + timespec get_query_arrival_timestamp() const { return this->_query_arrival_timestamp; } + QuerySource get_query_source() const { return this->_query_source; } }; } // namespace doris diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 625b487d0ee1f31..b9bd01095f34e86 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -343,7 +343,7 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz closure->request_->set_filter_size(cnt_val->global_size); stub->sync_filter_size(closure->cntl_.get(), closure->request_.get(), - closure->response_.get(), brpc::DoNothing()); + closure->response_.get(), closure.get()); closure.release(); } } @@ -396,12 +396,7 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ RuntimeFilterWrapperHolder holder; RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(¶ms, holder.getHandle())); - auto st = cnt_val->filter->merge_from(holder.getHandle()->get()); - if (!st) { - // prevent error ignored - DCHECK(false) << st.msg(); - return st; - } + RETURN_IF_ERROR(cnt_val->filter->merge_from(holder.getHandle()->get())); cnt_val->arrive_id.insert(UniqueId(request->fragment_instance_id())); merged_size = cnt_val->arrive_id.size(); @@ -430,7 +425,11 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ } if (data != nullptr && len > 0) { - request_attachment.append(data, len); + void* allocated = malloc(len); + memcpy(allocated, data, len); + // control the memory by doris self to avoid using brpc's thread local storage + // because the memory of tls will not be released + request_attachment.append_user_data(allocated, len, [](void* ptr) { free(ptr); }); has_attachment = true; } @@ -464,7 +463,7 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ continue; } stub->apply_filterv2(closure->cntl_.get(), closure->request_.get(), - closure->response_.get(), brpc::DoNothing()); + closure->response_.get(), closure.get()); closure.release(); } } diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 34aa457d5a6afb4..4930048ed805d55 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -87,11 +87,6 @@ RuntimeState::RuntimeState(const TPlanFragmentExecParams& fragment_exec_params, } #endif DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); - if (ctx) { - _runtime_filter_mgr = std::make_unique( - fragment_exec_params.query_id, RuntimeFilterParamsContext::create(this), - _query_mem_tracker); - } if (fragment_exec_params.__isset.runtime_filter_params) { _query_ctx->runtime_filter_mgr()->set_runtime_filter_params( fragment_exec_params.runtime_filter_params); @@ -127,8 +122,6 @@ RuntimeState::RuntimeState(const TUniqueId& instance_id, const TUniqueId& query_ } #endif DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); - _runtime_filter_mgr.reset(new RuntimeFilterMgr( - query_id, RuntimeFilterParamsContext::create(this), _query_mem_tracker)); } RuntimeState::RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& instance_id, @@ 
-194,8 +187,6 @@ RuntimeState::RuntimeState(const TUniqueId& query_id, int32_t fragment_id, } #endif DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); - _runtime_filter_mgr.reset(new RuntimeFilterMgr( - query_id, RuntimeFilterParamsContext::create(this), _query_mem_tracker)); } RuntimeState::RuntimeState(const TQueryGlobals& query_globals) @@ -255,7 +246,6 @@ RuntimeState::~RuntimeState() { } _obj_pool->clear(); - _runtime_filter_mgr.reset(); } Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOptions& query_options, @@ -469,15 +459,6 @@ std::string RuntimeState::get_error_log_file_path() { return _error_log_file_path; } -int64_t RuntimeState::get_load_mem_limit() { - // TODO: the code is abandoned, it can be deleted after v1.3 - if (_query_options.__isset.load_mem_limit && _query_options.load_mem_limit > 0) { - return _query_options.load_mem_limit; - } else { - return _query_mem_tracker->limit(); - } -} - void RuntimeState::resize_op_id_to_local_state(int operator_size) { _op_id_to_local_state.resize(-operator_size); } diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index e3f8078156fc7e3..f43d0a163dfdaa3 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -344,8 +344,6 @@ class RuntimeState { return _query_options.disable_stream_preaggregations; } - bool enable_spill() const { return _query_options.enable_spilling; } - int32_t runtime_filter_wait_time_ms() const { return _query_options.runtime_filter_wait_time_ms; } @@ -439,25 +437,15 @@ class RuntimeState { std::vector& error_tablet_infos() { return _error_tablet_infos; } - // get mem limit for load channel - // if load mem limit is not set, or is zero, using query mem limit instead. - int64_t get_load_mem_limit(); - // local runtime filter mgr, the runtime filter do not have remote target or // not need local merge should regist here. the instance exec finish, the local // runtime filter mgr can release the memory of local runtime filter - RuntimeFilterMgr* local_runtime_filter_mgr() { - if (_pipeline_x_runtime_filter_mgr) { - return _pipeline_x_runtime_filter_mgr; - } else { - return _runtime_filter_mgr.get(); - } - } + RuntimeFilterMgr* local_runtime_filter_mgr() { return _runtime_filter_mgr; } RuntimeFilterMgr* global_runtime_filter_mgr(); - void set_pipeline_x_runtime_filter_mgr(RuntimeFilterMgr* pipeline_x_runtime_filter_mgr) { - _pipeline_x_runtime_filter_mgr = pipeline_x_runtime_filter_mgr; + void set_runtime_filter_mgr(RuntimeFilterMgr* runtime_filter_mgr) { + _runtime_filter_mgr = runtime_filter_mgr; } QueryContext* get_query_ctx() { return _query_ctx; } @@ -517,12 +505,6 @@ class RuntimeState { void set_be_exec_version(int32_t version) noexcept { _query_options.be_exec_version = version; } - int64_t external_agg_bytes_threshold() const { - return _query_options.__isset.external_agg_bytes_threshold - ? 
_query_options.external_agg_bytes_threshold - : 0; - } - inline bool enable_delete_sub_pred_v2() const { return _query_options.__isset.enable_delete_sub_predicate_v2 && _query_options.enable_delete_sub_predicate_v2; @@ -595,9 +577,9 @@ class RuntimeState { int64_t min_revocable_mem() const { if (_query_options.__isset.min_revocable_mem) { - return _query_options.min_revocable_mem; + return std::max(_query_options.min_revocable_mem, (int64_t)1); } - return 0; + return 1; } void set_max_operator_id(int max_operator_id) { _max_operator_id = max_operator_id; } @@ -642,11 +624,8 @@ class RuntimeState { const DescriptorTbl* _desc_tbl = nullptr; std::shared_ptr _obj_pool; - // runtime filter - std::unique_ptr _runtime_filter_mgr; - // owned by PipelineFragmentContext - RuntimeFilterMgr* _pipeline_x_runtime_filter_mgr = nullptr; + RuntimeFilterMgr* _runtime_filter_mgr = nullptr; // Lock protecting _error_log and _unreported_error_idx std::mutex _error_log_lock; @@ -746,9 +725,11 @@ class RuntimeState { std::string _s3_error_log_file_path; }; -#define RETURN_IF_CANCELLED(state) \ - do { \ - if (UNLIKELY((state)->is_cancelled())) return Status::Cancelled("Cancelled"); \ +#define RETURN_IF_CANCELLED(state) \ + do { \ + if (UNLIKELY((state)->is_cancelled())) { \ + return (state)->cancel_reason(); \ + } \ } while (false) } // namespace doris diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index f34dfde229abc3a..d04a5463879c9ef 100644 --- a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -482,6 +482,8 @@ Status SnapshotLoader::remote_http_download( remote_be_addr.hostname, remote_be_addr.port, token); std::string remote_url_prefix = fmt::format("{}&file={}", base_url, remote_path); + LOG(INFO) << "list remote files: " << remote_url_prefix << ", job: " << _job_id + << ", task id: " << _task_id << ", remote be: " << remote_be_addr; string file_list_str; auto list_files_cb = [&remote_url_prefix, &file_list_str](HttpClient* client) { RETURN_IF_ERROR(client->init(remote_url_prefix)); diff --git a/be/src/runtime/stream_load/stream_load_context.h b/be/src/runtime/stream_load/stream_load_context.h index 633c3af428b94ed..f7c4a0d474fa319 100644 --- a/be/src/runtime/stream_load/stream_load_context.h +++ b/be/src/runtime/stream_load/stream_load_context.h @@ -37,6 +37,7 @@ #include "common/utils.h" #include "runtime/exec_env.h" #include "runtime/stream_load/stream_load_executor.h" +#include "runtime/thread_context.h" #include "util/byte_buffer.h" #include "util/time.h" #include "util/uid_util.h" @@ -95,9 +96,14 @@ class StreamLoadContext { public: StreamLoadContext(ExecEnv* exec_env) : id(UniqueId::gen_uid()), _exec_env(exec_env) { start_millis = UnixMillis(); + SCOPED_ATTACH_TASK(ExecEnv::GetInstance()->stream_load_pipe_tracker()); + schema_buffer = ByteBuffer::allocate(config::stream_tvf_buffer_size); } ~StreamLoadContext() { + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER( + ExecEnv::GetInstance()->stream_load_pipe_tracker()); + schema_buffer.reset(); if (need_rollback) { _exec_env->stream_load_executor()->rollback_txn(this); need_rollback = false; @@ -184,7 +190,7 @@ class StreamLoadContext { std::shared_ptr body_sink; std::shared_ptr pipe; - ByteBufferPtr schema_buffer = ByteBuffer::allocate(config::stream_tvf_buffer_size); + ByteBufferPtr schema_buffer; TStreamLoadPutResult put_result; TStreamLoadMultiTablePutResult multi_table_put_result; diff --git a/be/src/runtime/stream_load/stream_load_executor.cpp 
b/be/src/runtime/stream_load/stream_load_executor.cpp index 28b0556aafdd2cc..4b0788186a0e8af 100644 --- a/be/src/runtime/stream_load/stream_load_executor.cpp +++ b/be/src/runtime/stream_load/stream_load_executor.cpp @@ -142,11 +142,16 @@ Status StreamLoadExecutor::execute_plan_fragment(std::shared_ptr(); @@ -215,6 +216,7 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para ss << "LocalTabletsChannel txn_id: " << _txn_id << " load_id: " << print_id(params.id()) << " incremental open delta writer: "; + // every change holds _lock, and this find is under _lock too, so no need for _tablet_writers_lock again. for (const auto& tablet : params.tablets()) { if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) { continue; } @@ -237,6 +239,7 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para auto delta_writer = create_delta_writer(wrequest); { + // here we modify _tablet_writers, so the lock is needed. std::lock_guard l(_tablet_writers_lock); _tablet_writers.emplace(tablet.tablet_id(), std::move(delta_writer)); } @@ -291,6 +294,7 @@ Status TabletsChannel::close(LoadChannel* parent, const PTabletWriterAddBlockReq // All senders are closed // 1. close all delta writers std::set need_wait_writers; + // under _lock, no need for _tablet_writers_lock again. for (auto&& [tablet_id, writer] : _tablet_writers) { if (_partition_ids.contains(writer->partition_id())) { auto st = writer->close(); @@ -492,6 +496,7 @@ Status BaseTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& req #endif int tablet_cnt = 0; + // under _lock, no need for _tablet_writers_lock again. for (const auto& tablet : request.tablets()) { if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) { continue; } @@ -574,6 +579,11 @@ Status BaseTabletsChannel::_write_block_data( std::function write_func) { google::protobuf::RepeatedPtrField* tablet_errors = response->mutable_tablet_errors(); + + // add_batch may run concurrently with inc_open and is not under _lock, + // so protect the lookup with _tablet_writers_lock. + std::lock_guard l(_tablet_writers_lock); + auto tablet_writer_it = _tablet_writers.find(tablet_id); if (tablet_writer_it == _tablet_writers.end()) { return Status::InternalError("unknown tablet to append data, tablet={}", tablet_id); diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h index 48e987341587d7f..87fbf9d06aaaa75 100644 --- a/be/src/runtime/tablets_channel.h +++ b/be/src/runtime/tablets_channel.h @@ -143,11 +143,8 @@ class BaseTabletsChannel { // id of this load channel TabletsChannelKey _key; - // make execute sequence + // protects _state changes (open and close); when add_batch finishes, also lock to change _next_seqs std::mutex _lock; - - SpinLock _tablet_writers_lock; - enum State { kInitialized, kOpened, @@ -173,8 +170,10 @@ class BaseTabletsChannel { // currently it's OK. Status _close_status; - // tablet_id -> TabletChannel + // tablet_id -> TabletChannel. it is only changed in open() or inc_open() std::unordered_map> _tablet_writers; + // protects _tablet_writers + SpinLock _tablet_writers_lock; // broken tablet ids. // If a tablet write fails, its id will be added to this set. // So that following batch will not handle this tablet anymore. 
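Note on the tablets_channel locking comments above: _lock serializes channel state changes (open/close), while _tablet_writers_lock guards the _tablet_writers map itself, since add_batch can race with incremental_open without holding _lock. A simplified sketch of that two-lock discipline, assuming std::mutex and a placeholder writer type in place of the patch's SpinLock and DeltaWriter:

#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>

struct WriterStub {
    void append(int64_t /*row*/) {}
};

class TabletsChannelSketch {
    std::mutex _lock;                // serializes open()/incremental_open()/close()
    std::mutex _tablet_writers_lock; // guards _tablet_writers
    std::unordered_map<int64_t, std::unique_ptr<WriterStub>> _tablet_writers;

public:
    void incremental_open(int64_t tablet_id) {
        std::lock_guard<std::mutex> state_guard(_lock);
        // mutation still takes the map lock: readers on the write path do not hold _lock
        std::lock_guard<std::mutex> map_guard(_tablet_writers_lock);
        _tablet_writers.try_emplace(tablet_id, std::make_unique<WriterStub>());
    }

    bool write(int64_t tablet_id, int64_t row) {
        // add_batch path: not under _lock, so the lookup and write hold the map lock
        std::lock_guard<std::mutex> map_guard(_tablet_writers_lock);
        auto it = _tablet_writers.find(tablet_id);
        if (it == _tablet_writers.end()) {
            return false;
        }
        it->second->append(row);
        return true;
    }
};
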
diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index b009affa53fc7c7..c54b1a6892bd9ee 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -114,6 +114,40 @@ __VA_ARGS__; \ } while (0) +#define LIMIT_LOCAL_SCAN_IO(data_dir, bytes_read) \ + std::shared_ptr iot = nullptr; \ + auto* t_ctx = doris::thread_context(true); \ + if (t_ctx) { \ + iot = t_ctx->get_local_scan_io_throttle(data_dir); \ + } \ + if (iot) { \ + iot->acquire(-1); \ + } \ + Defer defer { \ + [&]() { \ + if (iot) { \ + iot->update_next_io_time(*bytes_read); \ + t_ctx->update_total_local_scan_io_adder(*bytes_read); \ + } \ + } \ + } + +#define LIMIT_REMOTE_SCAN_IO(bytes_read) \ + std::shared_ptr iot = nullptr; \ + if (auto* t_ctx = doris::thread_context(true)) { \ + iot = t_ctx->get_remote_scan_io_throttle(); \ + } \ + if (iot) { \ + iot->acquire(-1); \ + } \ + Defer defer { \ + [&]() { \ + if (iot) { \ + iot->update_next_io_time(*bytes_read); \ + } \ + } \ + } + namespace doris { class ThreadContext; @@ -230,13 +264,26 @@ class ThreadContext { std::weak_ptr workload_group() { return _wg_wptr; } - std::shared_ptr io_throttle(const std::string& data_dir) { + std::shared_ptr get_local_scan_io_throttle(const std::string& data_dir) { if (std::shared_ptr wg_ptr = _wg_wptr.lock()) { - return wg_ptr->get_scan_io_throttle(data_dir); + return wg_ptr->get_local_scan_io_throttle(data_dir); } return nullptr; } + std::shared_ptr get_remote_scan_io_throttle() { + if (std::shared_ptr wg_ptr = _wg_wptr.lock()) { + return wg_ptr->get_remote_scan_io_throttle(); + } + return nullptr; + } + + void update_total_local_scan_io_adder(size_t bytes_read) { + if (std::shared_ptr wg_ptr = _wg_wptr.lock()) { + wg_ptr->update_total_local_scan_io_adder(bytes_read); + } + } + int thread_local_handle_count = 0; int skip_memory_check = 0; int skip_large_memory_check = 0; diff --git a/be/src/runtime/workload_group/workload_group.cpp b/be/src/runtime/workload_group/workload_group.cpp index 6347193e319cd2a..e37f83a00e828b0 100644 --- a/be/src/runtime/workload_group/workload_group.cpp +++ b/be/src/runtime/workload_group/workload_group.cpp @@ -43,11 +43,9 @@ namespace doris { -const static uint64_t CPU_SHARE_DEFAULT_VALUE = 1024; const static std::string MEMORY_LIMIT_DEFAULT_VALUE = "0%"; const static bool ENABLE_MEMORY_OVERCOMMIT_DEFAULT_VALUE = true; const static int CPU_HARD_LIMIT_DEFAULT_VALUE = -1; -const static uint64_t CPU_SOFT_LIMIT_DEFAULT_VALUE = 1024; const static int SPILL_LOW_WATERMARK_DEFAULT_VALUE = 50; const static int SPILL_HIGH_WATERMARK_DEFAULT_VALUE = 80; @@ -69,9 +67,18 @@ WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info) _remote_scan_bytes_per_second(tg_info.remote_read_bytes_per_second) { std::vector& data_dir_list = io::BeConfDataDirReader::be_config_data_dir_list; for (const auto& data_dir : data_dir_list) { - _scan_io_throttle_map[data_dir.path] = std::make_shared(); - } - _remote_scan_io_throttle = std::make_shared(); + _scan_io_throttle_map[data_dir.path] = + std::make_shared(_name, data_dir.bvar_name + "_read_bytes"); + } + _remote_scan_io_throttle = std::make_shared(_name, "remote_read_bytes"); + _mem_used_status = std::make_unique>(_name, "memory_used", 0); + _cpu_usage_adder = std::make_unique>(_name, "cpu_usage_adder"); + _cpu_usage_per_second = std::make_unique>>( + _name, "cpu_usage", _cpu_usage_adder.get(), 10); + _total_local_scan_io_adder = + std::make_unique>(_name, "total_local_read_bytes"); + _total_local_scan_io_per_second = std::make_unique>>( + 
_name, "total_local_read_bytes_per_second", _total_local_scan_io_adder.get(), 1); } std::string WorkloadGroup::debug_string() const { @@ -136,6 +143,7 @@ int64_t WorkloadGroup::make_memory_tracker_snapshots( } } refresh_memory(used_memory); + _mem_used_status->set_value(used_memory); return used_memory; } @@ -152,10 +160,6 @@ void WorkloadGroup::refresh_memory(int64_t used_memory) { _wg_refresh_interval_memory_growth.store(0.0); } -void WorkloadGroup::set_weighted_memory_ratio(double ratio) { - _weighted_mem_ratio = ratio; -} - void WorkloadGroup::add_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr) { std::unique_lock wlock(_mutex); auto group_num = mem_tracker_ptr->group_num(); @@ -304,7 +308,7 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( } // 4 cpu_share - uint64_t cpu_share = CPU_SHARE_DEFAULT_VALUE; + uint64_t cpu_share = CgroupCpuCtl::cpu_soft_limit_default_value(); if (tworkload_group_info.__isset.cpu_share) { cpu_share = tworkload_group_info.cpu_share; } @@ -409,14 +413,18 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e std::lock_guard wlock(_task_sched_lock); if (config::doris_cgroup_cpu_path != "" && _cgroup_cpu_ctl == nullptr) { - std::unique_ptr cgroup_cpu_ctl = std::make_unique(tg_id); - Status ret = cgroup_cpu_ctl->init(); - if (ret.ok()) { - _cgroup_cpu_ctl = std::move(cgroup_cpu_ctl); - LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << tg_id; + std::unique_ptr cgroup_cpu_ctl = CgroupCpuCtl::create_cgroup_cpu_ctl(tg_id); + if (cgroup_cpu_ctl) { + Status ret = cgroup_cpu_ctl->init(); + if (ret.ok()) { + _cgroup_cpu_ctl = std::move(cgroup_cpu_ctl); + LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << tg_id; + } else { + LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id= " << tg_id + << ", reason=" << ret.to_string(); + } } else { - LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id= " << tg_id - << ", reason=" << ret.to_string(); + LOG(INFO) << "[upsert wg thread pool] create cgroup cpu ctl for " << tg_id << " failed"; } } @@ -515,7 +523,8 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e if (enable_cpu_hard_limit) { if (cpu_hard_limit > 0) { _cgroup_cpu_ctl->update_cpu_hard_limit(cpu_hard_limit); - _cgroup_cpu_ctl->update_cpu_soft_limit(CPU_SOFT_LIMIT_DEFAULT_VALUE); + _cgroup_cpu_ctl->update_cpu_soft_limit( + CgroupCpuCtl::cpu_soft_limit_default_value()); } else { LOG(INFO) << "[upsert wg thread pool] enable cpu hard limit but value is illegal: " << cpu_hard_limit << ", gid=" << tg_id; @@ -577,10 +586,7 @@ void WorkloadGroup::upsert_scan_io_throttle(WorkloadGroupInfo* tg_info) { _remote_scan_io_throttle->set_io_bytes_per_second(tg_info->remote_read_bytes_per_second); } -std::shared_ptr WorkloadGroup::get_scan_io_throttle(const std::string& disk_dir) { - if (disk_dir == io::FileReader::VIRTUAL_REMOTE_DATA_DIR) { - return _remote_scan_io_throttle; - } +std::shared_ptr WorkloadGroup::get_local_scan_io_throttle(const std::string& disk_dir) { auto find_ret = _scan_io_throttle_map.find(disk_dir); if (find_ret != _scan_io_throttle_map.end()) { return find_ret->second; @@ -588,6 +594,22 @@ std::shared_ptr WorkloadGroup::get_scan_io_throttle(const std::strin return nullptr; } +std::shared_ptr WorkloadGroup::get_remote_scan_io_throttle() { + return _remote_scan_io_throttle; +} + +void WorkloadGroup::update_cpu_adder(int64_t delta_cpu_time) { + (*_cpu_usage_adder) << (uint64_t)delta_cpu_time; +} + +void 
WorkloadGroup::update_total_local_scan_io_adder(size_t scan_bytes) { + (*_total_local_scan_io_adder) << scan_bytes; +} + +int64_t WorkloadGroup::get_remote_scan_bytes_per_second() { + return _remote_scan_io_throttle->get_bvar_io_per_second(); +} + void WorkloadGroup::try_stop_schedulers() { std::lock_guard wlock(_task_sched_lock); if (_task_sched) { diff --git a/be/src/runtime/workload_group/workload_group.h b/be/src/runtime/workload_group/workload_group.h index 7605b9a17d8cf22..3561098b6ce29c1 100644 --- a/be/src/runtime/workload_group/workload_group.h +++ b/be/src/runtime/workload_group/workload_group.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -77,6 +78,12 @@ class WorkloadGroup : public std::enable_shared_from_this { return _memory_limit; }; + int64_t weighted_memory_limit() const { return _weighted_memory_limit; }; + + void set_weighted_memory_limit(int64_t weighted_memory_limit) { + _weighted_memory_limit = weighted_memory_limit; + } + // make memory snapshots and refresh total memory used at the same time. int64_t make_memory_tracker_snapshots( std::list>* tracker_snapshots); @@ -93,13 +100,10 @@ class WorkloadGroup : public std::enable_shared_from_this { void set_weighted_memory_ratio(double ratio); bool add_wg_refresh_interval_memory_growth(int64_t size) { - // `weighted_mem_used` is a rough memory usage in this group, - // because we can only get a precise memory usage by MemTracker which is not include page cache. - auto weighted_mem_used = - int64_t((_total_mem_used + _wg_refresh_interval_memory_growth.load() + size) * - _weighted_mem_ratio); - if ((weighted_mem_used > ((double)_memory_limit * - _spill_high_watermark.load(std::memory_order_relaxed) / 100))) { + auto realtime_total_mem_used = _total_mem_used + _wg_refresh_interval_memory_growth.load(); + if ((realtime_total_mem_used > + ((double)_weighted_memory_limit * + _spill_high_watermark.load(std::memory_order_relaxed) / 100))) { return false; } else { _wg_refresh_interval_memory_growth.fetch_add(size); @@ -111,17 +115,13 @@ class WorkloadGroup : public std::enable_shared_from_this { } void check_mem_used(bool* is_low_wartermark, bool* is_high_wartermark) const { - // `weighted_mem_used` is a rough memory usage in this group, - // because we can only get a precise memory usage by MemTracker which is not include page cache. 
- auto weighted_mem_used = - int64_t((_total_mem_used + _wg_refresh_interval_memory_growth.load()) * - _weighted_mem_ratio); - *is_low_wartermark = - (weighted_mem_used > ((double)_memory_limit * - _spill_low_watermark.load(std::memory_order_relaxed) / 100)); - *is_high_wartermark = - (weighted_mem_used > ((double)_memory_limit * - _spill_high_watermark.load(std::memory_order_relaxed) / 100)); + auto realtime_total_mem_used = _total_mem_used + _wg_refresh_interval_memory_growth.load(); + *is_low_wartermark = (realtime_total_mem_used > + ((double)_weighted_memory_limit * + _spill_low_watermark.load(std::memory_order_relaxed) / 100)); + *is_high_wartermark = (realtime_total_mem_used > + ((double)_weighted_memory_limit * + _spill_high_watermark.load(std::memory_order_relaxed) / 100)); } std::string debug_string() const; @@ -190,20 +190,34 @@ class WorkloadGroup : public std::enable_shared_from_this { std::string thread_debug_info(); - std::shared_ptr get_scan_io_throttle(const std::string& disk_dir); + std::shared_ptr get_local_scan_io_throttle(const std::string& disk_dir); + + std::shared_ptr get_remote_scan_io_throttle(); void upsert_scan_io_throttle(WorkloadGroupInfo* tg_info); + void update_cpu_adder(int64_t delta_cpu_time); + + void update_total_local_scan_io_adder(size_t scan_bytes); + + int64_t get_mem_used() { return _mem_used_status->get_value(); } + uint64_t get_cpu_usage() { return _cpu_usage_per_second->get_value(); } + int64_t get_local_scan_bytes_per_second() { + return _total_local_scan_io_per_second->get_value(); + } + int64_t get_remote_scan_bytes_per_second(); + private: mutable std::shared_mutex _mutex; // lock _name, _version, _cpu_share, _memory_limit const uint64_t _id; std::string _name; int64_t _version; int64_t _memory_limit; // bytes + // `weighted_memory_limit` less than or equal to _memory_limit, calculate after exclude public memory. + // more detailed description in `refresh_wg_weighted_memory_limit`. + std::atomic _weighted_memory_limit {0}; // // last value of make_memory_tracker_snapshots, refresh every time make_memory_tracker_snapshots is called. std::atomic_int64_t _total_mem_used = 0; // bytes - // last value of refresh_wg_weighted_memory_ratio. 
- std::atomic _weighted_mem_ratio = 0.0; std::atomic_int64_t _wg_refresh_interval_memory_growth; bool _enable_memory_overcommit; std::atomic _cpu_share; @@ -232,6 +246,13 @@ class WorkloadGroup : public std::enable_shared_from_this { std::map> _scan_io_throttle_map; std::shared_ptr _remote_scan_io_throttle {nullptr}; + + // bvar metric + std::unique_ptr> _mem_used_status; + std::unique_ptr> _cpu_usage_adder; + std::unique_ptr>> _cpu_usage_per_second; + std::unique_ptr> _total_local_scan_io_adder; + std::unique_ptr>> _total_local_scan_io_per_second; }; using WorkloadGroupPtr = std::shared_ptr; diff --git a/be/src/runtime/workload_group/workload_group_manager.cpp b/be/src/runtime/workload_group/workload_group_manager.cpp index 9e595841c6770c6..314a2b87841e1d1 100644 --- a/be/src/runtime/workload_group/workload_group_manager.cpp +++ b/be/src/runtime/workload_group/workload_group_manager.cpp @@ -28,6 +28,7 @@ #include "util/mem_info.h" #include "util/threadpool.h" #include "util/time.h" +#include "vec/core/block.h" #include "vec/exec/scan/scanner_scheduler.h" namespace doris { @@ -91,7 +92,8 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set used_wg_i } // wg is shutdown and running rum = 0, its resource can be released in BE if (workload_group_ptr->can_be_dropped()) { - LOG(INFO) << "[topic_publish_wg]There is no query in wg" << wg_id << ", delete it."; + LOG(INFO) << "[topic_publish_wg]There is no query in wg " << wg_id + << ", delete it."; deleted_task_groups.push_back(workload_group_ptr); } } @@ -120,30 +122,16 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set used_wg_i // Using cgdelete has no such issue. { if (config::doris_cgroup_cpu_path != "") { - std::lock_guard write_lock(_init_cg_ctl_lock); - if (!_cg_cpu_ctl) { - _cg_cpu_ctl = std::make_unique(); - } - if (!_is_init_succ) { - Status ret = _cg_cpu_ctl->init(); - if (ret.ok()) { - _is_init_succ = true; - } else { - LOG(INFO) << "[topic_publish_wg]init workload group mgr cpu ctl failed, " - << ret.to_string(); - } - } - if (_is_init_succ) { - Status ret = _cg_cpu_ctl->delete_unused_cgroup_path(used_wg_id); - if (!ret.ok()) { - LOG(WARNING) << "[topic_publish_wg]" << ret.to_string(); - } + std::lock_guard write_lock(_clear_cgroup_lock); + Status ret = CgroupCpuCtl::delete_unused_cgroup_path(used_wg_id); + if (!ret.ok()) { + LOG(WARNING) << "[topic_publish_wg]" << ret.to_string(); } } } int64_t time_cost_ms = MonotonicMillis() - begin_time; LOG(INFO) << "[topic_publish_wg]finish clear unused workload group, time cost: " << time_cost_ms - << "ms, deleted group size:" << deleted_task_groups.size() + << " ms, deleted group size:" << deleted_task_groups.size() << ", before wg size=" << old_wg_size << ", after wg size=" << new_wg_size; } @@ -153,86 +141,101 @@ struct WorkloadGroupMemInfo { std::list>(); }; -void WorkloadGroupMgr::refresh_wg_weighted_memory_ratio() { +void WorkloadGroupMgr::refresh_wg_weighted_memory_limit() { std::shared_lock r_lock(_group_mutex); // 1. make all workload groups memory snapshots(refresh workload groups total memory used at the same time) // and calculate total memory used of all queries. 
- int64_t all_queries_mem_used = 0; + int64_t all_workload_groups_mem_usage = 0; std::unordered_map<uint64_t, WorkloadGroupMemInfo> wgs_mem_info; for (auto& [wg_id, wg] : _workload_groups) { wgs_mem_info[wg_id].total_mem_used = wg->make_memory_tracker_snapshots(&wgs_mem_info[wg_id].tracker_snapshots); - all_queries_mem_used += wgs_mem_info[wg_id].total_mem_used; + all_workload_groups_mem_usage += wgs_mem_info[wg_id].total_mem_used; } - if (all_queries_mem_used <= 0) { + if (all_workload_groups_mem_usage <= 0) { return; } - // 2. calculate weighted ratio. - // process memory used is actually bigger than all_queries_mem_used, - // because memory of page cache, allocator cache, segment cache etc. are included - // in proc_vm_rss. - // we count these cache memories equally on workload groups. + // 2. calculate the weighted memory limit ratio. + // when constructing a workload group, mem_limit is equal to (process_memory_limit * group_limit_percent); + // this assumes that the memory available to workload groups is equal to process_memory_limit. + // + // but process_memory_usage is actually bigger than all_workload_groups_mem_usage, + // because public_memory such as page cache, allocator cache, segment cache etc. is included in process_memory_usage, + // so the memory actually available to the workload groups is equal to (process_memory_limit - public_memory). + // + // we exclude this public_memory when calculating the workload group mem_limit, + // so a ratio is calculated and multiplied into the workload group mem_limit from the previous construction. auto process_memory_usage = GlobalMemoryArbitrator::process_memory_usage(); - all_queries_mem_used = std::min(process_memory_usage, all_queries_mem_used); - double ratio = (double)process_memory_usage / (double)all_queries_mem_used; - if (ratio <= 1.25) { - std::string debug_msg = - fmt::format("\nProcess Memory Summary: {}, {}, all quries mem: {}", - doris::GlobalMemoryArbitrator::process_memory_used_details_str(), - doris::GlobalMemoryArbitrator::sys_mem_available_details_str(), - PrettyPrinter::print(all_queries_mem_used, TUnit::BYTES)); - LOG_EVERY_T(INFO, 10) << debug_msg; + auto process_memory_limit = MemInfo::mem_limit(); + double weighted_memory_limit_ratio = 1; + // if all_workload_groups_mem_usage is greater than process_memory_usage, it means that the memory statistics + // of the workload groups are inaccurate. + // the reason is that what query/load/etc. track is virtual memory, and that virtual memory may not be in use yet. + // + // At this time, weighted_memory_limit_ratio is equal to 1, and workload group mem_limit is still equal to + // (process_memory_limit * group_limit_percent); this may cause query spill to occur earlier. + // However, there is no good solution at present, because we cannot predict when this virtual memory will be used. 
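Note on the ratio described above, a worked instance with illustrative numbers (the free-function form is a sketch; the patch computes this inline in refresh_wg_weighted_memory_limit()):

#include <cstdint>

double weighted_memory_limit_ratio(int64_t process_memory_limit,
                                   int64_t process_memory_usage,
                                   int64_t all_workload_groups_mem_usage) {
    // Tracked group memory above the process usage means the group statistics
    // (virtual memory) are unreliable; keep the ratio at 1 in that case.
    if (all_workload_groups_mem_usage >= process_memory_usage) {
        return 1.0;
    }
    int64_t public_memory = process_memory_usage - all_workload_groups_mem_usage;
    return 1.0 - static_cast<double>(public_memory) / static_cast<double>(process_memory_limit);
}

// e.g. process_memory_limit = 100 GB, process_memory_usage = 60 GB, and
// all_workload_groups_mem_usage = 45 GB give public_memory = 15 GB and
// ratio = 1 - 15/100 = 0.85, so a group configured with memory_limit = 30 GB
// gets a weighted limit of 25.5 GB.
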
+ if (all_workload_groups_mem_usage < process_memory_usage) { + int64_t public_memory = process_memory_usage - all_workload_groups_mem_usage; + weighted_memory_limit_ratio = 1 - (double)public_memory / (double)process_memory_limit; } + std::string debug_msg = fmt::format( + "\nProcess Memory Summary: {}, {}, all workload groups memory usage: {}, " + "weighted_memory_limit_ratio: {}", + doris::GlobalMemoryArbitrator::process_memory_used_details_str(), + doris::GlobalMemoryArbitrator::sys_mem_available_details_str(), + PrettyPrinter::print(all_workload_groups_mem_usage, TUnit::BYTES), + weighted_memory_limit_ratio); + LOG_EVERY_T(INFO, 10) << debug_msg; + for (auto& wg : _workload_groups) { - // 3.1 calculate query weighted memory limit of task group - auto wg_mem_limit = wg.second->memory_limit(); - auto wg_query_count = wgs_mem_info[wg.first].tracker_snapshots.size(); - int64_t query_weighted_mem_limit = - wg_query_count ? (wg_mem_limit + wg_query_count) / wg_query_count : wg_mem_limit; + // 3.1 calculate query spill threshold of task group + auto wg_weighted_mem_limit = + int64_t(wg.second->memory_limit() * weighted_memory_limit_ratio); + wg.second->set_weighted_memory_limit(wg_weighted_mem_limit); - // 3.2 set all workload groups weighted memory ratio and all query weighted memory limit and ratio. - wg.second->set_weighted_memory_ratio(ratio); + // 3.2 set workload groups weighted memory limit and all query spill threshold. + auto wg_query_count = wgs_mem_info[wg.first].tracker_snapshots.size(); + int64_t query_spill_threshold = + wg_query_count ? (wg_weighted_mem_limit + wg_query_count) / wg_query_count + : wg_weighted_mem_limit; for (const auto& query : wg.second->queries()) { auto query_ctx = query.second.lock(); if (!query_ctx) { continue; } - query_ctx->set_weighted_memory(query_weighted_mem_limit, ratio); + query_ctx->set_spill_threshold(query_spill_threshold); } // 3.3 only print debug logs, if workload groups is_high_wartermark or is_low_wartermark. 
- auto weighted_mem_used = int64_t(wgs_mem_info[wg.first].total_mem_used * ratio); - bool is_high_wartermark = - (weighted_mem_used > - ((double)wg_mem_limit * wg.second->spill_threashold_high_water_mark() / 100)); - bool is_low_wartermark = - (weighted_mem_used > - ((double)wg_mem_limit * wg.second->spill_threshold_low_water_mark() / 100)); + bool is_low_wartermark = false; + bool is_high_wartermark = false; + wg.second->check_mem_used(&is_low_wartermark, &is_high_wartermark); std::string debug_msg; if (is_high_wartermark || is_low_wartermark) { debug_msg = fmt::format( - "\nWorkload Group {}: mem limit: {}, mem used: {}, weighted mem used: {}, used " - "ratio: {}, query " - "count: {}, query_weighted_mem_limit: {}", - wg.second->name(), PrettyPrinter::print(wg_mem_limit, TUnit::BYTES), + "\nWorkload Group {}: mem limit: {}, mem used: {}, weighted mem limit: {}, " + "used " + "ratio: {}, query count: {}, query spill threshold: {}", + wg.second->name(), + PrettyPrinter::print(wg.second->memory_limit(), TUnit::BYTES), PrettyPrinter::print(wgs_mem_info[wg.first].total_mem_used, TUnit::BYTES), - PrettyPrinter::print(weighted_mem_used, TUnit::BYTES), - (double)weighted_mem_used / wg_mem_limit, wg_query_count, - PrettyPrinter::print(query_weighted_mem_limit, TUnit::BYTES)); + PrettyPrinter::print(wg_weighted_mem_limit, TUnit::BYTES), + (double)wgs_mem_info[wg.first].total_mem_used / wg_weighted_mem_limit, + wg_query_count, PrettyPrinter::print(query_spill_threshold, TUnit::BYTES)); debug_msg += "\n Query Memory Summary:"; // check whether queries need to revoke memory for task group for (const auto& query_mem_tracker : wgs_mem_info[wg.first].tracker_snapshots) { debug_msg += fmt::format( - "\n MemTracker Label={}, Parent Label={}, Used={}, WeightedUsed={}, " + "\n MemTracker Label={}, Parent Label={}, Used={}, SpillThreshold={}, " "Peak={}", query_mem_tracker->label(), query_mem_tracker->parent_label(), PrettyPrinter::print(query_mem_tracker->consumption(), TUnit::BYTES), - PrettyPrinter::print(int64_t(query_mem_tracker->consumption() * ratio), - TUnit::BYTES), + PrettyPrinter::print(query_spill_threshold, TUnit::BYTES), PrettyPrinter::print(query_mem_tracker->peak_consumption(), TUnit::BYTES)); } LOG_EVERY_T(INFO, 1) << debug_msg; @@ -242,6 +245,52 @@ void WorkloadGroupMgr::refresh_wg_weighted_memory_ratio() { } } +void WorkloadGroupMgr::get_wg_resource_usage(vectorized::Block* block) { + auto insert_int_value = [&](int col_index, int64_t int_val, vectorized::Block* block) { + vectorized::MutableColumnPtr mutable_col_ptr; + mutable_col_ptr = std::move(*block->get_by_position(col_index).column).assume_mutable(); + auto* nullable_column = + reinterpret_cast(mutable_col_ptr.get()); + vectorized::IColumn* col_ptr = &nullable_column->get_nested_column(); + reinterpret_cast*>(col_ptr)->insert_value( + int_val); + nullable_column->get_null_map_data().emplace_back(0); + }; + + auto insert_double_value = [&](int col_index, double double_val, vectorized::Block* block) { + vectorized::MutableColumnPtr mutable_col_ptr; + mutable_col_ptr = std::move(*block->get_by_position(col_index).column).assume_mutable(); + auto* nullable_column = + reinterpret_cast(mutable_col_ptr.get()); + vectorized::IColumn* col_ptr = &nullable_column->get_nested_column(); + reinterpret_cast*>(col_ptr)->insert_value( + double_val); + nullable_column->get_null_map_data().emplace_back(0); + }; + + int64_t be_id = ExecEnv::GetInstance()->master_info()->backend_id; + int cpu_num = CpuInfo::num_cores(); + cpu_num = cpu_num <= 0 ? 
1 : cpu_num; + uint64_t total_cpu_time_ns_per_second = cpu_num * 1000000000ll; + + std::shared_lock r_lock(_group_mutex); + block->reserve(_workload_groups.size()); + for (const auto& [id, wg] : _workload_groups) { + insert_int_value(0, be_id, block); + insert_int_value(1, wg->id(), block); + insert_int_value(2, wg->get_mem_used(), block); + + double cpu_usage_p = + (double)wg->get_cpu_usage() / (double)total_cpu_time_ns_per_second * 100; + cpu_usage_p = std::round(cpu_usage_p * 100.0) / 100.0; + + insert_double_value(3, cpu_usage_p, block); + + insert_int_value(4, wg->get_local_scan_bytes_per_second(), block); + insert_int_value(5, wg->get_remote_scan_bytes_per_second(), block); + } +} + void WorkloadGroupMgr::stop() { for (auto iter = _workload_groups.begin(); iter != _workload_groups.end(); iter++) { iter->second->try_stop_schedulers(); diff --git a/be/src/runtime/workload_group/workload_group_manager.h b/be/src/runtime/workload_group/workload_group_manager.h index 37539ada8d85e66..d8547c3383e219d 100644 --- a/be/src/runtime/workload_group/workload_group_manager.h +++ b/be/src/runtime/workload_group/workload_group_manager.h @@ -27,6 +27,10 @@ namespace doris { class CgroupCpuCtl; +namespace vectorized { +class Block; +} // namespace vectorized + namespace pipeline { class TaskScheduler; class MultiCoreTaskQueue; @@ -54,15 +58,15 @@ class WorkloadGroupMgr { bool enable_cpu_hard_limit() { return _enable_cpu_hard_limit.load(); } - void refresh_wg_weighted_memory_ratio(); + void refresh_wg_weighted_memory_limit(); + + void get_wg_resource_usage(vectorized::Block* block); private: std::shared_mutex _group_mutex; std::unordered_map _workload_groups; - std::shared_mutex _init_cg_ctl_lock; - std::unique_ptr _cg_cpu_ctl; - bool _is_init_succ = false; + std::shared_mutex _clear_cgroup_lock; }; } // namespace doris diff --git a/be/src/runtime/workload_management/io_throttle.cpp b/be/src/runtime/workload_management/io_throttle.cpp index 3a8256eee3746dd..dacfa29012f59fe 100644 --- a/be/src/runtime/workload_management/io_throttle.cpp +++ b/be/src/runtime/workload_management/io_throttle.cpp @@ -17,12 +17,19 @@ #include "runtime/workload_management/io_throttle.h" +#include "util/defer_op.h" #include "util/time.h" namespace doris { +IOThrottle::IOThrottle(std::string prefix, std::string name) { + _io_adder = std::make_unique>(prefix, name); + _io_adder_per_second = std::make_unique>>( + prefix, name + "_per_second", _io_adder.get(), 1); +} + bool IOThrottle::acquire(int64_t block_timeout_ms) { - if (_io_bytes_per_second < 0) { + if (_io_bytes_per_second_limit < 0) { return true; } @@ -42,7 +49,7 @@ bool IOThrottle::acquire(int64_t block_timeout_ms) { } bool IOThrottle::try_acquire() { - if (_io_bytes_per_second < 0) { + if (_io_bytes_per_second_limit < 0) { return true; } std::unique_lock w_lock(_mutex); @@ -50,24 +57,31 @@ bool IOThrottle::try_acquire() { } void IOThrottle::update_next_io_time(int64_t io_bytes) { - if (_io_bytes_per_second <= 0 || io_bytes <= 0) { + Defer defer {[&]() { + if (io_bytes > 0) { + (*_io_adder) << io_bytes; + } + }}; + if (_io_bytes_per_second_limit <= 0 || io_bytes <= 0) { return; } - int64_t read_bytes_per_second = _io_bytes_per_second; - std::unique_lock w_lock(_mutex); - double io_bytes_float = static_cast(io_bytes); - double ret = (io_bytes_float / static_cast(read_bytes_per_second)) * - static_cast(MICROS_PER_SEC); - int64_t current_time = GetCurrentTimeMicros(); + int64_t read_bytes_per_second = _io_bytes_per_second_limit; + { + std::unique_lock w_lock(_mutex); + 
double io_bytes_float = static_cast(io_bytes); + double ret = (io_bytes_float / static_cast(read_bytes_per_second)) * + static_cast(MICROS_PER_SEC); + int64_t current_time = GetCurrentTimeMicros(); - if (current_time > _next_io_time_micros) { - _next_io_time_micros = current_time; + if (current_time > _next_io_time_micros) { + _next_io_time_micros = current_time; + } + _next_io_time_micros += ret < 1 ? static_cast(1) : static_cast(ret); } - _next_io_time_micros += ret < 1 ? static_cast(1) : static_cast(ret); } void IOThrottle::set_io_bytes_per_second(int64_t io_bytes_per_second) { - _io_bytes_per_second = io_bytes_per_second; + _io_bytes_per_second_limit = io_bytes_per_second; } }; // namespace doris \ No newline at end of file diff --git a/be/src/runtime/workload_management/io_throttle.h b/be/src/runtime/workload_management/io_throttle.h index 691255d23c48c47..ce62c65d7a9eeb2 100644 --- a/be/src/runtime/workload_management/io_throttle.h +++ b/be/src/runtime/workload_management/io_throttle.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -25,16 +26,9 @@ namespace doris { -class IOThrottle; - -struct IOThrottleCtx { - IOThrottle* io_throttle = nullptr; - int io_block_timeout; -}; - class IOThrottle { public: - IOThrottle() = default; + IOThrottle(std::string prefix, std::string name); ~IOThrottle() = default; @@ -47,12 +41,16 @@ class IOThrottle { void set_io_bytes_per_second(int64_t read_bytes_per_second); - int64_t get_io_bytes_per_second() { return _io_bytes_per_second; } + size_t get_bvar_io_per_second() { return _io_adder_per_second->get_value(); } private: std::mutex _mutex; std::condition_variable wait_condition; int64_t _next_io_time_micros {0}; - std::atomic _io_bytes_per_second {10485760}; + std::atomic _io_bytes_per_second_limit {10485760}; + + // bvar monitor + std::unique_ptr> _io_adder; + std::unique_ptr>> _io_adder_per_second; }; }; // namespace doris \ No newline at end of file diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp index 9b63439a63425a0..aa29661da022080 100644 --- a/be/src/service/backend_service.cpp +++ b/be/src/service/backend_service.cpp @@ -653,7 +653,8 @@ Status BaseBackendService::start_plan_fragment_execution( if (!exec_params.fragment.__isset.output_sink) { return Status::InternalError("missing sink in plan fragment"); } - return _exec_env->fragment_mgr()->exec_plan_fragment(exec_params); + return _exec_env->fragment_mgr()->exec_plan_fragment(exec_params, + QuerySource::INTERNAL_FRONTEND); } void BaseBackendService::cancel_plan_fragment(TCancelPlanFragmentResult& return_val, diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 8ca86d4c5750e48..dcc7625986829cf 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -67,7 +67,6 @@ #include "common/signal_handler.h" #include "common/status.h" #include "io/cache/block_file_cache_factory.h" -#include "io/fs/local_file_reader.h" #include "olap/options.h" #include "olap/storage_engine.h" #include "runtime/exec_env.h" @@ -528,7 +527,6 @@ int main(int argc, char** argv) { doris::ThreadLocalHandle::create_thread_local_if_not_exits(); - doris::io::BeConfDataDirReader::init_be_conf_data_dir(paths, spill_paths); // init exec env auto* exec_env(doris::ExecEnv::GetInstance()); status = doris::ExecEnv::init(doris::ExecEnv::GetInstance(), paths, spill_paths, broken_paths); @@ -604,6 +602,8 @@ int main(int argc, char** argv) { stop_work_if_error( status, "Arrow Flight Service did not start correctly, exiting, " + 
status.to_string()); + exec_env->storage_engine().notify_listeners(); + while (!doris::k_doris_exit) { #if defined(LEAK_SANITIZER) __lsan_do_leak_check(); diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index b851302aaa830eb..9522f23e3bd70ea 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -156,10 +156,10 @@ Status HttpService::start() { HealthAction* health_action = _pool.add(new HealthAction()); _ev_http_server->register_handler(HttpMethod::GET, "/api/health", health_action); - // Dump all running pipeline tasks - ClearDataCacheAction* clear_data_cache_action = _pool.add(new ClearDataCacheAction()); - _ev_http_server->register_handler(HttpMethod::GET, "/api/clear_data_cache", - clear_data_cache_action); + // Clear cache action + ClearCacheAction* clear_cache_action = _pool.add(new ClearCacheAction()); + _ev_http_server->register_handler(HttpMethod::GET, "/api/clear_cache/{type}", + clear_cache_action); // Dump all running pipeline tasks PipelineTaskAction* pipeline_task_action = _pool.add(new PipelineTaskAction()); diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 1fd8c681881be34..c2251c240ae647f 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -514,9 +514,11 @@ Status PInternalService::_exec_plan_fragment_impl( RETURN_IF_ERROR(deserialize_thrift_msg(buf, &len, compact, &t_request)); } if (cb) { - return _exec_env->fragment_mgr()->exec_plan_fragment(t_request, cb); + return _exec_env->fragment_mgr()->exec_plan_fragment( + t_request, QuerySource::INTERNAL_FRONTEND, cb); } else { - return _exec_env->fragment_mgr()->exec_plan_fragment(t_request); + return _exec_env->fragment_mgr()->exec_plan_fragment(t_request, + QuerySource::INTERNAL_FRONTEND); } } else if (version == PFragmentRequestVersion::VERSION_2) { TExecPlanFragmentParamsList t_request; @@ -531,9 +533,11 @@ Status PInternalService::_exec_plan_fragment_impl( for (const TExecPlanFragmentParams& params : t_request.paramsList) { if (cb) { - RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params, cb)); + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment( + params, QuerySource::INTERNAL_FRONTEND, cb)); } else { - RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params)); + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment( + params, QuerySource::INTERNAL_FRONTEND)); } } @@ -562,9 +566,11 @@ Status PInternalService::_exec_plan_fragment_impl( timer.start(); for (const TPipelineFragmentParams& fragment : fragment_list) { if (cb) { - RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(fragment, cb)); + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment( + fragment, QuerySource::INTERNAL_FRONTEND, cb)); } else { - RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(fragment)); + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment( + fragment, QuerySource::INTERNAL_FRONTEND)); } } timer.stop(); @@ -658,7 +664,7 @@ void PInternalService::outfile_write_success(google::protobuf::RpcController* co uint32_t len = request->result_file_sink().size(); st = deserialize_thrift_msg(buf, &len, false, &result_file_sink); if (!st.ok()) { - LOG(WARNING) << "outfile write success filefailed, errmsg=" << st; + LOG(WARNING) << "outfile write success file failed, errmsg = " << st; st.to_protobuf(result->mutable_status()); return; } @@ -677,7 +683,7 @@ void 
PInternalService::outfile_write_success(google::protobuf::RpcController* co bool exists = true; st = io::global_local_filesystem()->exists(file_name, &exists); if (!st.ok()) { - LOG(WARNING) << "outfile write success filefailed, errmsg=" << st; + LOG(WARNING) << "outfile write success file failed, errmsg = " << st; st.to_protobuf(result->mutable_status()); return; } @@ -685,7 +691,7 @@ void PInternalService::outfile_write_success(google::protobuf::RpcController* co st = Status::InternalError("File already exists: {}", file_name); } if (!st.ok()) { - LOG(WARNING) << "outfile write success filefailed, errmsg=" << st; + LOG(WARNING) << "outfile write success file failed, errmsg = " << st; st.to_protobuf(result->mutable_status()); return; } @@ -2030,7 +2036,10 @@ void PInternalService::group_commit_insert(google::protobuf::RpcController* cont TUniqueId load_id; load_id.__set_hi(request->load_id().hi()); load_id.__set_lo(request->load_id().lo()); - bool ret = _light_work_pool.try_offer([this, request, response, done, load_id]() { + std::shared_ptr<std::mutex> lock = std::make_shared<std::mutex>(); + std::shared_ptr<bool> is_done = std::make_shared<bool>(false); + bool ret = _light_work_pool.try_offer([this, request, response, done, load_id, lock, + is_done]() { brpc::ClosureGuard closure_guard(done); std::shared_ptr<StreamLoadContext> ctx = std::make_shared<StreamLoadContext>(_exec_env); auto pipe = std::make_shared( @@ -2044,7 +2053,13 @@ void PInternalService::group_commit_insert(google::protobuf::RpcController* cont request->exec_plan_fragment_request().request(), request->exec_plan_fragment_request().version(), request->exec_plan_fragment_request().compact(), - [&, response, done, load_id](RuntimeState* state, Status* status) { + [&, response, done, load_id, lock, is_done](RuntimeState* state, + Status* status) { + std::lock_guard lock1(*lock); + if (*is_done) { + return; + } + *is_done = true; brpc::ClosureGuard cb_closure_guard(done); response->set_label(state->import_label()); response->set_txn_id(state->wal_id()); @@ -2064,7 +2079,16 @@ void PInternalService::group_commit_insert(google::protobuf::RpcController* cont "_exec_plan_fragment_impl meet unknown error"); } if (!st.ok()) { - LOG(WARNING) << "exec plan fragment failed, errmsg=" << st; + LOG(WARNING) << "exec plan fragment failed, load_id=" << print_id(load_id) + << ", errmsg=" << st; + std::lock_guard lock1(*lock); + if (*is_done) { + closure_guard.release(); + } else { + *is_done = true; + st.to_protobuf(response->mutable_status()); + _exec_env->new_load_stream_mgr()->remove(load_id); + } } else { closure_guard.release(); for (int i = 0; i < request->data().size(); ++i) { diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index 439860909e8984c..2faaf53d5e133d9 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -20,6 +20,8 @@ #include #include #include + +#include // Only used on x86 or x86_64 #if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || \ defined(__i386) || defined(_M_IX86) @@ -951,6 +953,8 @@ class ZstdBlockCompression : public BlockCompressionCodec { if (max_len <= MAX_COMPRESSION_BUFFER_SIZE_FOR_REUSE) { output->assign_copy(reinterpret_cast(compressed_buf.data), out_buf.pos); } + } catch (std::exception& e) { + return Status::InternalError("Fail to do ZSTD compress due to exception {}", e.what()); + } catch (...) 
{ // Do not set compress_failed to release context DCHECK(!compress_failed); diff --git a/be/src/util/brpc_client_cache.cpp b/be/src/util/brpc_client_cache.cpp index b9135e8014dc7d9..c5a6488787879b0 100644 --- a/be/src/util/brpc_client_cache.cpp +++ b/be/src/util/brpc_client_cache.cpp @@ -25,12 +25,23 @@ namespace doris { DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(brpc_endpoint_stub_count, MetricUnit::NOUNIT); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(brpc_stream_endpoint_stub_count, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(brpc_function_endpoint_stub_count, MetricUnit::NOUNIT); template <> -BrpcClientCache::BrpcClientCache() { - REGISTER_HOOK_METRIC(brpc_endpoint_stub_count, [this]() { return _stub_map.size(); }); +BrpcClientCache::BrpcClientCache(std::string protocol, + std::string connection_type, + std::string connection_group) + : _protocol(protocol), + _connection_type(connection_type), + _connection_group(connection_group) { + if (connection_group == "streaming") { + REGISTER_HOOK_METRIC(brpc_stream_endpoint_stub_count, + [this]() { return _stub_map.size(); }); + } else { + REGISTER_HOOK_METRIC(brpc_endpoint_stub_count, [this]() { return _stub_map.size(); }); + } } template <> @@ -39,7 +50,12 @@ BrpcClientCache::~BrpcClientCache() { } template <> -BrpcClientCache::BrpcClientCache() { +BrpcClientCache::BrpcClientCache(std::string protocol, + std::string connection_type, + std::string connection_group) + : _protocol(protocol), + _connection_type(connection_type), + _connection_group(connection_group) { REGISTER_HOOK_METRIC(brpc_function_endpoint_stub_count, [this]() { return _stub_map.size(); }); } diff --git a/be/src/util/brpc_client_cache.h b/be/src/util/brpc_client_cache.h index ebef80f4a6bdfb8..09c92fb398e0854 100644 --- a/be/src/util/brpc_client_cache.h +++ b/be/src/util/brpc_client_cache.h @@ -59,7 +59,8 @@ namespace doris { template class BrpcClientCache { public: - BrpcClientCache(); + BrpcClientCache(std::string protocol = "baidu_std", std::string connection_type = "", + std::string connection_group = ""); virtual ~BrpcClientCache(); std::shared_ptr get_client(const butil::EndPoint& endpoint) { @@ -110,20 +111,24 @@ class BrpcClientCache { } std::shared_ptr get_new_client_no_cache(const std::string& host_port, - const std::string& protocol = "baidu_std", - const std::string& connect_type = "", + const std::string& protocol = "", + const std::string& connection_type = "", const std::string& connection_group = "") { brpc::ChannelOptions options; - if constexpr (std::is_same_v) { - options.protocol = config::function_service_protocol; - } else { + if (protocol != "") { options.protocol = protocol; + } else if (_protocol != "") { + options.protocol = _protocol; } - if (connect_type != "") { - options.connection_type = connect_type; + if (connection_type != "") { + options.connection_type = connection_type; + } else if (_connection_type != "") { + options.connection_type = _connection_type; } if (connection_group != "") { options.connection_group = connection_group; + } else if (_connection_group != "") { + options.connection_group = _connection_group; } options.connect_timeout_ms = 2000; options.timeout_ms = 2000; @@ -204,6 +209,9 @@ class BrpcClientCache { private: StubMap _stub_map; + const std::string _protocol; + const std::string _connection_type; + const std::string _connection_group; }; using InternalServiceClientCache = BrpcClientCache; diff --git a/be/src/util/byte_buffer.h b/be/src/util/byte_buffer.h index aab8fd42db6e3b4..e8eadf69e028b57 100644 --- 
a/be/src/util/byte_buffer.h +++ b/be/src/util/byte_buffer.h @@ -23,19 +23,27 @@ #include #include "common/logging.h" +#include "common/status.h" +#include "vec/common/allocator.h" +#include "vec/common/allocator_fwd.h" namespace doris { struct ByteBuffer; using ByteBufferPtr = std::shared_ptr; -struct ByteBuffer { +struct ByteBuffer : private Allocator { static ByteBufferPtr allocate(size_t size) { ByteBufferPtr ptr(new ByteBuffer(size)); return ptr; } - ~ByteBuffer() { delete[] ptr; } + static Status create_and_allocate(ByteBufferPtr& ptr, size_t size) { + ptr = ByteBufferPtr(new ByteBuffer(size)); + return Status::OK(); + } + + ~ByteBuffer() { Allocator::free(ptr, capacity); } void put_bytes(const char* data, size_t size) { memcpy(ptr + pos, data, size); @@ -56,14 +64,15 @@ struct ByteBuffer { size_t remaining() const { return limit - pos; } bool has_remaining() const { return limit > pos; } - char* const ptr; + char* ptr; size_t pos; size_t limit; size_t capacity; private: - ByteBuffer(size_t capacity_) - : ptr(new char[capacity_]), pos(0), limit(capacity_), capacity(capacity_) {} + ByteBuffer(size_t capacity_) : pos(0), limit(capacity_), capacity(capacity_) { + ptr = reinterpret_cast(Allocator::alloc(capacity_)); + } }; } // namespace doris diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index 9ad78696a6f12c3..8f64fe699c60625 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -18,10 +18,7 @@ #include "util/cgroup_util.h" #include -#include #include -#include -#include #include #include @@ -40,14 +37,33 @@ using std::pair; namespace doris { -Status CGroupUtil::find_global_cgroup(const string& subsystem, string* path) { +bool CGroupUtil::cgroupsv1_enable() { + bool exists = true; + Status st = io::global_local_filesystem()->exists("/proc/cgroups", &exists); + return st.ok() && exists; +} + +bool CGroupUtil::cgroupsv2_enable() { +#if defined(OS_LINUX) + // This file exists iff the host has cgroups v2 enabled. + auto controllers_file = default_cgroups_mount / "cgroup.controllers"; + bool exists = true; + Status st = io::global_local_filesystem()->exists(controllers_file, &exists); + return st.ok() && exists; +#else + return false; +#endif +} + +Status CGroupUtil::find_global_cgroupv1(const string& subsystem, string* path) { std::ifstream proc_cgroups("/proc/self/cgroup", std::ios::in); string line; while (true) { if (proc_cgroups.fail()) { - return Status::IOError("Error reading /proc/self/cgroup: {}", get_str_err_msg()); + return Status::CgroupError("Error reading /proc/self/cgroup: {}", get_str_err_msg()); } else if (proc_cgroups.peek() == std::ifstream::traits_type::eof()) { - return Status::NotFound("Could not find subsystem {} in /proc/self/cgroup", subsystem); + return Status::CgroupError("Could not find subsystem {} in /proc/self/cgroup", + subsystem); } // The line format looks like this: // 4:memory:/user.slice @@ -82,32 +98,15 @@ static Status unescape_path(const string& escaped, string* unescaped) { return Status::OK(); } -static Status read_cgroup_value(const string& limit_file_path, int64_t* val) { - std::ifstream limit_file(limit_file_path, std::ios::in); - string line; - getline(limit_file, line); - if (limit_file.fail() || limit_file.bad()) { - return Status::IOError("Error reading {}: {}", limit_file_path, get_str_err_msg()); - } - StringParser::ParseResult pr; - // Parse into an int64_t If it overflows, returning the max value of int64_t is ok because that - // is effectively unlimited. 
- *val = StringParser::string_to_int(line.c_str(), line.size(), &pr); - if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)) { - return Status::InvalidArgument("Failed to parse {} as int64: '{}'", limit_file_path, line); - } - return Status::OK(); -} - -Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair* result) { +Status CGroupUtil::find_cgroupv1_mounts(const string& subsystem, pair* result) { std::ifstream mountinfo("/proc/self/mountinfo", std::ios::in); string line; while (true) { if (mountinfo.fail() || mountinfo.bad()) { - return Status::IOError("Error reading /proc/self/mountinfo: {}", get_str_err_msg()); + return Status::CgroupError("Error reading /proc/self/mountinfo: {}", get_str_err_msg()); } else if (mountinfo.eof()) { - return Status::NotFound("Could not find subsystem {} in /proc/self/mountinfo", - subsystem); + return Status::CgroupError("Could not find subsystem {} in /proc/self/mountinfo", + subsystem); } // The relevant lines look like below (see proc manpage for full documentation). The // first example is running outside of a container, the second example is running @@ -118,14 +117,18 @@ Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair fields = Split(line, " ", SkipWhitespace()); if (fields.size() < 7) { return Status::InvalidArgument( "Could not parse line from /proc/self/mountinfo - had {} > 7 tokens: '{}'", fields.size(), line); } - if (fields[fields.size() - 3] != "cgroup") continue; + if (fields[fields.size() - 3] != "cgroup") { + continue; + } // This is a cgroup mount. Check if it's the mount we're looking for. std::vector cgroup_opts = Split(fields[fields.size() - 1], ",", SkipWhitespace()); auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem); @@ -138,16 +141,21 @@ Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair paths; - RETURN_IF_ERROR(find_cgroup_mounts(subsystem, &paths)); + RETURN_IF_ERROR(find_cgroupv1_mounts(subsystem, &paths)); const string& mount_path = paths.first; const string& system_path = paths.second; if (path->compare(0, system_path.size(), system_path) != 0) { @@ -158,98 +166,102 @@ Status CGroupUtil::find_abs_cgroup_path(const string& subsystem, string* path) { return Status::OK(); } -Status CGroupUtil::find_cgroup_mem_limit(int64_t* bytes) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); +std::string CGroupUtil::cgroupv2_of_process() { +#if defined(OS_LINUX) + if (!cgroupsv2_enable()) { + return ""; + } + // All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs + // A simpler way to get the membership is: + std::ifstream cgroup_name_file("/proc/self/cgroup"); + if (!cgroup_name_file.is_open()) { + return ""; + } + // With cgroups v2, there will be a *single* line with prefix "0::/" + // (see https://docs.kernel.org/admin-guide/cgroup-v2.html) + std::string cgroup; + std::getline(cgroup_name_file, cgroup); + static const std::string v2_prefix = "0::/"; + if (!cgroup.starts_with(v2_prefix)) { + return ""; } - string cgroup_path; - RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path)); - string limit_file_path = cgroup_path + "/memory.limit_in_bytes"; - return read_cgroup_value(limit_file_path, bytes); + cgroup = cgroup.substr(v2_prefix.length()); + return cgroup; +#else + return ""; +#endif } -Status CGroupUtil::find_cgroup_mem_usage(int64_t* bytes) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); +std::optional CGroupUtil::get_cgroupsv2_path(const 
std::string& subsystem) { +#if defined(OS_LINUX) + if (!CGroupUtil::cgroupsv2_enable()) { + return {}; } - string cgroup_path; - RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path)); - string usage_file_path = cgroup_path + "/memory.usage_in_bytes"; - return read_cgroup_value(usage_file_path, bytes); -} -Status CGroupUtil::find_cgroup_mem_info(std::string* file_path) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); + std::string cgroup = CGroupUtil::cgroupv2_of_process(); + auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); + + // Return the bottom-most nested current memory file. If there is no such file at the current + // level, try again at the parent level as memory settings are inherited. + while (current_cgroup != default_cgroups_mount.parent_path()) { + if (std::filesystem::exists(current_cgroup / subsystem)) { + return {current_cgroup}; + } + current_cgroup = current_cgroup.parent_path(); } - string cgroup_path; - RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path)); - *file_path = cgroup_path + "/memory.stat"; - return Status::OK(); + return {}; +#else + return {}; +#endif } -Status CGroupUtil::find_cgroup_cpu_limit(float* cpu_count) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); - } - int64_t quota; - int64_t period; - string cgroup_path; - if (!find_abs_cgroup_path("cpu", &cgroup_path).ok()) { - RETURN_IF_ERROR(find_abs_cgroup_path("cpuacct", &cgroup_path)); - } - string cfs_quota_filename = cgroup_path + "/cpu.cfs_quota_us"; - RETURN_IF_ERROR(read_cgroup_value(cfs_quota_filename, "a)); - if (quota <= 0) { - *cpu_count = -1; - return Status::OK(); - } - string cfs_period_filename = cgroup_path + "/cpu.cfs_period_us"; - RETURN_IF_ERROR(read_cgroup_value(cfs_period_filename, &period)); - if (quota <= period) { - return Status::InvalidArgument("quota <= period"); +Status CGroupUtil::read_int_line_from_cgroup_file(const std::filesystem::path& file_path, + int64_t* val) { + std::ifstream file_stream(file_path, std::ios::in); + string line; + getline(file_stream, line); + if (file_stream.fail() || file_stream.bad()) { + return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); } - *cpu_count = float(quota) / float(period); - if (*cpu_count >= FLT_MAX) { - return Status::InvalidArgument("unknown"); + StringParser::ParseResult pr; + // Parse into an int64_t If it overflows, returning the max value of int64_t is ok because that + // is effectively unlimited. 
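+ // For example (illustrative values): a v1 limit file such as memory.limit_in_bytes holds a + // single numeric line, and an "unlimited" v1 limit is a huge number (e.g. + // 9223372036854771712) that still fits in an int64_t; a v2 memory.max may instead hold the + // literal string "max", which fails the parse below and is reported as InvalidArgument.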
+ *val = StringParser::string_to_int(line.c_str(), line.size(), &pr); + if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)) { + return Status::InvalidArgument("Failed to parse {} as int64: '{}'", file_path.string(), + line); } return Status::OK(); } -std::string CGroupUtil::debug_string() { - if (!enable()) { - return std::string("cgroup is not enabled!"); - } - string mem_limit_str; - int64_t mem_limit; - Status status = find_cgroup_mem_limit(&mem_limit); - if (status.ok()) { - mem_limit_str = strings::Substitute("$0", mem_limit); - } else { - mem_limit_str = status.to_string(); - } - string cpu_limit_str; - float cpu_limit; - status = find_cgroup_cpu_limit(&cpu_limit); - if (status.ok()) { - if (cpu_limit > 0) { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << cpu_limit; - cpu_limit_str = stream.str(); - } else { - cpu_limit_str = "unlimited"; +void CGroupUtil::read_int_metric_from_cgroup_file( + const std::filesystem::path& file_path, + std::unordered_map& metrics_map) { + std::ifstream cgroup_file(file_path, std::ios::in); + std::string line; + while (cgroup_file.good() && !cgroup_file.eof()) { + getline(cgroup_file, line); + std::vector fields = strings::Split(line, " ", strings::SkipWhitespace()); + if (fields.size() < 2) { + continue; } - } else { - cpu_limit_str = status.to_string(); - } - return strings::Substitute("Process CGroup Info: memory.limit_in_bytes=$0, cpu cfs limits: $1", - mem_limit_str, cpu_limit_str); -} + std::string key = fields[0].substr(0, fields[0].size()); -bool CGroupUtil::enable() { - bool exists = true; - Status st = io::global_local_filesystem()->exists("/proc/cgroups", &exists); - return st.ok() && exists; + StringParser::ParseResult result; + auto value = + StringParser::string_to_int(fields[1].data(), fields[1].size(), &result); + + if (result == StringParser::PARSE_SUCCESS) { + if (fields.size() == 2) { + metrics_map[key] = value; + } else if (fields[2] == "kB") { + metrics_map[key] = value * 1024L; + } + } + } + if (cgroup_file.is_open()) { + cgroup_file.close(); + } } } // namespace doris diff --git a/be/src/util/cgroup_util.h b/be/src/util/cgroup_util.h index 2152720ccdd1dae..bc1417453f41f6e 100644 --- a/be/src/util/cgroup_util.h +++ b/be/src/util/cgroup_util.h @@ -18,50 +18,91 @@ #pragma once #include +#include +#include #include #include #include "common/status.h" namespace doris { -class CGroupUtil { -public: - // Determines the CGroup memory limit from the current processes' cgroup. - // If the limit is more than INT64_MAX, INT64_MAX is returned (since that is - // effectively unlimited anyway). Does not take into account memory limits - // set on any ancestor CGroups. - static Status find_cgroup_mem_limit(int64_t* bytes); - - // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff) - // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command - static Status find_cgroup_mem_usage(int64_t* bytes); - static Status find_cgroup_mem_info(std::string* file_path); - // Determines the CGroup cpu cores limit from the current processes' cgroup. - static Status find_cgroup_cpu_limit(float* cpu_count); +// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers). +// /sys/fs/cgroup was still symlinked to the actual mount in the cases that I have seen. +static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup"; - // Returns a human-readable string with information about CGroups. 
- static std::string debug_string(); +/* Cgroup debugging steps + * CgroupV1: + * sudo cgcreate -t username:username -g memory:test + * sudo sh -c "echo 6000M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes" + * // process started by the current terminal will join Cgroup test + * sudo sh -c "echo $$ >> /sys/fs/cgroup/memory/test/cgroup.procs" + * + * CgroupV2: + * sudo mkdir /sys/fs/cgroup/test + * sudo sh -c "echo 3000M > /sys/fs/cgroup/test/memory.max" + * // process started by the current terminal will join Cgroup test + * sudo sh -c "echo $$ >> /sys/fs/cgroup/test/cgroup.procs" + * or + * // only memory allocated after joining the Cgroup is counted in `memory.current`. + * sudo sh -c "echo pid > /sys/fs/cgroup/test/cgroup.procs" +*/ +class CGroupUtil { +public: + enum class CgroupsVersion : uint8_t { V1, V2 }; - // detect if cgroup is enabled - static bool enable(); + // Detect if cgroup is enabled. + // If true, it only means that the OS allows the use of Cgroup v1 or v2, + // not that the current BE process is using Cgroup. + // To confirm whether the process is using Cgroup, use `find_global_cgroupv1` or `cgroupv2_of_process`. + // To confirm whether the process is using a subsystem of Cgroup, + // use `find_abs_cgroupv1_path` or `get_cgroupsv2_path`. + static bool cgroupsv1_enable(); + static bool cgroupsv2_enable(); -private: // return the global cgroup path of subsystem like 12:memory:/user.slice -> user.slice - static Status find_global_cgroup(const std::string& subsystem, std::string* path); + static Status find_global_cgroupv1(const std::string& subsystem, std::string* path); // Returns the absolute path to the CGroup from inside the container. // E.g. if this process belongs to // /sys/fs/cgroup/memory/kubepods/burstable/pod-, which is mounted at // /sys/fs/cgroup/memory inside the container, this function returns // "/sys/fs/cgroup/memory". - static Status find_abs_cgroup_path(const std::string& subsystem, std::string* path); + static Status find_abs_cgroupv1_path(const std::string& subsystem, std::string* path); // Figures out the mapping of the cgroup root from the container's point of view to // the full path relative to the system-wide cgroups outside of the container. // E.g. /sys/fs/cgroup/memory/kubepods/burstable/pod- may be mounted at // /sys/fs/cgroup/memory inside the container. In that case this function would return // ("/sys/fs/cgroup/memory", "kubepods/burstable/pod-"). - static Status find_cgroup_mounts(const std::string& subsystem, - std::pair* result); + static Status find_cgroupv1_mounts(const std::string& subsystem, + std::pair* result); + + // Which cgroup does the process belong to? + // Returns an empty string if the cgroup cannot be determined. + // Assumes that cgroupsv2_enable() returns true. + static std::string cgroupv2_of_process(); + + // Caveats: + // - All of the logic in this file assumes that the current process is the only process in the + // containing cgroup (or more precisely: the only process with significant memory consumption). + // If this is not the case, then other processes' memory consumption may affect the internal + // memory tracker ... + // - Cgroups v1 and v2 allow nested cgroup hierarchies. As v1 has been deprecated for over half a + // decade and will go away at some point, hierarchical detection is only implemented for v2. + // - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such + // systems existed only for a short transition period. 
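+ // A minimal composition sketch (hypothetical read_mem_limit helper, not part of this class; + // assumes a numeric limit is set rather than the v2 literal "max"): + //   static Status read_mem_limit(int64_t* bytes) { + //       if (cgroupsv2_enable()) { + //           if (auto dir = get_cgroupsv2_path("memory.max")) { + //               return read_int_line_from_cgroup_file(*dir / "memory.max", bytes); + //           } + //           return Status::CgroupError("memory.max not found in cgroup v2 hierarchy"); + //       } + //       std::string path; + //       RETURN_IF_ERROR(find_abs_cgroupv1_path("memory", &path)); + //       return read_int_line_from_cgroup_file(path + "/memory.limit_in_bytes", bytes); + //   }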
+ static std::optional get_cgroupsv2_path(const std::string& subsystem); + + // Cgroup file with only one line of numbers. + static Status read_int_line_from_cgroup_file(const std::filesystem::path& file_path, + int64_t* val); + + // Multi-line Cgroup files, format is + // kernel 5 + // rss 15 + // [...] + static void read_int_metric_from_cgroup_file( + const std::filesystem::path& file_path, + std::unordered_map& metrics_map); }; } // namespace doris diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h index c568098195b61d6..b201369454f9264 100644 --- a/be/src/util/doris_metrics.h +++ b/be/src/util/doris_metrics.h @@ -178,6 +178,7 @@ class DorisMetrics { UIntGauge* stream_load_pipe_count = nullptr; UIntGauge* new_stream_load_pipe_count = nullptr; UIntGauge* brpc_endpoint_stub_count = nullptr; + UIntGauge* brpc_stream_endpoint_stub_count = nullptr; UIntGauge* brpc_function_endpoint_stub_count = nullptr; UIntGauge* tablet_writer_count = nullptr; diff --git a/be/src/util/faststring.h b/be/src/util/faststring.h index 8d9fa6d004f589e..3ec0acbda01d79e 100644 --- a/be/src/util/faststring.h +++ b/be/src/util/faststring.h @@ -35,7 +35,7 @@ namespace doris { // common use cases (in particular, resize() will fill with uninitialized data // instead of memsetting to \0) // only build() can transfer data to the outside. -class faststring : private Allocator { +class faststring : private Allocator { public: enum { kInitialCapacity = 32 }; diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index baddefcc27f828f..8be1db5cb8559ce 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -20,6 +20,8 @@ #include "mem_info.h" +#include "gutil/strings/split.h" + #ifdef __APPLE__ #include #endif @@ -34,11 +36,11 @@ #include #include #include -#include +#include "common/cgroup_memory_ctl.h" #include "common/config.h" #include "common/status.h" -#include "gutil/strings/split.h" +#include "runtime/memory/global_memory_arbitrator.h" #include "util/cgroup_util.h" #include "util/parse_util.h" #include "util/pretty_printer.h" @@ -46,6 +48,21 @@ namespace doris { +static bvar::Adder memory_jemalloc_cache_bytes("memory_jemalloc_cache_bytes"); +static bvar::Adder memory_jemalloc_dirty_pages_bytes("memory_jemalloc_dirty_pages_bytes"); +static bvar::Adder memory_jemalloc_metadata_bytes("memory_jemalloc_metadata_bytes"); +static bvar::Adder memory_jemalloc_virtual_bytes("memory_jemalloc_virtual_bytes"); +static bvar::Adder memory_cgroup_usage_bytes("memory_cgroup_usage_bytes"); +static bvar::Adder memory_sys_available_bytes("memory_sys_available_bytes"); +static bvar::Adder memory_arbitrator_sys_available_bytes( + "memory_arbitrator_sys_available_bytes"); +static bvar::Adder memory_arbitrator_process_usage_bytes( + "memory_arbitrator_process_usage_bytes"); +static bvar::Adder memory_arbitrator_reserve_memory_bytes( + "memory_arbitrator_reserve_memory_bytes"); +static bvar::Adder memory_arbitrator_refresh_interval_growth_bytes( + "memory_arbitrator_refresh_interval_growth_bytes"); + bool MemInfo::_s_initialized = false; std::atomic MemInfo::_s_physical_mem = std::numeric_limits::max(); std::atomic MemInfo::_s_mem_limit = std::numeric_limits::max(); @@ -59,7 +76,6 @@ std::atomic MemInfo::_s_virtual_memory_used = 0; int64_t MemInfo::_s_cgroup_mem_limit = std::numeric_limits::max(); int64_t MemInfo::_s_cgroup_mem_usage = std::numeric_limits::min(); -static std::unordered_map _s_cgroup_mem_info_bytes; bool MemInfo::_s_cgroup_mem_refresh_state = false; int64_t 
MemInfo::_s_cgroup_mem_refresh_wait_times = 0; @@ -116,6 +132,33 @@ void MemInfo::refresh_allocator_mem() { #endif } +void MemInfo::refresh_memory_bvar() { + memory_jemalloc_cache_bytes << MemInfo::allocator_cache_mem() - + memory_jemalloc_cache_bytes.get_value(); + memory_jemalloc_dirty_pages_bytes + << MemInfo::je_dirty_pages_mem() - memory_jemalloc_dirty_pages_bytes.get_value(); + memory_jemalloc_metadata_bytes + << MemInfo::allocator_metadata_mem() - memory_jemalloc_metadata_bytes.get_value(); + memory_jemalloc_virtual_bytes << MemInfo::allocator_virtual_mem() - + memory_jemalloc_virtual_bytes.get_value(); + + memory_cgroup_usage_bytes << _s_cgroup_mem_usage - memory_cgroup_usage_bytes.get_value(); + memory_sys_available_bytes << _s_sys_mem_available - memory_sys_available_bytes.get_value(); + + memory_arbitrator_sys_available_bytes + << GlobalMemoryArbitrator::sys_mem_available() - + memory_arbitrator_sys_available_bytes.get_value(); + memory_arbitrator_process_usage_bytes + << GlobalMemoryArbitrator::process_memory_usage() - + memory_arbitrator_process_usage_bytes.get_value(); + memory_arbitrator_reserve_memory_bytes + << GlobalMemoryArbitrator::process_reserved_memory() - + memory_arbitrator_reserve_memory_bytes.get_value(); + memory_arbitrator_refresh_interval_growth_bytes + << GlobalMemoryArbitrator::refresh_interval_memory_growth - + memory_arbitrator_refresh_interval_growth_bytes.get_value(); +} + #ifndef __APPLE__ void MemInfo::refresh_proc_meminfo() { std::ifstream meminfo("/proc/meminfo", std::ios::in); @@ -136,7 +179,7 @@ void MemInfo::refresh_proc_meminfo() { if (result == StringParser::PARSE_SUCCESS) { if (fields.size() == 2) { _mem_info_bytes[key] = mem_value; - } else if (fields[2].compare("kB") == 0) { + } else if (fields[2] == "kB") { _mem_info_bytes[key] = mem_value * 1024L; } } @@ -151,65 +194,28 @@ void MemInfo::refresh_proc_meminfo() { int64_t cgroup_mem_usage = -1; std::string cgroup_mem_info_file_path; _s_cgroup_mem_refresh_state = true; - Status status = CGroupUtil::find_cgroup_mem_limit(&cgroup_mem_limit); - if (!status.ok() || cgroup_mem_limit <= 0) { + Status status = CGroupMemoryCtl::find_cgroup_mem_limit(&cgroup_mem_limit); + if (!status.ok()) { _s_cgroup_mem_refresh_state = false; } - status = CGroupUtil::find_cgroup_mem_usage(&cgroup_mem_usage); - if (!status.ok() || cgroup_mem_usage <= 0) { - _s_cgroup_mem_refresh_state = false; - } - status = CGroupUtil::find_cgroup_mem_info(&cgroup_mem_info_file_path); - if (status.ok()) { - std::ifstream cgroup_meminfo(cgroup_mem_info_file_path, std::ios::in); - std::string line; - - while (cgroup_meminfo.good() && !cgroup_meminfo.eof()) { - getline(cgroup_meminfo, line); - std::vector fields = - strings::Split(line, " ", strings::SkipWhitespace()); - if (fields.size() < 2) { - continue; - } - std::string key = fields[0].substr(0, fields[0].size()); - - StringParser::ParseResult result; - auto mem_value = StringParser::string_to_int(fields[1].data(), - fields[1].size(), &result); - - if (result == StringParser::PARSE_SUCCESS) { - if (fields.size() == 2) { - _s_cgroup_mem_info_bytes[key] = mem_value; - } else if (fields[2] == "kB") { - _s_cgroup_mem_info_bytes[key] = mem_value * 1024L; - } - } - } - if (cgroup_meminfo.is_open()) { - cgroup_meminfo.close(); - } - } else { + status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage); + if (!status.ok()) { _s_cgroup_mem_refresh_state = false; } if (_s_cgroup_mem_refresh_state) { _s_cgroup_mem_limit = cgroup_mem_limit; - // 
https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command - // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff) - // so, memory.usage_in_bytes - memory.meminfo["Cached"] - _s_cgroup_mem_usage = cgroup_mem_usage - _s_cgroup_mem_info_bytes["cache"]; + _s_cgroup_mem_usage = cgroup_mem_usage; // wait 10s, 100 * 100ms, avoid too frequently. _s_cgroup_mem_refresh_wait_times = -100; LOG(INFO) << "Refresh cgroup memory win, refresh again after 10s, cgroup mem limit: " - << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage - << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"]; + << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage; } else { // find cgroup failed, wait 300s, 1000 * 100ms. _s_cgroup_mem_refresh_wait_times = -3000; LOG(INFO) << "Refresh cgroup memory failed, refresh again after 300s, cgroup mem limit: " - << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage - << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"]; + << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage; } } else { if (config::enable_use_cgroup_memory_info) { @@ -392,7 +398,7 @@ std::string MemInfo::debug_string() { stream << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES) << std::endl; stream << "Memory Limt: " << PrettyPrinter::print(_s_mem_limit, TUnit::BYTES) << std::endl; - stream << "CGroup Info: " << doris::CGroupUtil::debug_string() << std::endl; + stream << "CGroup Info: " << doris::CGroupMemoryCtl::debug_string() << std::endl; return stream.str(); } diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h index 9335933286ec24c..60ce26016b1b321 100644 --- a/be/src/util/mem_info.h +++ b/be/src/util/mem_info.h @@ -75,6 +75,8 @@ class MemInfo { static void refresh_proc_meminfo(); + static void refresh_memory_bvar(); + static inline int64_t sys_mem_available_low_water_mark() { return _s_sys_mem_available_low_water_mark; } @@ -144,8 +146,11 @@ class MemInfo { if (config::enable_je_purge_dirty_pages) { try { // Purge all unused dirty pages for arena <i>, or for all arenas if <i> equals MALLCTL_ARENAS_ALL. - jemallctl(fmt::format("arena.{}.purge", MALLCTL_ARENAS_ALL).c_str(), nullptr, - nullptr, nullptr, 0); + int err = jemallctl(fmt::format("arena.{}.purge", MALLCTL_ARENAS_ALL).c_str(), + nullptr, nullptr, nullptr, 0); + if (err) { + LOG(WARNING) << "Jemalloc purge all unused dirty pages failed"; + } } catch (...) { LOG(WARNING) << "Purge all unused dirty pages for all arenas failed"; } @@ -153,6 +158,22 @@ class MemInfo { #endif } + // The limit on `tcache` is a number of pages, not a total number of bytes. + // `tcache` is cleaned on two occasions: 1. when the number of allocations and deallocations + // reaches a certain count, pages that have not been used for a long time are recycled; + // 2. all of `tcache` is recycled when the thread exits. Here we add a total size limit on top. 
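+ // Hypothetical caller sketch: jemalloc's "thread.tcache.flush" only flushes the calling + // thread's cache, so each worker thread is expected to call this itself at an idle point + // (has_work()/process_next() are illustrative names): + //   while (has_work()) { + //       process_next(); + //       MemInfo::je_thread_tcache_flush(); // no-op until the 1G TCACHE_LIMIT below is exceeded + //   }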
+ static inline void je_thread_tcache_flush() { +#ifdef USE_JEMALLOC + constexpr size_t TCACHE_LIMIT = (1ULL << 30); // 1G + if (allocator_cache_mem() - je_dirty_pages_mem() > TCACHE_LIMIT) { + int err = jemallctl("thread.tcache.flush", nullptr, nullptr, nullptr, 0); + if (err) { + LOG(WARNING) << "Jemalloc thread.tcache.flush failed"; + } + } +#endif + } + static std::mutex je_purge_dirty_pages_lock; static std::condition_variable je_purge_dirty_pages_cv; static std::atomic je_purge_dirty_pages_notify; diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp index 24e11b03b0b3817..ab291c7340c39de 100644 --- a/be/src/util/s3_util.cpp +++ b/be/src/util/s3_util.cpp @@ -56,7 +56,8 @@ namespace doris { namespace s3_bvar { bvar::LatencyRecorder s3_get_latency("s3_get"); bvar::LatencyRecorder s3_put_latency("s3_put"); -bvar::LatencyRecorder s3_delete_latency("s3_delete"); +bvar::LatencyRecorder s3_delete_object_latency("s3_delete_object"); +bvar::LatencyRecorder s3_delete_objects_latency("s3_delete_objects"); bvar::LatencyRecorder s3_head_latency("s3_head"); bvar::LatencyRecorder s3_multi_part_upload_latency("s3_multi_part_upload"); bvar::LatencyRecorder s3_list_latency("s3_list"); @@ -67,8 +68,20 @@ bvar::LatencyRecorder s3_copy_object_latency("s3_copy_object"); namespace { -bool is_s3_conf_valid(const S3ClientConf& conf) { - return !conf.endpoint.empty() && !conf.region.empty() && !conf.ak.empty() && !conf.sk.empty(); +doris::Status is_s3_conf_valid(const S3ClientConf& conf) { + if (conf.endpoint.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty endpoint"); + } + if (conf.region.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty region"); + } + if (conf.ak.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty ak"); + } + if (conf.sk.empty()) { + return Status::InvalidArgument("Invalid s3 conf, empty sk"); + } + return Status::OK(); } // Return true is convert `str` to int successfully @@ -90,10 +103,22 @@ constexpr char S3_MAX_CONN_SIZE[] = "AWS_MAX_CONN_SIZE"; constexpr char S3_REQUEST_TIMEOUT_MS[] = "AWS_REQUEST_TIMEOUT_MS"; constexpr char S3_CONN_TIMEOUT_MS[] = "AWS_CONNECTION_TIMEOUT_MS"; +auto metric_func_factory(bvar::Adder& ns_bvar, bvar::Adder& req_num_bvar) { + return [&](int64_t ns) { + if (ns > 0) { + ns_bvar << ns; + } else { + req_num_bvar << 1; + } + }; +} + } // namespace -bvar::Adder get_rate_limit_ms("get_rate_limit_ms"); -bvar::Adder put_rate_limit_ms("put_rate_limit_ms"); +bvar::Adder get_rate_limit_ns("get_rate_limit_ns"); +bvar::Adder get_rate_limit_exceed_req_num("get_rate_limit_exceed_req_num"); +bvar::Adder put_rate_limit_ns("put_rate_limit_ns"); +bvar::Adder put_rate_limit_exceed_req_num("put_rate_limit_exceed_req_num"); S3RateLimiterHolder* S3ClientFactory::rate_limiter(S3RateLimitType type) { CHECK(type == S3RateLimitType::GET || type == S3RateLimitType::PUT) << to_string(type); @@ -164,18 +189,19 @@ S3ClientFactory::S3ClientFactory() { }; Aws::InitAPI(_aws_options); _ca_cert_file_path = get_valid_ca_cert_path(); - _rate_limiters = {std::make_unique( - S3RateLimitType::GET, config::s3_get_token_per_second, - config::s3_get_bucket_tokens, config::s3_get_token_limit, - [&](int64_t ms) { get_rate_limit_ms << ms; }), - std::make_unique( - S3RateLimitType::PUT, config::s3_put_token_per_second, - config::s3_put_bucket_tokens, config::s3_put_token_limit, - [&](int64_t ms) { put_rate_limit_ms << ms; })}; + _rate_limiters = { + std::make_unique( + S3RateLimitType::GET, config::s3_get_token_per_second, + 
config::s3_get_bucket_tokens, config::s3_get_token_limit, + metric_func_factory(get_rate_limit_ns, get_rate_limit_exceed_req_num)), + std::make_unique( + S3RateLimitType::PUT, config::s3_put_token_per_second, + config::s3_put_bucket_tokens, config::s3_put_token_limit, + metric_func_factory(put_rate_limit_ns, put_rate_limit_exceed_req_num))}; } -string S3ClientFactory::get_valid_ca_cert_path() { - vector vec_ca_file_path = doris::split(config::ca_cert_file_paths, ";"); +std::string S3ClientFactory::get_valid_ca_cert_path() { + auto vec_ca_file_path = doris::split(config::ca_cert_file_paths, ";"); auto it = vec_ca_file_path.begin(); for (; it != vec_ca_file_path.end(); ++it) { if (std::filesystem::exists(*it)) { @@ -195,7 +221,7 @@ S3ClientFactory& S3ClientFactory::instance() { } std::shared_ptr S3ClientFactory::create(const S3ClientConf& s3_conf) { - if (!is_s3_conf_valid(s3_conf)) { + if (!is_s3_conf_valid(s3_conf).ok()) { return nullptr; } @@ -364,8 +390,8 @@ Status S3ClientFactory::convert_properties_to_s3_conf( s3_conf->client_conf.use_virtual_addressing = it->second != "true"; } - if (!is_s3_conf_valid(s3_conf->client_conf)) { - return Status::InvalidArgument("S3 properties are incorrect, please check properties."); + if (auto st = is_s3_conf_valid(s3_conf->client_conf); !st.ok()) { + return st; } return Status::OK(); } diff --git a/be/src/util/s3_util.h b/be/src/util/s3_util.h index 3d8f55e7613f0cb..1a1a5ae39ca18a2 100644 --- a/be/src/util/s3_util.h +++ b/be/src/util/s3_util.h @@ -50,7 +50,8 @@ namespace doris { namespace s3_bvar { extern bvar::LatencyRecorder s3_get_latency; extern bvar::LatencyRecorder s3_put_latency; -extern bvar::LatencyRecorder s3_delete_latency; +extern bvar::LatencyRecorder s3_delete_object_latency; +extern bvar::LatencyRecorder s3_delete_objects_latency; extern bvar::LatencyRecorder s3_head_latency; extern bvar::LatencyRecorder s3_multi_part_upload_latency; extern bvar::LatencyRecorder s3_list_latency; @@ -61,25 +62,6 @@ extern bvar::LatencyRecorder s3_copy_object_latency; class S3URI; -inline ::Aws::Client::AWSError<::Aws::S3::S3Errors> s3_error_factory() { - return {::Aws::S3::S3Errors::INTERNAL_FAILURE, "exceeds limit", "exceeds limit", false}; -} - -#define DO_S3_RATE_LIMIT(op, code) \ - [&]() mutable { \ - if (!config::enable_s3_rate_limiter) { \ - return (code); \ - } \ - auto sleep_duration = S3ClientFactory::instance().rate_limiter(op)->add(1); \ - if (sleep_duration < 0) { \ - using T = decltype((code)); \ - return T(s3_error_factory()); \ - } \ - return (code); \ - }() - -#define DO_S3_GET_RATE_LIMIT(code) DO_S3_RATE_LIMIT(S3RateLimitType::GET, code) - struct S3ClientConf { std::string endpoint; std::string region; diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h index 45f82b23ac99000..7e2e7c8202569d2 100644 --- a/be/src/util/simd/bits.h +++ b/be/src/util/simd/bits.h @@ -21,19 +21,58 @@ #include #include +#if defined(__ARM_NEON) && defined(__aarch64__) +#include +#endif + #include "util/sse_util.hpp" namespace doris { namespace simd { -/// todo(zeno) Compile add avx512 parameter, modify it to bytes64_mask_to_bits64_mask -/// Transform 32-byte mask to 32-bit mask +consteval auto bits_mask_length() { +#if defined(__ARM_NEON) && defined(__aarch64__) + return 16; +#else + return 32; +#endif +} + +#if defined(__ARM_NEON) && defined(__aarch64__) +inline uint64_t get_nibble_mask(uint8x16_t values) { + // It produces 4-bit out of each byte, alternating between the high 4-bits and low 4-bits of the 16-byte vector. 
+ // Given that the comparison operators give a 16-byte result of 0x00 or 0xff, the result is close to being a PMOVMSKB, + // the only difference is that every matching bit is repeated 4 times and is a 64-bit integer. + // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(values), 4)), 0); +} +/* +Input 16 bytes of data and convert it into a 64-bit integer, where one bit appears 4 times. +Compare with bytes32_mask_to_bits32_mask, a u8 array with a length of 32 + std::vector vec = {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}; + +bytes32_mask_to_bits32_mask 0100 0000 0000 0000,1101 0000 0000 0011 + + + (1101 0000 0000 0011) +bytes16_mask_to_bits64_mask 1111 1111 0000 1111,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 1111 1111 + (0100 0000 0000 0000) + 0000 1111 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000 +*/ + +inline uint64_t bytes16_mask_to_bits64_mask(const uint8_t* data) { + const uint8x16_t vfilter = vld1q_u8(data); + return get_nibble_mask(vmvnq_u8(vceqzq_u8(vfilter))); +} +#endif + inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { #ifdef __AVX2__ auto zero32 = _mm256_setzero_si256(); uint32_t mask = static_cast(_mm256_movemask_epi8( _mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast(data)), zero32))); -#elif defined(__SSE2__) || defined(__aarch64__) +#elif defined(__SSE2__) auto zero16 = _mm_setzero_si128(); uint32_t mask = (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( @@ -51,8 +90,39 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { return mask; } -inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) { - return bytes32_mask_to_bits32_mask(reinterpret_cast(data)); +inline auto bytes_mask_to_bits_mask(const uint8_t* data) { +#if defined(__ARM_NEON) && defined(__aarch64__) + return bytes16_mask_to_bits64_mask(data); +#else + return bytes32_mask_to_bits32_mask(data); +#endif +} + +inline constexpr auto bits_mask_all() { +#if defined(__ARM_NEON) && defined(__aarch64__) + return 0xffff'ffff'ffff'ffffULL; +#else + return 0xffffffff; +#endif +} + +template +void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullptr)) mask) { +#if defined(__ARM_NEON) && defined(__aarch64__) + mask &= 0x8888'8888'8888'8888ULL; + while (mask) { + const auto index = __builtin_ctzll(mask) >> 2; + func(index); + mask &= mask - 1; + } + +#else + while (mask) { + const auto bit_pos = __builtin_ctzll(mask); + func(bit_pos); + mask = mask & (mask - 1); + } +#endif } inline size_t count_zero_num(const int8_t* __restrict data, size_t size) { @@ -136,6 +206,18 @@ static size_t find_byte(const std::vector& vec, size_t start, T byte) { return (T*)p - vec.data(); } +template +static size_t find_byte(const T* data, size_t start, size_t end, T byte) { + if (start >= end) { + return start; + } + const void* p = std::memchr((const void*)(data + start), byte, end - start); + if (p == nullptr) { + return end; + } + return (T*)p - data; +} + template bool contain_byte(const T* __restrict data, const size_t length, const signed char byte) { return nullptr != std::memchr(reinterpret_cast(data), byte, length); @@ -145,6 +227,10 @@ inline size_t find_one(const std::vector& vec, size_t start) { return find_byte(vec, start, 1); } +inline size_t find_one(const uint8_t* data, 
size_t start, size_t end) { + return find_byte(data, start, end, 1); +} + inline size_t find_zero(const std::vector& vec, size_t start) { return find_byte(vec, start, 0); } diff --git a/be/src/util/slice.h b/be/src/util/slice.h index 80f9616f3da2bd4..bae33d4ee75010f 100644 --- a/be/src/util/slice.h +++ b/be/src/util/slice.h @@ -340,7 +340,7 @@ struct SliceMap { // // only receive the memory allocated by Allocator and disables mmap, // otherwise the memory may not be freed correctly, currently only be constructed by faststring. -class OwnedSlice : private Allocator { +class OwnedSlice : private Allocator { public: OwnedSlice() : _slice((uint8_t*)nullptr, 0) {} diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index 082f27e73183456..12d629b42c89f8e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -43,6 +43,8 @@ class AggregateFunctionBitmapCount; template class AggregateFunctionBitmapOp; struct AggregateFunctionBitmapUnionOp; +class IAggregateFunction; +using AggregateFunctionPtr = std::shared_ptr; using DataTypePtr = std::shared_ptr; using DataTypes = std::vector; @@ -178,11 +180,6 @@ class IAggregateFunction { const size_t offset, IColumn& to, const size_t num_rows) const = 0; - /** Returns true for aggregate functions of type -State. - * They are executed as other aggregate functions, but not finalized (return an aggregation state that can be combined with another). - */ - virtual bool is_state() const { return false; } - /** Contains a loop with calls to "add" function. You can collect arguments into array "places" * and do a single call to "add_batch" for devirtualization and inlining. */ @@ -223,6 +220,8 @@ class IAggregateFunction { virtual void set_version(const int version_) { version = version_; } + virtual AggregateFunctionPtr transmit_to_stable() { return nullptr; } + protected: DataTypes argument_types; int version {}; @@ -238,13 +237,16 @@ class IAggregateFunctionHelper : public IAggregateFunction { void destroy_vec(AggregateDataPtr __restrict place, const size_t num_rows) const noexcept override { const size_t size_of_data_ = size_of_data(); + const Derived* derived = assert_cast(this); for (size_t i = 0; i != num_rows; ++i) { - assert_cast(this)->destroy(place + size_of_data_ * i); + derived->destroy(place + size_of_data_ * i); } } void add_batch(size_t batch_size, AggregateDataPtr* places, size_t place_offset, const IColumn** columns, Arena* arena, bool agg_many) const override { + const Derived* derived = assert_cast(this); + if constexpr (std::is_same_v> || std::is_same_v> || std::is_same_v(this)->add_many(iter->first, columns, iter->second, - arena); + derived->add_many(iter->first, columns, iter->second, arena); iter++; } return; @@ -272,23 +273,25 @@ class IAggregateFunctionHelper : public IAggregateFunction { } for (size_t i = 0; i < batch_size; ++i) { - assert_cast(this)->add(places[i] + place_offset, columns, i, arena); + derived->add(places[i] + place_offset, columns, i, arena); } } void add_batch_selected(size_t batch_size, AggregateDataPtr* places, size_t place_offset, const IColumn** columns, Arena* arena) const override { + const Derived* derived = assert_cast(this); for (size_t i = 0; i < batch_size; ++i) { if (places[i]) { - assert_cast(this)->add(places[i] + place_offset, columns, i, arena); + derived->add(places[i] + place_offset, columns, i, arena); } } } void add_batch_single_place(size_t batch_size, 
AggregateDataPtr place, const IColumn** columns, Arena* arena) const override { + const Derived* derived = assert_cast(this); for (size_t i = 0; i < batch_size; ++i) { - assert_cast(this)->add(place, columns, i, arena); + derived->add(place, columns, i, arena); } } //now this is use for sum/count/avg/min/max win function, other win function should override this function in class @@ -296,31 +299,35 @@ class IAggregateFunctionHelper : public IAggregateFunction { void add_range_single_place(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end, AggregateDataPtr place, const IColumn** columns, Arena* arena) const override { + const Derived* derived = assert_cast(this); frame_start = std::max(frame_start, partition_start); frame_end = std::min(frame_end, partition_end); for (int64_t i = frame_start; i < frame_end; ++i) { - assert_cast(this)->add(place, columns, i, arena); + derived->add(place, columns, i, arena); } } void add_batch_range(size_t batch_begin, size_t batch_end, AggregateDataPtr place, const IColumn** columns, Arena* arena, bool has_null) override { + const Derived* derived = assert_cast(this); for (size_t i = batch_begin; i <= batch_end; ++i) { - assert_cast(this)->add(place, columns, i, arena); + derived->add(place, columns, i, arena); } } void insert_result_into_vec(const std::vector& places, const size_t offset, IColumn& to, const size_t num_rows) const override { + const Derived* derived = assert_cast(this); for (size_t i = 0; i != num_rows; ++i) { - assert_cast(this)->insert_result_into(places[i] + offset, to); + derived->insert_result_into(places[i] + offset, to); } } void serialize_vec(const std::vector& places, size_t offset, BufferWritable& buf, const size_t num_rows) const override { + const Derived* derived = assert_cast(this); for (size_t i = 0; i != num_rows; ++i) { - assert_cast(this)->serialize(places[i] + offset, buf); + derived->serialize(places[i] + offset, buf); buf.commit(); } } @@ -334,11 +341,12 @@ class IAggregateFunctionHelper : public IAggregateFunction { void streaming_agg_serialize(const IColumn** columns, BufferWritable& buf, const size_t num_rows, Arena* arena) const override { std::vector place(size_of_data()); + const Derived* derived = assert_cast(this); for (size_t i = 0; i != num_rows; ++i) { - assert_cast(this)->create(place.data()); - DEFER({ assert_cast(this)->destroy(place.data()); }); - assert_cast(this)->add(place.data(), columns, i, arena); - assert_cast(this)->serialize(place.data(), buf); + derived->create(place.data()); + DEFER({ derived->destroy(place.data()); }); + derived->add(place.data(), columns, i, arena); + derived->serialize(place.data(), buf); buf.commit(); } } @@ -358,17 +366,18 @@ class IAggregateFunctionHelper : public IAggregateFunction { void deserialize_vec(AggregateDataPtr places, const ColumnString* column, Arena* arena, size_t num_rows) const override { - const auto size_of_data = assert_cast(this)->size_of_data(); + const Derived* derived = assert_cast(this); + const auto size_of_data = derived->size_of_data(); for (size_t i = 0; i != num_rows; ++i) { try { auto place = places + size_of_data * i; VectorBufferReader buffer_reader(column->get_data_at(i)); - assert_cast(this)->create(place); - assert_cast(this)->deserialize(place, buffer_reader, arena); + derived->create(place); + derived->deserialize(place, buffer_reader, arena); } catch (...) 
{ for (int j = 0; j < i; ++j) { auto place = places + size_of_data * j; - assert_cast(this)->destroy(place); + derived->destroy(place); } throw; } @@ -378,49 +387,52 @@ class IAggregateFunctionHelper : public IAggregateFunction { void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, AggregateDataPtr rhs, const IColumn* column, Arena* arena, const size_t num_rows) const override { - const auto size_of_data = assert_cast(this)->size_of_data(); + const Derived* derived = assert_cast(this); + const auto size_of_data = derived->size_of_data(); const auto* column_string = assert_cast(column); + for (size_t i = 0; i != num_rows; ++i) { try { auto rhs_place = rhs + size_of_data * i; VectorBufferReader buffer_reader(column_string->get_data_at(i)); - assert_cast(this)->create(rhs_place); - assert_cast(this)->deserialize_and_merge( - places[i] + offset, rhs_place, buffer_reader, arena); + derived->create(rhs_place); + derived->deserialize_and_merge(places[i] + offset, rhs_place, buffer_reader, arena); } catch (...) { for (int j = 0; j < i; ++j) { auto place = rhs + size_of_data * j; - assert_cast(this)->destroy(place); + derived->destroy(place); } throw; } } - assert_cast(this)->destroy_vec(rhs, num_rows); + + derived->destroy_vec(rhs, num_rows); } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, AggregateDataPtr rhs, const IColumn* column, Arena* arena, const size_t num_rows) const override { - const auto size_of_data = assert_cast(this)->size_of_data(); + const auto* derived = assert_cast(this); + const auto size_of_data = derived->size_of_data(); const auto* column_string = assert_cast(column); for (size_t i = 0; i != num_rows; ++i) { try { auto rhs_place = rhs + size_of_data * i; VectorBufferReader buffer_reader(column_string->get_data_at(i)); - assert_cast(this)->create(rhs_place); + derived->create(rhs_place); if (places[i]) { - assert_cast(this)->deserialize_and_merge( - places[i] + offset, rhs_place, buffer_reader, arena); + derived->deserialize_and_merge(places[i] + offset, rhs_place, buffer_reader, + arena); } } catch (...) 
{ for (int j = 0; j < i; ++j) { auto place = rhs + size_of_data * j; - assert_cast(this)->destroy(place); + derived->destroy(place); } throw; } } - assert_cast(this)->destroy_vec(rhs, num_rows); + derived->destroy_vec(rhs, num_rows); } void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, @@ -430,21 +442,21 @@ class IAggregateFunctionHelper : public IAggregateFunction { void merge_vec(const AggregateDataPtr* places, size_t offset, ConstAggregateDataPtr rhs, Arena* arena, const size_t num_rows) const override { - const auto size_of_data = assert_cast(this)->size_of_data(); + const auto* derived = assert_cast(this); + const auto size_of_data = derived->size_of_data(); for (size_t i = 0; i != num_rows; ++i) { - assert_cast(this)->merge(places[i] + offset, rhs + size_of_data * i, - arena); + derived->merge(places[i] + offset, rhs + size_of_data * i, arena); } } void merge_vec_selected(const AggregateDataPtr* places, size_t offset, ConstAggregateDataPtr rhs, Arena* arena, const size_t num_rows) const override { - const auto size_of_data = assert_cast(this)->size_of_data(); + const auto* derived = assert_cast(this); + const auto size_of_data = derived->size_of_data(); for (size_t i = 0; i != num_rows; ++i) { if (places[i]) { - assert_cast(this)->merge(places[i] + offset, rhs + size_of_data * i, - arena); + derived->merge(places[i] + offset, rhs + size_of_data * i, arena); } } } @@ -456,13 +468,15 @@ class IAggregateFunctionHelper : public IAggregateFunction { << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); std::vector deserialized_data(size_of_data()); auto* deserialized_place = (AggregateDataPtr)deserialized_data.data(); + const ColumnString& column_string = assert_cast(column); + const Derived* derived = assert_cast(this); for (size_t i = begin; i <= end; ++i) { - VectorBufferReader buffer_reader( - (assert_cast(column)).get_data_at(i)); - assert_cast(this)->create(deserialized_place); - DEFER({ assert_cast(this)->destroy(deserialized_place); }); - assert_cast(this)->deserialize_and_merge(place, deserialized_place, - buffer_reader, arena); + VectorBufferReader buffer_reader(column_string.get_data_at(i)); + derived->create(deserialized_place); + + DEFER({ derived->destroy(deserialized_place); }); + + derived->deserialize_and_merge(place, deserialized_place, buffer_reader, arena); } } @@ -476,8 +490,9 @@ class IAggregateFunctionHelper : public IAggregateFunction { void deserialize_and_merge(AggregateDataPtr __restrict place, AggregateDataPtr __restrict rhs, BufferReadable& buf, Arena* arena) const override { - assert_cast(this)->deserialize(rhs, buf, arena); - assert_cast(this)->merge(place, rhs, arena); + assert_cast(this)->deserialize(rhs, buf, + arena); + assert_cast(this)->merge(place, rhs, arena); } }; @@ -514,13 +529,12 @@ class IAggregateFunctionDataHelper : public IAggregateFunctionHelper { void deserialize_and_merge(AggregateDataPtr __restrict place, AggregateDataPtr __restrict rhs, BufferReadable& buf, Arena* arena) const override { - assert_cast(this)->deserialize(rhs, buf, arena); - assert_cast(this)->merge(place, rhs, arena); + assert_cast(this)->deserialize(rhs, buf, + arena); + assert_cast(this)->merge(place, rhs, arena); } }; -using AggregateFunctionPtr = std::shared_ptr; - class AggregateFunctionGuard { public: using AggregateData = std::remove_pointer_t; diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h 
b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h index d0f5bce81a02bee..d267499e059818d 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h @@ -31,6 +31,7 @@ #include "vec/aggregate_functions/aggregate_function.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" #include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" #include "vec/common/string_ref.h" #include "vec/core/types.h" #include "vec/data_types/data_type_number.h" @@ -98,12 +99,14 @@ class AggregateFunctionApproxCountDistinct final void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, Arena*) const override { if constexpr (IsFixLenColumnType::value) { - auto column = assert_cast(columns[0]); + auto column = + assert_cast(columns[0]); auto value = column->get_element(row_num); this->data(place).add( HashUtil::murmur_hash64A((char*)&value, sizeof(value), HashUtil::MURMUR_SEED)); } else { - auto value = assert_cast(columns[0])->get_data_at(row_num); + auto value = assert_cast(columns[0]) + ->get_data_at(row_num); uint64_t hash_value = HashUtil::murmur_hash64A(value.data, value.size, HashUtil::MURMUR_SEED); this->data(place).add(hash_value); diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg.h b/be/src/vec/aggregate_functions/aggregate_function_avg.h index 6827c6db373667c..8a18a88839b4db4 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_avg.h @@ -145,7 +145,8 @@ class AggregateFunctionAvg final #ifdef __clang__ #pragma clang fp reassociate(on) #endif - const auto& column = assert_cast(*columns[0]); + const auto& column = + assert_cast(*columns[0]); if constexpr (IsDecimalNumber) { this->data(place).sum += column.get_data()[row_num].value; } else { diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h b/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h index af3b2c98cd8b4c6..b59a3dccf0cea82 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h +++ b/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h @@ -108,8 +108,10 @@ class AggregateFunctionAvgWeight final void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, Arena*) const override { - const auto& column = assert_cast(*columns[0]); - const auto& weight = assert_cast(*columns[1]); + const auto& column = + assert_cast(*columns[0]); + const auto& weight = + assert_cast(*columns[1]); this->data(place).add(column.get_data()[row_num], weight.get_element(row_num)); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_binary.h b/be/src/vec/aggregate_functions/aggregate_function_binary.h index ca06cc1bb81a8f5..a5b6e2b1e0e316c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_binary.h +++ b/be/src/vec/aggregate_functions/aggregate_function_binary.h @@ -41,8 +41,7 @@ template typename Moments> struct StatFunc { using Type1 = T1; using Type2 = T2; - using ResultType = std::conditional_t && std::is_same_v, - Float32, Float64>; + using ResultType = Float64; using Data = Moments; }; diff --git a/be/src/vec/aggregate_functions/aggregate_function_bit.h b/be/src/vec/aggregate_functions/aggregate_function_bit.h index c0b2df85ba25d21..1ab01b03ceea38a 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bit.h +++ 
diff --git a/be/src/vec/aggregate_functions/aggregate_function_bit.h b/be/src/vec/aggregate_functions/aggregate_function_bit.h
index c0b2df85ba25d21..1ab01b03ceea38a 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_bit.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_bit.h
@@ -25,6 +25,7 @@
 #include

 #include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/common/assert_cast.h"
 #include "vec/core/types.h"
 #include "vec/io/io_helper.h"
@@ -114,7 +115,8 @@ class AggregateFunctionBitwise final
     void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
              Arena*) const override {
-        const auto& column = assert_cast<const ColumnVector<T>&>(*columns[0]);
+        const auto& column =
+                assert_cast<const ColumnVector<T>&, TypeCheckOnRelease::DISABLE>(*columns[0]);
         this->data(place).add(column.get_data()[row_num]);
     }
diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h
index dd7af71de06ae03..6c504b91bf4abd1 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h
@@ -166,9 +166,12 @@ class AggregateFunctionBitmapSerializationHelper
             col.resize(num_rows);
             auto* data = col.get_data().data();
             for (size_t i = 0; i != num_rows; ++i) {
-                assert_cast<const Derived*>(this)->create(place);
-                DEFER({ assert_cast<const Derived*>(this)->destroy(place); });
-                assert_cast<const Derived*>(this)->add(place, columns, i, arena);
+                assert_cast<const Derived*, TypeCheckOnRelease::DISABLE>(this)->create(place);
+                DEFER({
+                    assert_cast<const Derived*, TypeCheckOnRelease::DISABLE>(this)->destroy(place);
+                });
+                assert_cast<const Derived*, TypeCheckOnRelease::DISABLE>(this)->add(place, columns,
+                                                                                    i, arena);
                 data[i] = std::move(this->data(place).value);
             }
         } else {
@@ -304,7 +307,8 @@ class AggregateFunctionBitmapOp final
     void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
              Arena*) const override {
-        const auto& column = assert_cast<const ColumnBitmap&>(*columns[0]);
+        const auto& column =
+                assert_cast<const ColumnBitmap&, TypeCheckOnRelease::DISABLE>(*columns[0]);
         this->data(place).add(column.get_data()[row_num]);
     }
@@ -367,12 +371,13 @@ class AggregateFunctionBitmapCount final
         if constexpr (arg_is_nullable) {
             auto& nullable_column = assert_cast<const ColumnNullable&>(*columns[0]);
             if (!nullable_column.is_null_at(row_num)) {
-                const auto& column =
-                        assert_cast<const ColumnBitmap&>(nullable_column.get_nested_column());
+                const auto& column = assert_cast<const ColumnBitmap&, TypeCheckOnRelease::DISABLE>(
+                        nullable_column.get_nested_column());
                 this->data(place).add(column.get_data()[row_num]);
             }
         } else {
-            const auto& column = assert_cast<const ColumnBitmap&>(*columns[0]);
+            const auto& column =
+                    assert_cast<const ColumnBitmap&, TypeCheckOnRelease::DISABLE>(*columns[0]);
             this->data(place).add(column.get_data()[row_num]);
         }
     }
diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h
index ce80b38d0913ba8..19352e022fa7a27 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h
@@ -27,6 +27,7 @@

 #include "util/bitmap_value.h"
 #include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/common/assert_cast.h"
 #include "vec/data_types/data_type_bitmap.h"

 namespace doris {
@@ -74,14 +75,16 @@ class AggregateFunctionBitmapAgg final
                 Arena* arena) const override {
         DCHECK_LT(row_num, columns[0]->size());
         if constexpr (arg_nullable) {
-            auto& nullable_col = assert_cast<const ColumnNullable&>(*columns[0]);
+            auto& nullable_col =
+                    assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*columns[0]);
             auto& nullable_map = nullable_col.get_null_map_data();
             if (!nullable_map[row_num]) {
-                auto& col = assert_cast<const ColVecType&>(nullable_col.get_nested_column());
+                auto& col = assert_cast<const ColVecType&, TypeCheckOnRelease::DISABLE>(
+                        nullable_col.get_nested_column());
                 this->data(place).add(col.get_data()[row_num]);
             }
         } else {
-            auto& col = assert_cast<const ColVecType&>(*columns[0]);
+            auto& col = assert_cast<const ColVecType&, TypeCheckOnRelease::DISABLE>(*columns[0]);
             this->data(place).add(col.get_data()[row_num]);
         }
     }
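The `DEFER({ ... })` wrapped around `create`/`add` in the bitmap hunk is a scope guard: the temporary aggregate state is destroyed even if `add` throws. Doris ships its own macro; the standalone sketch below only illustrates the mechanism:

```cpp
#include <utility>

// Runs a callable when the enclosing scope exits, on success or exception.
template <typename F>
class Defer {
public:
    explicit Defer(F&& fn) : fn_(std::forward<F>(fn)) {}
    ~Defer() { fn_(); }

private:
    F fn_;
};

#define DEFER_CONCAT(a, b) a##b
#define DEFER_NAME(n) DEFER_CONCAT(defer_, n)
#define DEFER(block) Defer DEFER_NAME(__LINE__)([&]() block)

void serialize_rows(char* place) {
    DEFER({ /* destroy(place) would run here, even on throw */ });
    /* create(place); add(place, ...); either may throw */
}
```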
diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.h b/be/src/vec/aggregate_functions/aggregate_function_collect.h
index 4da6e023eb39496..b99ecd959245e3f 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_collect.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_collect.h
@@ -69,7 +69,8 @@ struct AggregateFunctionCollectSetData {
     size_t size() const { return data_set.size(); }

     void add(const IColumn& column, size_t row_num) {
-        data_set.insert(assert_cast<const ColVecType&>(column).get_data()[row_num]);
+        data_set.insert(assert_cast<const ColVecType&, TypeCheckOnRelease::DISABLE>(column)
+                                .get_data()[row_num]);
     }

     void merge(const SelfType& rhs) {
@@ -191,7 +192,8 @@ struct AggregateFunctionCollectListData {
     size_t size() const { return data.size(); }

     void add(const IColumn& column, size_t row_num) {
-        const auto& vec = assert_cast<const ColVecType&>(column).get_data();
+        const auto& vec =
+                assert_cast<const ColVecType&, TypeCheckOnRelease::DISABLE>(column).get_data();
         data.push_back(vec[row_num]);
     }
@@ -256,8 +258,10 @@ struct AggregateFunctionCollectListData<StringRef, HasLimit> {
             }
             max_size = rhs.max_size;

-            data->insert_range_from(*rhs.data, 0,
-                                    std::min(assert_cast<size_t>(max_size - size()), rhs.size()));
+            data->insert_range_from(
+                    *rhs.data, 0,
+                    std::min(assert_cast<size_t, TypeCheckOnRelease::DISABLE>(max_size - size()),
+                             rhs.size()));
         } else {
             data->insert_range_from(*rhs.data, 0, rhs.size());
         }
@@ -326,8 +330,10 @@ struct AggregateFunctionArrayAggData {
     }

     void add(const IColumn& column, size_t row_num) {
-        const auto& col = assert_cast<const ColumnNullable&>(column);
-        const auto& vec = assert_cast<const ColVecType&>(col.get_nested_column()).get_data();
+        const auto& col = assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(column);
+        const auto& vec =
+                assert_cast<const ColVecType&, TypeCheckOnRelease::DISABLE>(col.get_nested_column())
+                        .get_data();
         null_map->push_back(col.get_null_map_data()[row_num]);
         nested_column->get_data().push_back(vec[row_num]);
         DCHECK(null_map->size() == nested_column->size());
@@ -426,8 +432,9 @@ struct AggregateFunctionArrayAggData<StringRef> {
     }

     void add(const IColumn& column, size_t row_num) {
-        const auto& col = assert_cast<const ColumnNullable&>(column);
-        const auto& vec = assert_cast<const ColumnString&>(col.get_nested_column());
+        const auto& col = assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(column);
+        const auto& vec = assert_cast<const ColumnString&, TypeCheckOnRelease::DISABLE>(
+                col.get_nested_column());
         null_map->push_back(col.get_null_map_data()[row_num]);
         nested_column->insert_from(vec, row_num);
         DCHECK(null_map->size() == nested_column->size());
@@ -561,7 +568,9 @@ class AggregateFunctionCollect
         if constexpr (HasLimit::value) {
             if (data.max_size == -1) {
                 data.max_size =
-                        (UInt64)assert_cast<const ColumnInt32*>(columns[1])->get_element(row_num);
+                        (UInt64)assert_cast<const ColumnInt32*, TypeCheckOnRelease::DISABLE>(
+                                columns[1])
+                                ->get_element(row_num);
             }
             if (data.size() >= data.max_size) {
                 return;
@@ -711,15 +720,20 @@ class AggregateFunctionCollect
         for (size_t i = 0; i < num_rows; ++i) {
             col_null->get_null_map_data().push_back(col_src.get_null_map_data()[i]);
             if constexpr (std::is_same_v<T, StringRef>) {
-                auto& vec = assert_cast<ColumnString&>(col_null->get_nested_column());
+                auto& vec = assert_cast<ColumnString&, TypeCheckOnRelease::DISABLE>(
+                        col_null->get_nested_column());
                 const auto& vec_src =
-                        assert_cast<const ColumnString&>(col_src.get_nested_column());
+                        assert_cast<const ColumnString&, TypeCheckOnRelease::DISABLE>(
+                                col_src.get_nested_column());
                 vec.insert_from(vec_src, i);
             } else {
                 using ColVecType = ColumnVectorOrDecimal<T>;
-                auto& vec = assert_cast<ColVecType&>(col_null->get_nested_column()).get_data();
-                auto& vec_src =
-                        assert_cast<const ColVecType&>(col_src.get_nested_column()).get_data();
+                auto& vec = assert_cast<ColVecType&, TypeCheckOnRelease::DISABLE>(
+                                    col_null->get_nested_column())
+                                    .get_data();
+                auto& vec_src = assert_cast<const ColVecType&, TypeCheckOnRelease::DISABLE>(
+                                        col_src.get_nested_column())
+                                        .get_data();
                 vec.push_back(vec_src[i]);
             }
             to_arr.get_offsets().push_back(to_nested_col.size());
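One detail worth noting in the collect_list hunks: when two partial states merge under a row limit, only the remaining capacity is copied. A small sketch of the clamping arithmetic (the container is simplified to a plain vector):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Merge rhs into lhs without letting lhs grow past max_size elements,
// mirroring the std::min clamp passed to insert_range_from above.
void merge_with_limit(std::vector<int64_t>& lhs, const std::vector<int64_t>& rhs,
                      uint64_t max_size) {
    if (lhs.size() >= max_size) {
        return; // already at the cap
    }
    size_t n = std::min<size_t>(max_size - lhs.size(), rhs.size());
    lhs.insert(lhs.end(), rhs.begin(), rhs.begin() + static_cast<std::ptrdiff_t>(n));
}
```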
diff --git a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp
index fb84e92e0e669b9..8237f5882980641 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp
@@ -68,7 +68,8 @@ struct CorrMoment {
     }

     T get() const {
-        if ((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1) == 0) [[unlikely]] {
+        // avoid float error (silent NaN) when x or y is constant
+        if (m0 * x2 <= x1 * x1 || m0 * y2 <= y1 * y1) [[unlikely]] {
             return 0;
         }
         return (m0 * xy - x1 * y1) / sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1));
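Why the CorrMoment guard above changed: with m0 = n, x1 = Sum(x), x2 = Sum(x^2), the factor m0*x2 - x1*x1 equals n*Sum((x - mean)^2), which is never negative and is zero exactly when x is constant. Floating-point rounding can still produce a tiny negative value; in that case the old `== 0` test passes and `sqrt` of the negative product yields a silent NaN. Checking `<=` treats any non-positive variance as zero correlation. A small demo of the failure mode (values are illustrative):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // x is a constant column; y varies.
    const double xs[] = {0.1, 0.1, 0.1};
    const double ys[] = {1.0, 2.0, 3.0};
    double n = 3, x1 = 0, x2 = 0, y1 = 0, y2 = 0, xy = 0;
    for (int i = 0; i < 3; ++i) {
        x1 += xs[i]; x2 += xs[i] * xs[i];
        y1 += ys[i]; y2 += ys[i] * ys[i];
        xy += xs[i] * ys[i];
    }
    double var_x = n * x2 - x1 * x1; // exactly 0 on paper, may round below 0
    double var_y = n * y2 - y1 * y1;
    // New-style guard: any non-positive variance means corr is defined as 0.
    double corr = (var_x <= 0 || var_y <= 0)
                          ? 0
                          : (n * xy - x1 * y1) / std::sqrt(var_x * var_y);
    std::printf("var_x = %.17g, corr = %g\n", var_x, corr);
    return 0;
}
```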
diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.h b/be/src/vec/aggregate_functions/aggregate_function_count.h
index 7449c949cb90471..62aa869771c0a53 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_count.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_count.h
@@ -196,7 +196,9 @@ class AggregateFunctionCountNotNullUnary final
     void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
              Arena*) const override {
-        data(place).count += !assert_cast<const ColumnNullable&>(*columns[0]).is_null_at(row_num);
+        data(place).count +=
+                !assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*columns[0])
+                         .is_null_at(row_num);
     }

     void reset(AggregateDataPtr place) const override { data(place).count = 0; }
diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp
index 1ff6427f69a6a81..790d0270aa39e8f 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp
@@ -46,15 +46,6 @@ AggregateFunctionPtr create_function_single_value(const String& name,
     FOR_NUMERIC_TYPES(DISPATCH)
 #undef DISPATCH

-#define DISPATCH(TYPE)                                                                      \
-    if (which.idx == TypeIndex::TYPE)                                                       \
-        return creator_without_type::create<AggregateFunctionTemplate<NameData<Data<TYPE, BaseDatadecimal<TYPE>>>, is_nullable>>( \
-                custom_nullable ? remove_nullable(argument_types) : argument_types,         \
-                result_is_nullable);
-    FOR_DECIMAL_TYPES(DISPATCH)
-#undef DISPATCH
-
     LOG(WARNING) << fmt::format("create_function_single_value with unknowed type {}",
                                 argument_types[0]->get_name());
     return nullptr;
diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.h b/be/src/vec/aggregate_functions/aggregate_function_covar.h
index 9dc2d2d5b381c66..179e723285e9008 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_covar.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_covar.h
@@ -107,9 +107,11 @@ struct BaseData {
     }

     void add(const IColumn* column_x, const IColumn* column_y, size_t row_num) {
-        const auto& sources_x = assert_cast<const ColumnVector<T>&>(*column_x);
+        const auto& sources_x =
+                assert_cast<const ColumnVector<T>&, TypeCheckOnRelease::DISABLE>(*column_x);
         double source_data_x = sources_x.get_data()[row_num];
-        const auto& sources_y = assert_cast<const ColumnVector<T>&>(*column_y);
+        const auto& sources_y =
+                assert_cast<const ColumnVector<T>&, TypeCheckOnRelease::DISABLE>(*column_y);
         double source_data_y = sources_y.get_data()[row_num];

         sum_x += source_data_x;
@@ -126,119 +128,23 @@ struct BaseData {
     int64_t count;
 };

-template <typename T>
-struct BaseDatadecimal {
-    BaseDatadecimal() : sum_x(0), sum_y(0), sum_xy(0), count(0) {}
-    virtual ~BaseDatadecimal() = default;
-
-    void write(BufferWritable& buf) const {
-        write_binary(sum_x, buf);
-        write_binary(sum_y, buf);
-        write_binary(sum_xy, buf);
-        write_binary(count, buf);
-    }
-
-    void read(BufferReadable& buf) {
-        read_binary(sum_x, buf);
-        read_binary(sum_y, buf);
-        read_binary(sum_xy, buf);
-        read_binary(count, buf);
-    }
-
-    void reset() {
-        sum_x = DecimalV2Value();
-        sum_y = DecimalV2Value();
-        sum_xy = DecimalV2Value();
-        count = {};
-    }
-
-    DecimalV2Value get_pop_result() const {
-        if (count == 1) {
-            return DecimalV2Value();
-        }
-        DecimalV2Value count_dec = DecimalV2Value(static_cast<int64_t>(count));
-        return sum_xy / count_dec - sum_x * sum_y / (count_dec * count_dec);
-    }
-
-    DecimalV2Value get_samp_result() const {
-        DecimalV2Value count_dec = DecimalV2Value(static_cast<int64_t>(count));
-        DecimalV2Value one = DecimalV2Value(static_cast<int64_t>(1));
-        return sum_xy / (count_dec - one) - sum_x * sum_y / (count_dec * (count_dec - one));
-    }
-
-    void merge(const BaseDatadecimal& rhs) {
-        if (rhs.count == 0) {
-            return;
-        }
-        sum_x += rhs.sum_x;
-        sum_y += rhs.sum_y;
-        sum_xy += rhs.sum_xy;
-        count += rhs.count;
-    }
-
-    void add(const IColumn* column_x, const IColumn* column_y, size_t row_num) {
-        auto source_data_x = get_source_data(column_x, row_num);
-        auto source_data_y = get_source_data(column_y, row_num);
-        sum_x += source_data_x;
-        sum_y += source_data_y;
-        sum_xy += source_data_x * source_data_y;
-        count += 1;
-    }
-
-    DecimalV2Value get_source_data(const IColumn* column, size_t row_num) {
-        const auto& sources = assert_cast<const ColumnDecimal<T>&>(*column);
-        Field field = sources[row_num];
-        auto decimal_field = field.template get<DecimalField<T>>();
-        int128_t value;
-        if (decimal_field.get_scale() > DecimalV2Value::SCALE) {
-            value = static_cast<int128_t>(decimal_field.get_value()) /
-                    (decimal_field.get_scale_multiplier() / DecimalV2Value::ONE_BILLION);
-        } else {
-            value = static_cast<int128_t>(decimal_field.get_value()) *
-                    (DecimalV2Value::ONE_BILLION / decimal_field.get_scale_multiplier());
-        }
-        return DecimalV2Value(value);
-    }
-
-    static DataTypePtr get_return_type() {
-        return std::make_shared<DataTypeDecimal<Decimal128V2>>(27, 9);
-    }
-
-    DecimalV2Value sum_x;
-    DecimalV2Value sum_y;
-    DecimalV2Value sum_xy;
-    int64_t count;
-};
-
 template <typename T, typename Data>
 struct PopData : Data {
-    using ColVecResult =
-            std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<Decimal128V2>, ColumnFloat64>;
     void insert_result_into(IColumn& to) const {
-        auto& col = assert_cast<ColVecResult&>(to);
-        if constexpr (IsDecimalNumber<T>) {
-            col.get_data().push_back(this->get_pop_result().value());
-        } else {
-            col.get_data().push_back(this->get_pop_result());
-        }
+        auto& col = assert_cast<ColumnFloat64&>(to);
+        col.get_data().push_back(this->get_pop_result());
     }
 };

 template <typename T, typename Data>
 struct SampData_OLDER : Data {
-    using ColVecResult =
-            std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<Decimal128V2>, ColumnFloat64>;
     void insert_result_into(IColumn& to) const {
         ColumnNullable& nullable_column = assert_cast<ColumnNullable&>(to);
         if (this->count == 1 || this->count == 0) {
             nullable_column.insert_default();
         } else {
-            auto& col = assert_cast<ColVecResult&>(nullable_column.get_nested_column());
-            if constexpr (IsDecimalNumber<T>) {
-                col.get_data().push_back(this->get_samp_result().value());
-            } else {
-                col.get_data().push_back(this->get_samp_result());
-            }
+            auto& col = assert_cast<ColumnFloat64&>(nullable_column.get_nested_column());
+            col.get_data().push_back(this->get_samp_result());
             nullable_column.get_null_map_data().push_back(0);
         }
     }
@@ -246,18 +152,12 @@ struct SampData_OLDER : Data {

 template <typename T, typename Data>
 struct SampData : Data {
-    using ColVecResult =
-            std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<Decimal128V2>, ColumnFloat64>;
     void insert_result_into(IColumn& to) const {
-        auto& col = assert_cast<ColVecResult&>(to);
+        auto& col = assert_cast<ColumnFloat64&>(to);
         if (this->count == 1 || this->count == 0) {
             col.insert_default();
         } else {
-            if constexpr (IsDecimalNumber<T>) {
-                col.get_data().push_back(this->get_samp_result().value());
-            } else {
-                col.get_data().push_back(this->get_samp_result());
-            }
+            col.get_data().push_back(this->get_samp_result());
         }
     }
 };
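The deleted decimal specialization duplicated the same streamed-moment algebra that the double-based `BaseData` already implements: cov_pop = Sum(xy)/n - Sum(x)*Sum(y)/n^2 and cov_samp = Sum(xy)/(n-1) - Sum(x)*Sum(y)/(n*(n-1)). A compact sketch of that state (field names chosen to mirror the struct above):

```cpp
#include <cstdint>

struct CovarState {
    double sum_x = 0, sum_y = 0, sum_xy = 0;
    int64_t count = 0;

    void add(double x, double y) {
        sum_x += x;
        sum_y += y;
        sum_xy += x * y;
        ++count;
    }
    // Population covariance: E[xy] - E[x]E[y].
    double pop() const {
        double n = (double)count;
        return sum_xy / n - sum_x * sum_y / (n * n);
    }
    // Sample covariance: n - 1 denominator; callers guard count > 1.
    double samp() const {
        double n = (double)count;
        return sum_xy / (n - 1) - sum_x * sum_y / (n * (n - 1));
    }
};
```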
diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp
index 3155aa24be2a08c..f86d44b7d6828ba 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp
@@ -29,6 +29,16 @@

 namespace doris::vectorized {

+template <typename T>
+struct Reducer {
+    template <bool stable>
+    using Output = AggregateFunctionDistinctSingleNumericData<T, stable>;
+    using AggregateFunctionDistinctNormal = AggregateFunctionDistinct<Output, T>;
+};
+
+template <typename T>
+using AggregateFunctionDistinctNumeric = Reducer<T>::AggregateFunctionDistinctNormal;
+
 class AggregateFunctionCombinatorDistinct final : public IAggregateFunctionCombinator {
 public:
     String get_name() const override { return "Distinct"; }
@@ -52,22 +62,15 @@ class AggregateFunctionCombinatorDistinct final : public IAggregateFunctionCombi

         if (arguments.size() == 1) {
             AggregateFunctionPtr res(
-                    creator_with_numeric_type::create(
+                    creator_with_numeric_type::create<AggregateFunctionDistinctNumeric>(
                             arguments, result_is_nullable, nested_function));
             if (res) {
                 return res;
             }

-            if (arguments[0]->is_value_unambiguously_represented_in_contiguous_memory_region()) {
-                res = creator_without_type::create<AggregateFunctionDistinct<AggregateFunctionDistinctSingleGenericData<true>>>(
-                        arguments, result_is_nullable, nested_function);
-            } else {
-                res = creator_without_type::create<AggregateFunctionDistinct<AggregateFunctionDistinctSingleGenericData<false>>>(
-                        arguments, result_is_nullable, nested_function);
-            }
+            res = creator_without_type::create<
+                    AggregateFunctionDistinct<AggregateFunctionDistinctSingleGenericData>>(
+                    arguments, result_is_nullable, nested_function);
             return res;
         }
         return creator_without_type::create<
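The new `stable` flag in the distinct data structures swaps the dedup container: an ordinary hash set when output order is irrelevant, or a hash map from value to first-insertion index when the wrapped function is order-sensitive. The trick in miniature, with `std::unordered_map` standing in for `phmap::flat_hash_map`:

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Deduplicate while remembering each value's arrival position, so the
// distinct values can be replayed in their original input order.
std::vector<std::string> stable_distinct(const std::vector<std::string>& input) {
    std::unordered_map<std::string, uint32_t> first_pos;
    for (const auto& v : input) {
        first_pos.emplace(v, (uint32_t)first_pos.size()); // no-op on duplicates
    }
    std::vector<std::string> out(first_pos.size());
    for (const auto& [value, pos] : first_pos) {
        out[pos] = value;
    }
    return out;
}
```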
diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.h b/be/src/vec/aggregate_functions/aggregate_function_distinct.h
index c0c7a5b66dd58f5..6193b28a131e9f2 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_distinct.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.h
@@ -28,6 +28,8 @@
 #include
 #include
 #include
+#include
+#include
 #include

 #include "vec/aggregate_functions/aggregate_function.h"
@@ -54,105 +56,172 @@ struct DefaultHash;

 namespace doris::vectorized {

-template <typename T>
+template <typename T, bool stable>
 struct AggregateFunctionDistinctSingleNumericData {
     /// When creating, the hash table must be small.
-    using Set = HashSetWithStackMemory<T, DefaultHash<T>, 4>;
-    using Self = AggregateFunctionDistinctSingleNumericData<T>;
-    Set set;
+    using Container = std::conditional_t<stable, phmap::flat_hash_map<T, uint32_t>,
+                                         HashSetWithStackMemory<T, DefaultHash<T>, 4>>;
+    using Self = AggregateFunctionDistinctSingleNumericData<T, stable>;
+    Container data;

     void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena*) {
-        const auto& vec = assert_cast<const ColumnVector<T>&>(*columns[0]).get_data();
-        set.insert(vec[row_num]);
+        const auto& vec =
+                assert_cast<const ColumnVector<T>&, TypeCheckOnRelease::DISABLE>(*columns[0])
+                        .get_data();
+        if constexpr (stable) {
+            data.emplace(vec[row_num], data.size());
+        } else {
+            data.insert(vec[row_num]);
+        }
     }

-    void merge(const Self& rhs, Arena*) { set.merge(rhs.set); }
+    void merge(const Self& rhs, Arena*) {
+        DCHECK(!stable);
+        if constexpr (!stable) {
+            data.merge(rhs.data);
+        }
+    }

-    void serialize(BufferWritable& buf) const { set.write(buf); }
+    void serialize(BufferWritable& buf) const {
+        DCHECK(!stable);
+        if constexpr (!stable) {
+            data.write(buf);
+        }
+    }

-    void deserialize(BufferReadable& buf, Arena*) { set.read(buf); }
+    void deserialize(BufferReadable& buf, Arena*) {
+        DCHECK(!stable);
+        if constexpr (!stable) {
+            data.read(buf);
+        }
+    }

     MutableColumns get_arguments(const DataTypes& argument_types) const {
         MutableColumns argument_columns;
         argument_columns.emplace_back(argument_types[0]->create_column());
-        for (const auto& elem : set) {
-            argument_columns[0]->insert(elem.get_value());
+
+        if constexpr (stable) {
+            argument_columns[0]->resize(data.size());
+            auto ptr = (T*)const_cast<char*>(argument_columns[0]->get_raw_data().data);
+            for (auto it : data) {
+                ptr[it.second] = it.first;
+            }
+        } else {
+            for (const auto& elem : data) {
+                argument_columns[0]->insert(elem.get_value());
+            }
         }

         return argument_columns;
     }
 };
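A pattern used by all of the stable variants above: methods that must never run in stable mode (merge, serialize, deserialize) keep a runtime DCHECK and wrap the body in `if constexpr (!stable)`, so the stable instantiation still compiles even though its container does not support those operations. Sketched in isolation (names are illustrative):

```cpp
#include <cassert>
#include <unordered_set>

// `if constexpr` removes the body from the stable instantiation entirely;
// the assert flags any stray call at runtime in debug builds.
template <bool stable, typename Set>
void merge_states(Set& lhs, const Set& rhs) {
    assert(!stable && "stable distinct states are never merged");
    if constexpr (!stable) {
        lhs.insert(rhs.begin(), rhs.end());
    }
}
```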
+template <bool stable>
 struct AggregateFunctionDistinctGenericData {
     /// When creating, the hash table must be small.
-    using Set = HashSetWithStackMemory<StringRef, StringRefHash, 4>;
+    using Container = std::conditional_t<stable, phmap::flat_hash_map<StringRef, uint32_t>,
+                                         HashSetWithStackMemory<StringRef, StringRefHash, 4>>;
     using Self = AggregateFunctionDistinctGenericData;
-    Set set;
+    Container data;

     void merge(const Self& rhs, Arena* arena) {
-        Set::LookupResult it;
-        bool inserted;
-        for (const auto& elem : rhs.set) {
-            StringRef key = elem.get_value();
-            key.data = arena->insert(key.data, key.size);
-            set.emplace(key, it, inserted);
+        DCHECK(!stable);
+        if constexpr (!stable) {
+            typename Container::LookupResult it;
+            bool inserted;
+            for (const auto& elem : rhs.data) {
+                StringRef key = elem.get_value();
+                key.data = arena->insert(key.data, key.size);
+                data.emplace(key, it, inserted);
+            }
         }
     }

     void serialize(BufferWritable& buf) const {
-        write_var_uint(set.size(), buf);
-        for (const auto& elem : set) {
-            write_string_binary(elem.get_value(), buf);
+        DCHECK(!stable);
+        if constexpr (!stable) {
+            write_var_uint(data.size(), buf);
+            for (const auto& elem : data) {
+                write_string_binary(elem.get_value(), buf);
+            }
         }
     }

     void deserialize(BufferReadable& buf, Arena* arena) {
-        UInt64 size;
-        read_var_uint(size, buf);
-
-        StringRef ref;
-        for (size_t i = 0; i < size; ++i) {
-            read_string_binary(ref, buf);
-            set.insert(ref);
+        DCHECK(!stable);
+        if constexpr (!stable) {
+            UInt64 size;
+            read_var_uint(size, buf);
+
+            StringRef ref;
+            for (size_t i = 0; i < size; ++i) {
+                read_string_binary(ref, buf);
+                data.insert(ref);
+            }
         }
     }
 };

-template <bool is_plain_column>
-struct AggregateFunctionDistinctSingleGenericData : public AggregateFunctionDistinctGenericData {
+template <bool stable>
+struct AggregateFunctionDistinctSingleGenericData
+        : public AggregateFunctionDistinctGenericData<stable> {
+    using Base = AggregateFunctionDistinctGenericData<stable>;
+    using Base::data;
     void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena* arena) {
-        Set::LookupResult it;
-        bool inserted;
         auto key = columns[0]->get_data_at(row_num);
         key.data = arena->insert(key.data, key.size);
-        set.emplace(key, it, inserted);
+
+        if constexpr (stable) {
+            data.emplace(key, data.size());
+        } else {
+            typename Base::Container::LookupResult it;
+            bool inserted;
+            data.emplace(key, it, inserted);
+        }
     }

     MutableColumns get_arguments(const DataTypes& argument_types) const {
         MutableColumns argument_columns;
         argument_columns.emplace_back(argument_types[0]->create_column());
-        for (const auto& elem : set) {
-            argument_columns[0]->insert_data(elem.get_value().data, elem.get_value().size);
+        if constexpr (stable) {
+            std::vector<StringRef> tmp(data.size());
+            for (auto it : data) {
+                tmp[it.second] = it.first;
+            }
+            for (int i = 0; i < data.size(); i++) {
+                argument_columns[0]->insert_data(tmp[i].data, tmp[i].size);
+            }
+        } else {
+            for (const auto& elem : data) {
+                argument_columns[0]->insert_data(elem.get_value().data, elem.get_value().size);
+            }
         }

         return argument_columns;
     }
 };
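For multi-argument DISTINCT (next hunk), the dedup key is one contiguous byte string built by serializing the row's value from every argument column back-to-back, which is why the StringRef keeps growing inside the loop. A simplified stand-in that uses length-prefixed strings instead of `serialize_value_into_arena`:

```cpp
#include <cstdint>
#include <string>
#include <vector>

// Build one byte-string key for a row across several string columns.
std::string make_row_key(const std::vector<std::vector<std::string>>& columns, size_t row) {
    std::string key;
    for (const auto& column : columns) {
        const std::string& cell = column[row];
        // Length-prefix each field so "ab"+"c" and "a"+"bc" yield distinct keys.
        uint32_t len = (uint32_t)cell.size();
        key.append(reinterpret_cast<const char*>(&len), sizeof(len));
        key.append(cell);
    }
    return key;
}
```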
-struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDistinctGenericData {
+template <bool stable>
+struct AggregateFunctionDistinctMultipleGenericData
+        : public AggregateFunctionDistinctGenericData<stable> {
+    using Base = AggregateFunctionDistinctGenericData<stable>;
+    using Base::data;
     void add(const IColumn** columns, size_t columns_num, size_t row_num, Arena* arena) {
         const char* begin = nullptr;
-        StringRef value(begin, 0);
+        StringRef key(begin, 0);
         for (size_t i = 0; i < columns_num; ++i) {
             auto cur_ref = columns[i]->serialize_value_into_arena(row_num, *arena, begin);
-            value.data = cur_ref.data - value.size;
-            value.size += cur_ref.size;
+            key.data = cur_ref.data - key.size;
+            key.size += cur_ref.size;
         }

-        Set::LookupResult it;
-        bool inserted;
-        value.data = arena->insert(value.data, value.size);
-        set.emplace(value, it, inserted);
+        if constexpr (stable) {
+            data.emplace(key, data.size());
+        } else {
+            typename Base::Container::LookupResult it;
+            bool inserted;
+            data.emplace(key, it, inserted);
+        }
     }

     MutableColumns get_arguments(const DataTypes& argument_types) const {
@@ -161,10 +230,23 @@ struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDi
             argument_columns[i] = argument_types[i]->create_column();
         }

-        for (const auto& elem : set) {
-            const char* begin = elem.get_value().data;
-            for (auto& column : argument_columns) {
-                begin = column->deserialize_and_insert_from_arena(begin);
+        if constexpr (stable) {
+            std::vector<StringRef> tmp(data.size());
+            for (auto it : data) {
+                tmp[it.second] = it.first;
+            }
+            for (int i = 0; i < data.size(); i++) {
+                const char* begin = tmp[i].data;
+                for (auto& column : argument_columns) {
+                    begin = column->deserialize_and_insert_from_arena(begin);
+                }
+            }
+        } else {
+            for (const auto& elem : data) {
+                const char* begin = elem.get_value().data;
+                for (auto& column : argument_columns) {
+                    begin = column->deserialize_and_insert_from_arena(begin);
+                }
             }
         }

@@ -175,9 +257,10 @@ struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDi
 /** Adaptor for aggregate functions.
  * Adding -Distinct suffix to aggregate function
 **/
-template <typename Data, typename... Ts>
+template <template <bool stable> typename Data, typename... Ts>