diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h
index 604c2479bb0..6b58f5c7370 100644
--- a/dbms/src/Functions/FunctionsStringReplace.h
+++ b/dbms/src/Functions/FunctionsStringReplace.h
@@ -179,7 +179,18 @@ class FunctionStringReplace : public IFunction
         auto needle = c1_const->getValue<String>();
         auto replacement = c2_const->getValue<String>();
 
-        if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
+        if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
+        {
+            /// Source, needle and replacement are all constant: compute the replacement once.
+            /// NOTE(review): the result is a one-row column; confirm it matches the block's row count.
+            std::string result_value;
+            auto src = col_const->getValue<String>();
+            Impl::constant(src, needle, replacement, pos, occ, match_type, collator, result_value);
+            auto col_res = ColumnString::create();
+            col_res->insert(result_value);
+            column_result.column = std::move(col_res);
+        }
+        else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
         {
             auto col_res = ColumnString::create();
             Impl::vector(
@@ -232,7 +243,26 @@ class FunctionStringReplace : public IFunction
         const auto * col_replacement_const = typeid_cast<const ColumnConst *>(column_replacement.get());
         auto replacement = col_replacement_const->getValue<String>();
 
-        if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
+        if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
+        {
+            /// Constant source with a per-row needle: one result row is produced per needle row.
+            auto col_res = ColumnString::create();
+
+            Impl::vectorConstSrcAndReplace(
+                col_const->getValue<String>(),
+                col_needle->getChars(),
+                col_needle->getOffsets(),
+                replacement,
+                pos,
+                occ,
+                match_type,
+                collator,
+                col_res->getChars(),
+                col_res->getOffsets());
+
+            column_result.column = std::move(col_res);
+        }
+        else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
         {
             auto col_res = ColumnString::create();
             Impl::vectorNonConstNeedle(
@@ -292,7 +322,25 @@ class FunctionStringReplace : public IFunction
         auto needle = col_needle_const->getValue<String>();
         const auto * col_replacement = typeid_cast<const ColumnString *>(column_replacement.get());
 
-        if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
+        if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
+        {
+            /// Constant source and needle with a per-row replacement: one result row per replacement row.
+            auto col_res = ColumnString::create();
+
+            Impl::vectorConstSrcAndNeedle(
+                col_const->getValue<String>(),
+                needle,
+                col_replacement->getChars(),
+                col_replacement->getOffsets(),
+                pos,
+                occ,
+                match_type,
+                collator,
+                col_res->getChars(),
+                col_res->getOffsets());
+            column_result.column = std::move(col_res);
+        }
+        else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
         {
             auto col_res = ColumnString::create();
             Impl::vectorNonConstReplacement(
@@ -351,7 +399,26 @@ class FunctionStringReplace : public IFunction
         const auto * col_needle = typeid_cast<const ColumnString *>(column_needle.get());
         const auto * col_replacement = typeid_cast<const ColumnString *>(column_replacement.get());
 
-        if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
+        if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
+        {
+            /// Constant source with per-row needle and replacement: one result row per needle row.
+            auto col_res = ColumnString::create();
+
+            Impl::vectorConstSrc(
+                col_const->getValue<String>(),
+                col_needle->getChars(),
+                col_needle->getOffsets(),
+                col_replacement->getChars(),
+                col_replacement->getOffsets(),
+                pos,
+                occ,
+                match_type,
+                collator,
+                col_res->getChars(),
+                col_res->getOffsets());
+            column_result.column = std::move(col_res);
+        }
+        else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
         {
             auto col_res = ColumnString::create();
             Impl::vectorNonConstNeedleReplacement(
diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp
index 5b5318bcc30..e2b33a5f943 100644
--- a/dbms/src/Functions/FunctionsStringSearch.cpp
+++ b/dbms/src/Functions/FunctionsStringSearch.cpp
@@ -972,6 +972,210 @@ struct ReplaceStringImpl
         }
     }
 
+    /// Copies the constant source string `data`, including its terminating zero, to the end of
+    /// `res_data` as one output row and advances `res_offset` past it.
+    static void copyConstSrc(const std::string & data, ColumnString::Chars_t & res_data, ColumnString::Offset & res_offset)
+    {
+        const size_t row_size = data.size() + 1; // data.data() is always zero-terminated
+        res_data.resize(res_data.size() + row_size);
+        memcpy(&res_data[res_offset], data.data(), row_size);
+        res_offset += row_size;
+    }
+
+    /// Appends one zero-terminated output row to `res_data`: the constant source string `data`
+    /// with the occurrences found by `searcher` (all of them, or only the first one when
+    /// `replace_one` is set) substituted by `replacement`. `searcher` must have been built for
+    /// a non-empty needle of `needle_size` bytes. `res_offset` is advanced past the new row.
+    static void replaceInConstSrc(
+        const std::string & data,
+        Volnitsky & searcher,
+        size_t needle_size,
+        const char * replacement,
+        size_t replacement_size,
+        ColumnString::Chars_t & res_data,
+        ColumnString::Offset & res_offset)
+    {
+        const auto * pos = reinterpret_cast<const UInt8 *>(data.data());
+        /// The terminating zero is part of the searched range so that it ends up in the output row.
+        const UInt8 * terminator = pos + data.size();
+        const UInt8 * end = terminator + 1;
+
+        while (pos < end)
+        {
+            const UInt8 * match = searcher.search(pos, end - pos);
+
+            /// An occurrence must not swallow the terminating zero of the source string.
+            if (match == end || static_cast<size_t>(terminator - match) < needle_size)
+                match = end;
+
+            /// Copy the data before the occurrence without changing.
+            res_data.resize(res_data.size() + (match - pos));
+            memcpy(&res_data[res_offset], pos, match - pos);
+            res_offset += match - pos;
+
+            /// No occurrence left: the tail, including the terminating zero, has just been copied.
+            if (match == end)
+                break;
+
+            res_data.resize(res_data.size() + replacement_size);
+            memcpy(&res_data[res_offset], replacement, replacement_size);
+            res_offset += replacement_size;
+            pos = match + needle_size;
+
+            if (replace_one)
+            {
+                /// Copy the rest of data and stop.
+                res_data.resize(res_data.size() + (end - pos));
+                memcpy(&res_data[res_offset], pos, end - pos);
+                res_offset += end - pos;
+                break;
+            }
+        }
+    }
+
+    /// Replace for a constant source string, per-row needles and a constant replacement.
+    /// Emits one output row per needle row; `res_data` and `res_offsets` are expected to be empty.
+    static void vectorConstSrcAndReplace(
+        const std::string & data,
+        const ColumnString::Chars_t & needle_chars,
+        const ColumnString::Offsets & needle_offsets,
+        const std::string & replacement,
+        const Int64 & /* pos */,
+        const Int64 & /* occ */,
+        const std::string & /* match_type */,
+        TiDB::TiDBCollatorPtr /* collator */,
+        ColumnString::Chars_t & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        /// The source is constant, so the needle column defines the number of rows.
+        const size_t rows = needle_offsets.size();
+        res_data.reserve((data.size() + 1) * rows);
+        res_offsets.resize(rows);
+
+        ColumnString::Offset res_offset = 0;
+
+        for (size_t i = 0; i < rows; ++i)
+        {
+            const auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
+            const auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero
+
+            if (needle_size == 0)
+            {
+                /// Empty needle: copy the whole data to res without changing.
+                copyConstSrc(data, res_data, res_offset);
+            }
+            else
+            {
+                Volnitsky searcher(reinterpret_cast<const char *>(&needle_chars[needle_offset]), needle_size, data.size() + 1);
+                replaceInConstSrc(data, searcher, needle_size, replacement.data(), replacement.size(), res_data, res_offset);
+            }
+            res_offsets[i] = res_offset;
+        }
+    }
+
+    /// Replace for a constant source string, a constant needle and per-row replacements.
+    /// Emits one output row per replacement row; `res_data` and `res_offsets` are expected to be empty.
+    static void vectorConstSrcAndNeedle(
+        const std::string & data,
+        const std::string & needle,
+        const ColumnString::Chars_t & replacement_chars,
+        const ColumnString::Offsets & replacement_offsets,
+        const Int64 & /* pos */,
+        const Int64 & /* occ */,
+        const std::string & /* match_type */,
+        TiDB::TiDBCollatorPtr /* collator */,
+        ColumnString::Chars_t & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        /// Source and needle are constant, so the replacement column defines the number of rows.
+        const size_t rows = replacement_offsets.size();
+        res_data.reserve((data.size() + 1) * rows);
+        res_offsets.resize(rows);
+
+        ColumnString::Offset res_offset = 0;
+
+        if (needle.empty())
+        {
+            /// Empty needle: every row is an unchanged copy of the source.
+            for (size_t i = 0; i < rows; ++i)
+            {
+                copyConstSrc(data, res_data, res_offset);
+                res_offsets[i] = res_offset;
+            }
+            return;
+        }
+
+        /// The needle is the same for every row, so a single searcher is reused.
+        Volnitsky searcher(needle.data(), needle.size(), data.size() + 1);
+
+        for (size_t i = 0; i < rows; ++i)
+        {
+            const auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i);
+            const auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero
+
+            replaceInConstSrc(
+                data,
+                searcher,
+                needle.size(),
+                reinterpret_cast<const char *>(&replacement_chars[replacement_offset]),
+                replacement_size,
+                res_data,
+                res_offset);
+            res_offsets[i] = res_offset;
+        }
+    }
+
+    /// Replace for a constant source string with per-row needles and per-row replacements.
+    /// Emits one output row per needle row; `res_data` and `res_offsets` are expected to be empty.
+    static void vectorConstSrc(
+        const std::string & data,
+        const ColumnString::Chars_t & needle_chars,
+        const ColumnString::Offsets & needle_offsets,
+        const ColumnString::Chars_t & replacement_chars,
+        const ColumnString::Offsets & replacement_offsets,
+        const Int64 & /* pos */,
+        const Int64 & /* occ */,
+        const std::string & /* match_type */,
+        TiDB::TiDBCollatorPtr /* collator */,
+        ColumnString::Chars_t & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        /// Row count comes from the needle column; the replacement column is indexed by the same row.
+        const size_t rows = needle_offsets.size();
+        res_data.reserve((data.size() + 1) * rows);
+        res_offsets.resize(rows);
+
+        ColumnString::Offset res_offset = 0;
+
+        for (size_t i = 0; i < rows; ++i)
+        {
+            const auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
+            const auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero
+
+            const auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i);
+            const auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero
+
+            if (needle_size == 0)
+            {
+                /// Empty needle: copy the whole data to res without changing.
+                copyConstSrc(data, res_data, res_offset);
+            }
+            else
+            {
+                Volnitsky searcher(reinterpret_cast<const char *>(&needle_chars[needle_offset]), needle_size, data.size() + 1);
+                replaceInConstSrc(
+                    data,
+                    searcher,
+                    needle_size,
+                    reinterpret_cast<const char *>(&replacement_chars[replacement_offset]),
+                    replacement_size,
+                    res_data,
+                    res_offset);
+            }
+            res_offsets[i] = res_offset;
+        }
+    }
+
     static void vectorNonConstReplacement(
         const ColumnString::Chars_t & data,
         const ColumnString::Offsets & offsets,
diff --git a/dbms/src/Functions/tests/gtest_strings_replace.cpp b/dbms/src/Functions/tests/gtest_strings_replace.cpp
index 4615d634e5f..dbe26a2ea46 100644
--- a/dbms/src/Functions/tests/gtest_strings_replace.cpp
+++ b/dbms/src/Functions/tests/gtest_strings_replace.cpp
@@ -104,6 +104,38 @@ try
             toVec({" hello ", " h e llo", "hello ", " ", "hello, world"}),
             toVec({" ", "h", "", "h", ","}),
             toVec({"", "x", "xx", " ", ","})));
+
+    /// const src replacement
+    ASSERT_COLUMN_EQ(
+        toVec({"Good Night", "Bad Afternoon", "Good Afterwhile"}),
+        executeFunction(
+            "replaceAll",
+            toVec({"Good Afternoon"}),
+            toVec({"Afternoon", "Good", "noon"}),
+            toVec({"Night", "Bad", "while"})));
+
+    /// const src and needle replacement
+    ASSERT_COLUMN_EQ(
+        toVec({"Good Night", "Good Bad", "Good while"}),
+        executeFunction(
+            "replaceAll",
+            toVec({"Good Afternoon"}),
+            toConst({"Afternoon"}),
+            toVec({"Night", "Bad", "while"})));
+
+    /// const src and replace replacement
+    ASSERT_COLUMN_EQ(
+        toVec({"Good Night", "Night Afternoon", "Good AfterNight"}),
+        executeFunction(
+            "replaceAll",
+            toVec({"Good Afternoon"}),
+            toVec({"Afternoon", "Good", "noon"}),
+            toConst({"Night"})));
+
+    /// single-row src with const needle and const replacement
+    ASSERT_COLUMN_EQ(
+        toVec({"Good Night"}),
+        executeFunction("replaceAll", toVec({"Good Afternoon"}), toConst({"Afternoon"}), toConst({"Night"})));
 }
 CATCH
 