Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix sql failed when using replace function #9524

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
123 changes: 122 additions & 1 deletion dbms/src/Functions/FunctionsStringReplace.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,40 @@ class FunctionStringReplace : public IFunction

ColumnWithTypeAndName & column_result = block.getByPosition(result);

bool src_const = column_src->isColumnConst();
bool needle_const = column_needle->isColumnConst();
bool replacement_const = column_replacement->isColumnConst();

if (needle_const && replacement_const)
if (src_const){
EricZequan marked this conversation as resolved.
Show resolved Hide resolved
if (!needle_const && replacement_const)
{
executeImplConstReplacement(
column_src,
column_needle,
column_replacement,
pos,
occ,
match_type,
column_result
);
}else if (!needle_const && !replacement_const)
{
executeImplConstFirstParaReplacement(
column_src,
column_needle,
column_replacement,
pos,
occ,
match_type,
column_result
);
}else
{
throw Exception(
"UnImplement function.",
ErrorCodes::BAD_ARGUMENTS);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly no need for a lot of impls because we are fixing for an edge case that only discovered in tests. It is acceptable to be not performance optimal.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my test case, some sql like :

explain SELECT /*+ read_from_storage(tiflash[my_table]) */ REPLACE('Hello World', my_table.col_11, my_table.col_14)  FROM my_table;

will get same error. I think it may need to process, so I add these function to deal with such problems. 😂

Copy link
Member

@breezewish breezewish Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I mean we could use a slower but more general impl for such cases.

}else if (needle_const && replacement_const)
{
executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result);
}
Expand Down Expand Up @@ -400,6 +430,97 @@ class FunctionStringReplace : public IFunction
}
}

void executeImplConstFirstParaReplacement(
const ColumnPtr & column_src,
const ColumnPtr & column_needle,
const ColumnPtr & column_replacement,
Int64 pos [[maybe_unused]],
Int64 occ [[maybe_unused]],
const String & match_type,
ColumnWithTypeAndName & column_result) const
{
if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement)
{
const auto * col_needle = typeid_cast<const ColumnString *>(column_needle.get());
const auto * col_replacement = typeid_cast<const ColumnString *>(column_replacement.get());
if (const auto * col = checkAndGetColumn<ColumnConst>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorConstFirstParaReplacement(
col->getValue<String>(),
col_needle->getChars(),
col_needle->getOffsets(),
col_replacement->getChars(),
col_replacement->getOffsets(),
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets()
);
column_result.column = std::move(col_res);
}
else
throw Exception(
"Illegal column " + column_src->getName() + " of first argument of function " + getName() +
"Illegal column " + column_needle->getName() + " of second argument of function " + getName() +
"Illegal column " + column_replacement->getName() + " of third argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
else
{
throw Exception(
"Argument at index 1 for function replace must be constant",
ErrorCodes::ILLEGAL_COLUMN);
}
}

void executeImplConstReplacement(
const ColumnPtr & column_src,
const ColumnPtr & column_needle,
const ColumnPtr & column_replacement,
Int64 pos [[maybe_unused]],
Int64 occ [[maybe_unused]],
const String & match_type,
ColumnWithTypeAndName & column_result) const
{
if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement)
{
const auto * col_needle = typeid_cast<const ColumnString *>(column_needle.get());
const auto * col_replacement = typeid_cast<const ColumnConst *>(column_replacement.get());
if (const auto * col = checkAndGetColumn<ColumnConst>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorConstReplacement(
col->getValue<String>(),
col_needle->getChars(),
col_needle->getOffsets(),
col_replacement->getValue<String>(),
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets()
);
column_result.column = std::move(col_res);
}
else
throw Exception(
"Illegal column " + column_src->getName() + " of first argument of function " + getName() +
"Illegal column " + column_needle->getName() + " of second argument of function " + getName() +
"Illegal column " + column_replacement->getName() + " of third argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
else
{
throw Exception(
"Argument at index 1 and 3 for function replace must be constant",
ErrorCodes::ILLEGAL_COLUMN);
}
}

TiDB::TiDBCollatorPtr collator{};
};
} // namespace DB
160 changes: 160 additions & 0 deletions dbms/src/Functions/FunctionsStringSearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,166 @@ struct ReplaceStringImpl
}
}

// Handle the case where `column_src` and `replace` are const
static void vectorConstReplacement(
Copy link
Member

@breezewish breezewish Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we modify existing functions instead of introducing new ones? Existing functions are already capable of handling a list of sources. It should be better to allow it handling constant source (i.e. one source), minimizing the changes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these functions are necessary because the first parameter of existing functions is Column by default, which will be parsed into ColumnString in FunctionsStringSearch.cpp, and our implementation needs to parse the parameter into ColumnConst. ColumnConst does not have getChars and getOffsets methods, so existing functions cannot be used.

Copy link
Member

@breezewish breezewish Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't make sense. Think about it. A function can now calculate a bunch of FN(A, B), why cannot calculate a single row of FN(ConstA, ConstB)? What we want to support is a subset of the current capability.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I found a way to convert between the two types, so that now the existing function can be used directly, and the amount of code is greatly reduced.

const std::string & data,
const ColumnString::Chars_t & needle_chars,
const ColumnString::Offsets & needle_offsets,
const std::string & replace_chars,
const Int64 & /* pos */,
const Int64 & /* occ */,
const std::string & /* match_type */,
TiDB::TiDBCollatorPtr /* collator */,
ColumnString::Chars_t & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(data.size());
res_offsets.resize(needle_offsets.size());
ColumnString::Offset res_offset = 0;

for (size_t i = 0; i < needle_offsets.size(); ++i)
{
auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero

// Copy the data without changing if `need_size` is null.
if (needle_size == 0)
{
res_data.resize(res_data.size() + data.size());
memcpy(&res_data[res_offset], data.data(), data.size());
res_offset += data.size();
res_offsets[i] = res_offset;
continue;
}

size_t pos_in_data = 0; // trace the location in `data`
int replace_cnt = 0;

while (pos_in_data < data.size())
{
bool match = true;

// Check if there are enough characters to match
if (pos_in_data + needle_size > data.size() || (replace_one && replace_cnt > 0))
match = false;

// Check if characters match
for (size_t j = 0; match && j < needle_size; ++j)
if (data[pos_in_data + j] != needle_chars[needle_offset + j])
match = false;

// If it matches, replace `needle` with `replacement`
if (match)
{
++replace_cnt;
res_data.resize(res_data.size() + replace_chars.size());
memcpy(&res_data[res_offset], replace_chars.data(), replace_chars.size());
res_offset += replace_chars.size();
pos_in_data += needle_size;
}
else
{
// Copy unmatched characters
res_data.resize(res_data.size() + 1);
res_data[res_offset] = data[pos_in_data];
res_offset += 1;
pos_in_data += 1;
}

// If `replace_one` is enabled, stop after replacing once.
if (replace_one && replace_cnt > 0)
{
// Process the rest
res_data.resize(res_data.size() + (data.size() - pos_in_data));
memcpy(&res_data[res_offset], data.data() + pos_in_data, data.size() - pos_in_data);
res_offset += (data.size() - pos_in_data);
break;
}
}

res_offsets[i] = res_offset;
}
}

// Handle the case where `column_src` are const
static void vectorConstFirstParaReplacement(
const std::string & data,
const ColumnString::Chars_t & needle_chars,
const ColumnString::Offsets & needle_offsets,
const ColumnString::Chars_t & replacement_chars,
const ColumnString::Offsets & replacement_offsets,
const Int64 & /* pos */,
const Int64 & /* occ */,
const std::string & /* match_type */,
TiDB::TiDBCollatorPtr /* collator */,
ColumnString::Chars_t & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(data.size());
res_offsets.resize(needle_offsets.size());
ColumnString::Offset res_offset = 0;

for (size_t i = 0; i < needle_offsets.size(); ++i)
{
// 获取当前 needle 的起始位置和大小
auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore trailing zero bytes

// If needle is empty, copy the entire data directly
if (needle_size == 0)
{
res_data.resize(res_data.size() + data.size());
memcpy(&res_data[res_offset], data.data(), data.size());
res_offset += data.size();
res_offsets[i] = res_offset;
continue;
}

Volnitsky searcher(reinterpret_cast<const char *>(&needle_chars[needle_offset]), needle_size, data.size());
size_t pos_in_data = 0; // trace the location in `data`

while (pos_in_data < data.size())
{
const char* match = searcher.search(data.data() + pos_in_data, data.size() - pos_in_data);

// Copy unmatched characters
size_t unmatched_len = match - (data.data() + pos_in_data);
res_data.resize(res_data.size() + unmatched_len);
memcpy(&res_data[res_offset], data.data() + pos_in_data, unmatched_len);
res_offset += unmatched_len;
pos_in_data += unmatched_len;

if (match == data.data() + data.size())
{
break;
}

if (pos_in_data + needle_size <= data.size())
{
auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i);
auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // 忽略末尾的零字节
EricZequan marked this conversation as resolved.
Show resolved Hide resolved

res_data.resize(res_data.size() + replacement_size);
memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size);
res_offset += replacement_size;
pos_in_data += needle_size;
}

// If `replace_one` is enabled, stop after replacing once.
if (replace_one)
{
size_t remaining_len = data.size() - pos_in_data;
res_data.resize(res_data.size() + remaining_len);
memcpy(&res_data[res_offset], data.data() + pos_in_data, remaining_len);
res_offset += remaining_len;
break;
}
}

res_offsets[i] = res_offset;
}
}

static void vectorNonConstNeedleReplacement(
const ColumnString::Chars_t & data,
const ColumnString::Offsets & offsets,
Expand Down