Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix sql failed when using replace function #9524

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
71 changes: 67 additions & 4 deletions dbms/src/Functions/FunctionsStringReplace.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,17 @@ class FunctionStringReplace : public IFunction
auto needle = c1_const->getValue<String>();
auto replacement = c2_const->getValue<String>();

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
std::string result_value;
const auto * src_const = typeid_cast<const ColumnConst *>(column_src.get());
auto src = src_const->getValue<String>();
Impl::constant(src, needle, replacement, pos, occ, match_type, collator, result_value);
auto col_res = ColumnString::create();
col_res->insert(result_value);
column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vector(
Expand Down Expand Up @@ -232,7 +242,25 @@ class FunctionStringReplace : public IFunction
const auto * col_replacement_const = typeid_cast<const ColumnConst *>(column_replacement.get());
auto replacement = col_replacement_const->getValue<String>();

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();

Impl::vectorConstSrcAndReplace(
col_const->getValue<String>(),
col_needle->getChars(),
col_needle->getOffsets(),
replacement,
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets());

column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorNonConstNeedle(
Expand Down Expand Up @@ -292,7 +320,24 @@ class FunctionStringReplace : public IFunction
auto needle = col_needle_const->getValue<String>();
const auto * col_replacement = typeid_cast<const ColumnString *>(column_replacement.get());

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();

Impl::vectorConstSrcAndNeedle(
col_const->getValue<String>(),
needle,
col_replacement->getChars(),
col_replacement->getOffsets(),
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets());
column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorNonConstReplacement(
Expand Down Expand Up @@ -351,7 +396,25 @@ class FunctionStringReplace : public IFunction
const auto * col_needle = typeid_cast<const ColumnString *>(column_needle.get());
const auto * col_replacement = typeid_cast<const ColumnString *>(column_replacement.get());

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();

Impl::vectorConstSrc(
col_const->getValue<String>(),
col_needle->getChars(),
col_needle->getOffsets(),
col_replacement->getChars(),
col_replacement->getOffsets(),
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets());
column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorNonConstNeedleReplacement(
Expand Down
257 changes: 257 additions & 0 deletions dbms/src/Functions/FunctionsStringSearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,263 @@ struct ReplaceStringImpl
}
}

/// Replace occurrences of a per-row needle inside one constant source string,
/// using one constant replacement. Produces one result row per needle row.
///
/// @param data                 the constant source string (the same for every row)
/// @param needle_chars         character buffer of the needle column
/// @param needle_offsets       row offsets of the needle column; its size is the number of result rows
/// @param replacement          the constant replacement string
/// @param res_data             output character buffer; expected to be empty on entry, because
///                             result offsets are counted from zero
/// @param res_offsets          output row offsets, resized to the number of needle rows
///
/// The pos / occ / match_type / collator parameters are accepted for signature
/// compatibility with the other implementations and are not used here.
/// When `replace_one` is true only the first occurrence in each row is replaced.
static void vectorConstSrcAndReplace(
    const std::string & data,
    const ColumnString::Chars_t & needle_chars,
    const ColumnString::Offsets & needle_offsets,
    const std::string & replacement,
    const Int64 & /* pos */,
    const Int64 & /* occ */,
    const std::string & /* match_type */,
    TiDB::TiDBCollatorPtr /* collator */,
    ColumnString::Chars_t & res_data,
    ColumnString::Offsets & res_offsets)
{
    /// The source is constant, so the number of rows is driven by the needle column.
    const size_t rows = needle_offsets.size();

    /// Scan the constant source in place instead of copying it into a temporary column.
    /// std::string guarantees data()[size()] == '\0', so the terminating zero that every
    /// result row must carry can be read directly from the string.
    const UInt8 * const begin = reinterpret_cast<const UInt8 *>(data.data());
    const size_t data_size = data.size() + 1; // payload plus the trailing zero
    const UInt8 * const end = begin + data_size;

    res_data.reserve(data_size * rows);
    res_offsets.resize(rows);

    ColumnString::Offset res_offset = 0;

    /// Append `len` bytes starting at `src` to the result buffer.
    auto append = [&](const void * src, size_t len) {
        if (len == 0)
            return;
        res_data.resize(res_data.size() + len);
        memcpy(&res_data[res_offset], src, len);
        res_offset += len;
    };

    for (size_t i = 0; i < rows; ++i)
    {
        const auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
        const auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero

        if (needle_size == 0)
        {
            /// An empty needle matches nothing: copy the whole source unchanged.
            append(begin, data_size);
            res_offsets[i] = res_offset;
            continue;
        }

        Volnitsky searcher(reinterpret_cast<const char *>(&needle_chars[needle_offset]), needle_size, data_size);

        const UInt8 * pos = begin;
        while (pos < end)
        {
            const UInt8 * match = searcher.search(pos, end - pos);

            /// Copy the data before the match without changing.
            append(pos, match - pos);

            if (match == end)
            {
                /// No further occurrence; the tail (including the trailing zero) is already copied.
                break;
            }

            append(replacement.data(), replacement.size());
            pos = match + needle_size;

            if (replace_one)
            {
                /// Copy the rest of data and stop.
                append(pos, end - pos);
                break;
            }
        }
        res_offsets[i] = res_offset;
    }
}

/// Replace occurrences of a constant needle inside one constant source string,
/// using a per-row replacement. Produces one result row per replacement row.
///
/// @param data                 the constant source string (the same for every row)
/// @param needle               the constant needle; an empty needle leaves the source unchanged
/// @param replacement_chars    character buffer of the replacement column
/// @param replacement_offsets  row offsets of the replacement column; its size is the number of result rows
/// @param res_data             output character buffer; expected to be empty on entry, because
///                             result offsets are counted from zero
/// @param res_offsets          output row offsets, resized to the number of replacement rows
///
/// The pos / occ / match_type / collator parameters are accepted for signature
/// compatibility with the other implementations and are not used here.
/// When `replace_one` is true only the first occurrence in each row is replaced.
static void vectorConstSrcAndNeedle(
    const std::string & data,
    const std::string & needle,
    const ColumnString::Chars_t & replacement_chars,
    const ColumnString::Offsets & replacement_offsets,
    const Int64 & /* pos */,
    const Int64 & /* occ */,
    const std::string & /* match_type */,
    TiDB::TiDBCollatorPtr /* collator */,
    ColumnString::Chars_t & res_data,
    ColumnString::Offsets & res_offsets)
{
    /// Source and needle are constant, so the number of rows is driven by the replacement column.
    const size_t rows = replacement_offsets.size();

    /// Scan the constant source in place instead of copying it into a temporary column.
    /// std::string guarantees data()[size()] == '\0', so the terminating zero that every
    /// result row must carry can be read directly from the string.
    const UInt8 * const begin = reinterpret_cast<const UInt8 *>(data.data());
    const size_t data_size = data.size() + 1; // payload plus the trailing zero
    const UInt8 * const end = begin + data_size;

    res_data.reserve(data_size * rows);
    res_offsets.resize(rows);

    ColumnString::Offset res_offset = 0;

    /// Append `len` bytes starting at `src` to the result buffer.
    auto append = [&](const void * src, size_t len) {
        if (len == 0)
            return;
        res_data.resize(res_data.size() + len);
        memcpy(&res_data[res_offset], src, len);
        res_offset += len;
    };

    if (needle.empty())
    {
        /// Copy all the data without changing, once per row.
        for (size_t i = 0; i < rows; ++i)
        {
            append(begin, data_size);
            res_offsets[i] = res_offset;
        }
        return;
    }

    /// The needle is the same for every row, so one searcher serves all of them.
    Volnitsky searcher(needle.data(), needle.size(), data_size);

    for (size_t i = 0; i < rows; ++i)
    {
        const auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i);
        const auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero

        const UInt8 * pos = begin;
        while (pos < end)
        {
            const UInt8 * match = searcher.search(pos, end - pos);

            /// Copy the data before the match without changing.
            append(pos, match - pos);

            if (match == end)
            {
                /// No further occurrence; the tail (including the trailing zero) is already copied.
                break;
            }

            /// We check that the entry does not run into the trailing zero of the source.
            if (match + needle.size() < end)
            {
                append(&replacement_chars[replacement_offset], replacement_size);
                pos = match + needle.size();
                if (!replace_one)
                    continue;
            }
            else
            {
                pos = match;
            }

            /// This row needs no further transformations: copy the rest of data and stop.
            append(pos, end - pos);
            break;
        }
        res_offsets[i] = res_offset;
    }
}

/// Replace occurrences of a per-row needle inside one constant source string,
/// using a per-row replacement. Produces one result row per needle row.
///
/// @param data                 the constant source string (the same for every row)
/// @param needle_chars         character buffer of the needle column
/// @param needle_offsets       row offsets of the needle column; its size is the number of result rows
/// @param replacement_chars    character buffer of the replacement column
/// @param replacement_offsets  row offsets of the replacement column
///                             (assumed to have as many rows as the needle column -- confirm against caller)
/// @param res_data             output character buffer; expected to be empty on entry, because
///                             result offsets are counted from zero
/// @param res_offsets          output row offsets, resized to the number of needle rows
///
/// The pos / occ / match_type / collator parameters are accepted for signature
/// compatibility with the other implementations and are not used here.
/// When `replace_one` is true only the first occurrence in each row is replaced.
static void vectorConstSrc(
    const std::string & data,
    const ColumnString::Chars_t & needle_chars,
    const ColumnString::Offsets & needle_offsets,
    const ColumnString::Chars_t & replacement_chars,
    const ColumnString::Offsets & replacement_offsets,
    const Int64 & /* pos */,
    const Int64 & /* occ */,
    const std::string & /* match_type */,
    TiDB::TiDBCollatorPtr /* collator */,
    ColumnString::Chars_t & res_data,
    ColumnString::Offsets & res_offsets)
{
    /// The source is constant, so the number of rows is driven by the non-constant columns.
    const size_t rows = needle_offsets.size();

    /// Scan the constant source in place instead of copying it into a temporary column.
    /// std::string guarantees data()[size()] == '\0', so the terminating zero that every
    /// result row must carry can be read directly from the string.
    const UInt8 * const begin = reinterpret_cast<const UInt8 *>(data.data());
    const size_t data_size = data.size() + 1; // payload plus the trailing zero
    const UInt8 * const end = begin + data_size;

    res_data.reserve(data_size * rows);
    res_offsets.resize(rows);

    ColumnString::Offset res_offset = 0;

    /// Append `len` bytes starting at `src` to the result buffer.
    auto append = [&](const void * src, size_t len) {
        if (len == 0)
            return;
        res_data.resize(res_data.size() + len);
        memcpy(&res_data[res_offset], src, len);
        res_offset += len;
    };

    for (size_t i = 0; i < rows; ++i)
    {
        const auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
        const auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero

        const auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i);
        const auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero

        if (needle_size == 0)
        {
            /// An empty needle matches nothing: copy the whole source unchanged.
            append(begin, data_size);
            res_offsets[i] = res_offset;
            continue;
        }

        Volnitsky searcher(reinterpret_cast<const char *>(&needle_chars[needle_offset]), needle_size, data_size);

        const UInt8 * pos = begin;
        while (pos < end)
        {
            const UInt8 * match = searcher.search(pos, end - pos);

            /// Copy the data before the match without changing.
            append(pos, match - pos);

            if (match == end)
            {
                /// No further occurrence; the tail (including the trailing zero) is already copied.
                break;
            }

            append(&replacement_chars[replacement_offset], replacement_size);
            pos = match + needle_size;

            if (replace_one)
            {
                /// Copy the rest of data and stop.
                append(pos, end - pos);
                break;
            }
        }
        res_offsets[i] = res_offset;
    }
}


static void vectorNonConstReplacement(
const ColumnString::Chars_t & data,
const ColumnString::Offsets & offsets,
Expand Down
Loading