Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add regex replace functionality to transformation filter extractors #301

Merged
merged 15 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,12 @@ message Extraction {
// If your regex contains capturing groups, use this field to determine which
// group should be selected.
uint32 subgroup = 3;

// The string to replace the matched portion of the source with
google.protobuf.StringValue replacement_text = 5;

// If set to true, all matches of the regex in the source will be replaced by the replacement_text.
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
bool replace_all = 6;
}

// Defines a transformation template.
Expand Down
7 changes: 7 additions & 0 deletions changelog/v1.27.2-patch2/extractor_regex_replace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
changelog:
- type: NEW_FEATURE
resolvesIssue: false
issueLink: https://github.com/solo-io/gloo/issues/8706
description: >
Update transformation filter extractors to support regex
replace/replace all operations on extracted values.
111 changes: 108 additions & 3 deletions source/extensions/filters/http/transformation/inja_transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ getHeader(const Http::RequestOrResponseHeaderMap &header_map,
Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
: headername_(extractor.header()), body_(extractor.has_body()),
group_(extractor.subgroup()),
extract_regex_(Solo::Regex::Utility::parseStdRegex(extractor.regex())) {
extract_regex_(Solo::Regex::Utility::parseStdRegex(extractor.regex())),
has_replacement_text_(extractor.has_replacement_text()),
replacement_text_(extractor.replacement_text().value()),
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
replace_all_(extractor.replace_all()) {
// mark count == number of sub groups, and we need to add one for match number
// 0 so we test for < instead of <= see:
// http://www.cplusplus.com/reference/regex/basic_regex/mark_count/
Expand All @@ -65,6 +68,26 @@ Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
fmt::format("group {} requested for regex with only {} sub groups",
group_, extract_regex_.mark_count()));
}

// if replace_all is set, we must have replacement text
if (replace_all_ && !has_replacement_text_) {
throw EnvoyException(
fmt::format("replace_all set but no replacement text provided"));
}

// if replace_all is set, subgroup should be 0
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
if (replace_all_ && group_ != 0) {
throw EnvoyException(
fmt::format("replace_all set but subgroup is not 0"));
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
}

// extractionFunc is either replaceValue or extractValue depending on whether
// replacement_text_ is empty or not
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
if (has_replacement_text_) {
extraction_func_ = std::bind(&Extractor::replaceValue, this, _1, _2);
} else {
extraction_func_ = std::bind(&Extractor::extractValue, this, _1, _2);
}
}

absl::string_view
Expand All @@ -74,13 +97,14 @@ Extractor::extract(Http::StreamFilterCallbacks &callbacks,
if (body_) {
const std::string &string_body = body();
absl::string_view sv(string_body);
return extractValue(callbacks, sv);
return extraction_func_(callbacks, sv);
} else {
const Http::HeaderMap::GetResult header_entries = getHeader(header_map, headername_);
if (header_entries.empty()) {
return "";
}
return extractValue(callbacks, header_entries[0]->value().getStringView());
const auto &header_value = header_entries[0]->value().getStringView();
return extraction_func_(callbacks, header_value);
}
}

Expand All @@ -105,6 +129,87 @@ Extractor::extractValue(Http::StreamFilterCallbacks &callbacks,
return "";
}

// Match a regex against the input value and replace the matched subgroup with the replacement_text_ value
// writes the result to replaced_value_ and returns a absl::string_view to it
// if replace_all_ is true, __all__ substrings matching the regex in the input value will be replaced
// with the replacement_text_ value
absl::string_view
Extractor::replaceValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const {
if (replace_all_) {
return replaceAllValues(callbacks, value);
} else {
return replaceIndividualValue(callbacks, value);
}
}

absl::string_view
Extractor::replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const {
std::match_results<absl::string_view::const_iterator> regex_result;

// if there are no matches, return an empty string
if (!std::regex_search(value.begin(), value.end(), regex_result, extract_regex_)) {
ENVOY_STREAM_LOG(debug, "replaceValue: extractor regex did not match input", callbacks);
return "";
}

// if the subgroup specified is greater than the number of subgroups in the regex, return an empty string
if (group_ >= regex_result.size()) {
ASSERT("no such group in the regex");
jbohanon marked this conversation as resolved.
Show resolved Hide resolved
ENVOY_STREAM_LOG(debug, "replaceValue: invalid group specified for regex", callbacks);
return "";
}

// if the regex doesn't match the entire input value, return an empty string
if (regex_result[0].length() != long(value.length())) {
ENVOY_STREAM_LOG(debug, "replaceValue: Regex did not match entire input value. This is not allowed when replace_all is false", callbacks);
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
return "";
}

// Create a new string with the input value
std::string replaced(value.begin(), value.end());

// Replace the specified subgroup with the replacement_text_ value
const auto &sub_match = regex_result[group_];
auto start_pos = sub_match.first - value.begin();
auto length = sub_match.length();
replaced.replace(start_pos, length, replacement_text_);

// Store the replaced string in replaced_value_ and return an absl::string_view to it
replaced_value_ = std::move(replaced);
return absl::string_view(replaced_value_);
ashishb-solo marked this conversation as resolved.
Show resolved Hide resolved
}

absl::string_view
Extractor::replaceAllValues(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const {
std::string input(value.begin(), value.end());
std::string replaced;
bool matchFound = false;

// create an iterator to search for matches of extract_regex_ in the input string
std::sregex_iterator it(input.begin(), input.end(), extract_regex_);
std::sregex_iterator end_it; // default end iterator for comparison

// set matchFound to true if at least one match is found
if (it != end_it) {
matchFound = true;
}

if (!matchFound) {
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
ENVOY_STREAM_LOG(debug, "replaceAllValues: extractor regex did not match input", callbacks);
return "";
}
jbohanon marked this conversation as resolved.
Show resolved Hide resolved

// If a match was found, replace all instances of the regex in the input value with the replacement_text_ value
replaced = std::regex_replace(input, extract_regex_, replacement_text_);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similar performance concerns here since we have had issues with std::regex in the past

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some notes/questions:

  • Extractor regexes are compiled at config time, which is good (or really, the alternative is unacceptable)
  • The replaceAll path (i.e. the one we're discussing here) is not maximally efficient -- it does an efficient check to identify whether there are any matches, and then does the call you've highlighted here, which will replace all matches.
    • I considered extending the efficient check to identify matches to identify all matches, after which we could manually replace matches with the replacement_text_, but I chose not to as it seemed like it would be difficult to implement/read/maintain, and would potentially be prone to bugs and edge cases. But I could go that route if the performance of regex_replace is problematically bad. That being said I think I would prefer to get some performance metrics before doing so
  • Can you link any issues or PRs you can recall where there's discussion about std::regex performance issues?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you link any issues or PRs you can recall where there's discussion about std::regex performance issues?

Here's an issue in DLP. Similar situation - regexes are compiled at the time of configuration, but are still extremely slow. There are a few benchmarks linked in the issue that show how... hopeless std::regex can be.

Happy to leave this as is for now, but if this issue does bubble up, there's a bit of additional and (hopefully) helpful information on that thread


// Store the replaced string in replaced_value_ and return an absl::string_view to it
replaced_value_ = std::move(replaced);
return absl::string_view(replaced_value_);
}

// A TransformerInstance is constructed by the InjaTransformer constructor at config time
// on the main thread. It access thread-local storage which is populated during the
// InjaTransformer::transform method call, which happens on the request path on any
Expand Down
13 changes: 13 additions & 0 deletions source/extensions/filters/http/transformation/inja_transformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ namespace HttpFilters {
namespace Transformation {

using GetBodyFunc = std::function<const std::string &()>;
using ExtractionFunc = std::function<absl::string_view(Http::StreamFilterCallbacks &callbacks, absl::string_view value)>;

struct ThreadLocalTransformerContext : public ThreadLocal::ThreadLocalObject {
public:
Expand Down Expand Up @@ -86,11 +87,23 @@ class Extractor : Logger::Loggable<Logger::Id::filter> {
private:
absl::string_view extractValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;
absl::string_view replaceValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;
absl::string_view replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;
absl::string_view replaceAllValues(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;

const Http::LowerCaseString headername_;
const bool body_;
const unsigned int group_;
const std::regex extract_regex_;
const bool has_replacement_text_;
const std::string replacement_text_;
ben-taussig-solo marked this conversation as resolved.
Show resolved Hide resolved
const bool replace_all_;

ExtractionFunc extraction_func_;
mutable std::string replaced_value_;
ashishb-solo marked this conversation as resolved.
Show resolved Hide resolved
};

class InjaTransformer : public Transformer {
Expand Down
15 changes: 15 additions & 0 deletions test/extensions/filters/http/transformation/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,21 @@ envoy_gloo_cc_test(
],
)

envoy_gloo_cc_test(
name = "inja_transformer_replace_test",
srcs = ["inja_transformer_replace_test.cc"],
repository = "@envoy",
deps = [
"//source/extensions/filters/http/transformation:inja_transformer_lib",
"@envoy//source/common/common:random_generator_lib",
"@envoy//source/common/common:base64_lib",
"@envoy//test/test_common:environment_lib",
"@envoy//test/mocks/http:http_mocks",
"@envoy//test/mocks/server:server_mocks",
"@envoy//test/mocks/upstream:upstream_mocks",
],
)

envoy_cc_test_binary(
name = "inja_transformer_speed_test",
srcs = ["inja_transformer_speed_test.cc"],
Expand Down
Loading
Loading